Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2018-12-06 17:13:09 +0000

[diff] [blame]

45

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame^]

46

/* Boundary-aware load of one M0 x K0 block into the row variables named by `a`.
 *
 * The block dimensions handed to LOAD_TENSOR_M0XN0 are chosen at the call site:
 *  - PARTIAL_LOAD_M0 replaces M0 when (y * M0 + M0 >= SRC_HEIGHT) and PARTIAL_LOAD_M0 != 0
 *  - PARTIAL_LOAD_K0 replaces K0 when (x * K0 + K0 >= SRC_WIDTH) and PARTIAL_LOAD_K0 != 0
 *
 * NOTE: `x` and `y` are not macro parameters; they are taken from the scope in which
 *       the macro is expanded. SRC_WIDTH, SRC_HEIGHT, PARTIAL_LOAD_M0 and PARTIAL_LOAD_K0
 *       are expected to be defined at compile time.
 */
#define LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin)                                        \
    ({                                                                                                                              \
        if((y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0) && (x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0)))             \
        {                                                                                                                           \
            LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);                        \
        }                                                                                                                           \
        else if(y * M0 + M0 >= SRC_HEIGHT && PARTIAL_LOAD_M0 != 0)                                                                  \
        {                                                                                                                           \
            LOAD_TENSOR_M0XN0(PARTIAL_LOAD_M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);                                     \
        }                                                                                                                           \
        else if(x * K0 + K0 >= SRC_WIDTH && (PARTIAL_LOAD_K0 != 0))                                                                 \
        {                                                                                                                           \
            LOAD_TENSOR_M0XN0(M0, PARTIAL_LOAD_K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);                                     \
        }                                                                                                                           \
        else                                                                                                                        \
        {                                                                                                                           \
            LOAD_TENSOR_M0XN0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);                                                  \
        }                                                                                                                           \
    })

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

72

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (not transposed) in

73

* the output matrix unrolling the values.

74

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

75

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

76

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame^]

77

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

78

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

79

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame^]

80

* @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)

81

* @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

82

* @note Only the following values for M0, K0 and V0 are supported:

83

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

84

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

85

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

86

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

87

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

88

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

89

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

90

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

91

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

92

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

93

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

94

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

95

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

96

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

97

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

98

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

99

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

100

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

101

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

102

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

103

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

104

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

105

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

106

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

107

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

108

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

109

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

110

*/

111

__kernel void gemm_reshape_lhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                         ,
                                         uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                        )
{
    // Block size (number of elements in one M0xK0 block)
#define BLOCK_SIZE ((M0) * (K0))

    // OUTPUT_OFFSET_X: element offset applied per (y % V0) in the output address.
    // OUTPUT_STEP_X:   element stride handed to STORE_BLOCK.
    // Both replacement lists are fully parenthesized so that they expand as a single operand
    // regardless of the expression they are placed in.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (K0)
#define OUTPUT_STEP_X ((K0) * (V0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (K0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: x selects a group of K0 elements, y a group of M0 rows
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 consecutive values of y share one output row (y / V0)
    // and are distinguished inside that row by (y % V0) * OUTPUT_OFFSET_X
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % (uint)V0) *
                                 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------
    // Rows start at 0, so whatever a partial load does not overwrite stays 0
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);

    // Load values from the LHS matrix (uses x and y from this scope for the boundary checks)
    LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);

    // ---------------------------Store output values ------------------------------
    // Create 16 zero-initialized uint variables with the "zout" prefix for STORE_BLOCK
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(M0, K0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

185

186

/* TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)
 *
 * Gathers component `i` of each row vector a0 ... a(M0-1) and stores the M0 gathered values
 * contiguously at byte address
 *     output_ptr + i * output_step_x * sizeof(DATA_TYPE)
 *
 * `i` must be a single hexadecimal digit token (0-9, A-F): it is pasted into the vector component
 * selector (.s<i>) and into the integer literal 0x<i>.
 * The row vectors a0 ... a(M0-1) are NOT parameters; they are taken from the expanding scope.
 * For M0 = 5, 6, 7 the store is split into a 4-wide store followed by the remaining 1, 2 or 3
 * elements written 4 elements further on.
 * `output_ptr` and `output_step_x` are parenthesized inside the expansion so that expression
 * arguments (e.g. a product such as (M0) * (V0)) are evaluated as a unit.
 */
#if M0 == 2
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                      \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i);                                           \
        VSTORE(M0)                                                                                        \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
    })
#elif M0 == 3 // M0 == 3
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                      \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i);                                  \
        VSTORE(M0)                                                                                        \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
    })
#elif M0 == 4 // M0 == 4
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                      \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                         \
        VSTORE(M0)                                                                                        \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));     \
    })
#elif M0 == 5 // M0 == 5
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                          \
    ({                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                       \
        res0           = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);               \
        DATA_TYPE res1 = a4.s##i;                                                                         \
        VSTORE(4)                                                                                         \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));    \
        *((__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4) = res1; \
    })
#elif M0 == 6 // M0 == 6
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                            \
    ({                                                                                                      \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                         \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                           \
        VEC_DATA_TYPE(DATA_TYPE, 2)                                                                         \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 2))(a4.s##i, a5.s##i);                                             \
        VSTORE(4)                                                                                           \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));      \
        VSTORE(2)                                                                                           \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4);  \
    })
#elif M0 == 7 // M0 == 7
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                            \
    ({                                                                                                      \
        VEC_DATA_TYPE(DATA_TYPE, 4)                                                                         \
        res0 = (VEC_DATA_TYPE(DATA_TYPE, 4))(a0.s##i, a1.s##i, a2.s##i, a3.s##i);                           \
        VEC_DATA_TYPE(DATA_TYPE, 3)                                                                         \
        res1 = (VEC_DATA_TYPE(DATA_TYPE, 3))(a4.s##i, a5.s##i, a6.s##i);                                    \
        VSTORE(4)                                                                                           \
        (res0, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));      \
        VSTORE(3)                                                                                           \
        (res1, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)) + 4);  \
    })
#elif M0 == 8 // M0 == 8
#define TRANSPOSE_COLUMN_AND_STORE(output_ptr, output_step_x, i)                                                          \
    ({                                                                                                                    \
        VEC_DATA_TYPE(DATA_TYPE, M0)                                                                                      \
        res = (VEC_DATA_TYPE(DATA_TYPE, M0))(a0.s##i, a1.s##i, a2.s##i, a3.s##i, a4.s##i, a5.s##i, a6.s##i, a7.s##i);     \
        VSTORE(M0)                                                                                                        \
        (res, 0, (__global DATA_TYPE *)((output_ptr) + 0x##i * (output_step_x) * sizeof(DATA_TYPE)));                     \
    })
#else // M0 not supported
#error "M0 value not supported"
#endif // M0 conditions

255

256

/** This OpenCL kernel reshapes the lhs input matrix. The kernel splits the input matrix in blocks of size M0xK0 and stores each one (transposed) in

257

* the output matrix unrolling the values.

258

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

259

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

260

* @note The width of the input tensor must be passed at compile time using -DSRC_WIDTH (e.g. -DSRC_WIDTH=16)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame^]

261

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

262

* @note The block's dimensions (M0 and K0) must be passed at compile time using -DM0 and -DK0 (e.g. -DM0=2, -DK0=2).

263

* @note The number of M0xK0 vertical blocks to store on the same output row must be passed at compile time using -DV0 (e.g. -DV0=2)

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame^]

264

* @note The size of the partial load block in y must be passed at compile time using -DPARTIAL_LOAD_M0 (e.g. -DPARTIAL_LOAD_M0=1)

265

* @note The size of the partial load block in x must be passed at compile time using -DPARTIAL_LOAD_K0 (e.g. -DPARTIAL_LOAD_K0=1)

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

266

* @note Only the following values for M0, K0 and V0 are supported:

267

* M0: 2,3,4,5,6,7,8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

268

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

269

* V0: greater than 0

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

270

* @note In case the input has to be reinterpreted as a 3D tensor (e.g. input of convolution layer 1x1), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

271

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

272

* -# HEIGHT_GEMM3D: The height of the input in case it has to be reinterpreted as a 3D tensor.

273

* -# DEPTH_GEMM3D: The depth of the input in case it has to be reinterpreted as a 3D tensor

274

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

275

* @note If the M0xK0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

276

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

277

* @param[in] src_ptr Pointer to the source LHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-19 10:01:18 +0000

[diff] [blame]

278

* @param[in] src_stride_x Stride of the source LHS tensor in X dimension (in bytes)

279

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

280

* @param[in] src_stride_y Stride of the source LHS tensor in Y dimension (in bytes)

281

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

282

* @param[in] src_stride_z Stride of the source LHS tensor in Z dimension (in bytes)

283

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

284

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source LHS tensor

285

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

286

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

287

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

288

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

289

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

290

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

291

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

292

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

293

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

294

*/

295

__kernel void gemm_reshape_lhs_matrix_t(TENSOR3D_DECLARATION(src),
                                        TENSOR3D_DECLARATION(dst)
#if defined(REINTERPRET_INPUT_AS_3D)
                                        ,
                                        uint cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
                                       )
{
    // Block size (number of elements in one M0xK0 block)
#define BLOCK_SIZE ((M0) * (K0))

    // OUTPUT_OFFSET_X: element offset applied per (y % V0) in the output address.
    // OUTPUT_STEP_X:   element stride between two transposed columns written by TRANSPOSE_COLUMN_AND_STORE.
    // OUTPUT_STEP_X is passed as a macro argument below, so its replacement list is fully
    // parenthesized to make it expand as a single operand.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (M0)
#define OUTPUT_STEP_X ((M0) * (V0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (M0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: x selects a group of K0 elements, y a group of M0 rows
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)K0 * sizeof(DATA_TYPE) + y * (uint)M0 * src_stride_y;

    // Compute the output address: V0 consecutive values of y share one output row (y / V0)
    // and are distinguished inside that row by (y % V0) * OUTPUT_OFFSET_X
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)BLOCK_SIZE * (uint)V0 * sizeof(DATA_TYPE)) + ((y / (uint)V0) * (uint)dst_stride_y) + ((y % (uint)V0) *
                                 (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE));

    // Create variables: uint zin0=0, zin1=0, zin2=0...zin(M0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zin, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply src_stride_z by DEPTH_GEMM3D
    input_ptr += z * (uint)src_stride_z * DEPTH_GEMM3D;

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, cross_plane_pad, src_stride_y);

#else // defined(REINTERPRET_INPUT_AS_3D)

    input_ptr += z * (uint)src_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    output_ptr += z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------
    // Rows start at 0, so whatever a partial load does not overwrite stays 0
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, K0), a, 0);

    // Load values from the LHS matrix (uses x and y from this scope for the boundary checks)
    LOAD_TENSOR_BOUNDARY_AWARE_M0XK0(M0, K0, DATA_TYPE, a, input_ptr, src_stride_y, zin);

    // ---------------------------Transpose and store block -----------------------
    // One call per column of the K0-wide block; the last argument is the column index as a
    // hexadecimal digit (it is token-pasted inside the macro, so A..F must stay literal).
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 0);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 1);
#if K0 > 2
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 2);
#endif // K0 > 2
#if K0 > 3
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 3);
#endif // K0 > 3
#if K0 > 4
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 4);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 5);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 6);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 7);
#endif // K0 > 4
#if K0 > 8
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 8);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, 9);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, A);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, B);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, C);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, D);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, E);
    TRANSPOSE_COLUMN_AND_STORE(output_ptr, OUTPUT_STEP_X, F);
#endif // K0 > 8

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

Gian Marco Iodice

2020-08-10 21:44:14 +0100

[diff] [blame^]

391

#endif // defined(M0) && defined(K0) && defined(V0) && defined(DATA_TYPE) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(PARTIAL_LOAD_M0) && defined(PARTIAL_LOAD_K0)

Gian Marco Iodice

2018-12-06 17:13:09 +0000

[diff] [blame]

392

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

393

#if defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

394

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (not transposed) in

395

* the output matrix unrolling the values.

396

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

397

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

398

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

399

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

400

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

401

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

402

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

403

* N0: 2,3,4,8,16

404

* K0: 1,2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

405

* H0: greater than 0

406

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

407

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

408

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

409

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

410

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

411

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

412

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

413

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

414

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

415

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

416

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

417

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

418

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

419

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

420

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

421

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

422

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

423

*/

424

__kernel void gemm_reshape_rhs_matrix_nt(TENSOR3D_DECLARATION(src),
                                         TENSOR3D_DECLARATION(dst))
{
    // Block size (number of elements in one K0xN0 block)
#define BLOCK_SIZE ((K0) * (N0))

    // OUTPUT_OFFSET_X: element offset applied per (x % H0) in the output address.
    // OUTPUT_STEP_X:   element stride handed to STORE_BLOCK.
    // Both replacement lists are fully parenthesized so that they expand as a single operand
    // regardless of the expression they are placed in.
#if defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (N0)
#define OUTPUT_STEP_X ((N0) * (H0))
#else // defined(INTERLEAVE)
#define OUTPUT_OFFSET_X (BLOCK_SIZE)
#define OUTPUT_STEP_X (N0)
#endif // defined(INTERLEAVE)

    // Work-item coordinates
    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // ------------------ Compute input/output addresses ---------------------------

    // Compute the input address: x selects a group of N0 elements, y a group of K0 rows, z the batch
    __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

    // Compute the output address: H0 consecutive values of x share one output row (x / H0)
    // and are distinguished inside that row by (x % H0) * OUTPUT_OFFSET_X
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % (uint)H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((
                                     x / (uint)H0)
                                 * (uint)dst_stride_y)
                                 + z * (uint)dst_stride_z;

    // ---------------------------Load input values --------------------------------

    // Create variables: VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;
    REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0);

    // Load values from the RHS matrix.
    // Row 0 is loaded without a height check; every further row r is loaded only if
    // (y * K0 + r) < SRC_HEIGHT and otherwise keeps the 0 it was initialized with.
    a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
#if K0 > 1
    if(y * (uint)K0 + 1 < SRC_HEIGHT)
    {
        a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
    }
#endif // K0 > 1
#if K0 > 2
    if(y * (uint)K0 + 2 < SRC_HEIGHT)
    {
        a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));
    }
#endif // K0 > 2
#if K0 > 3
    if(y * (uint)K0 + 3 < SRC_HEIGHT)
    {
        a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));
    }
#endif // K0 > 3
#if K0 > 4
    if(y * (uint)K0 + 4 < SRC_HEIGHT)
    {
        a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));
    }
    if(y * (uint)K0 + 5 < SRC_HEIGHT)
    {
        a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));
    }
    if(y * (uint)K0 + 6 < SRC_HEIGHT)
    {
        a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));
    }
    if(y * (uint)K0 + 7 < SRC_HEIGHT)
    {
        a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));
    }
#endif // K0 > 4
#if K0 > 8
    if(y * (uint)K0 + 8 < SRC_HEIGHT)
    {
        a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));
    }
    if(y * (uint)K0 + 9 < SRC_HEIGHT)
    {
        a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));
    }
    if(y * (uint)K0 + 10 < SRC_HEIGHT)
    {
        aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));
    }
    if(y * (uint)K0 + 11 < SRC_HEIGHT)
    {
        aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));
    }
    if(y * (uint)K0 + 12 < SRC_HEIGHT)
    {
        aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));
    }
    if(y * (uint)K0 + 13 < SRC_HEIGHT)
    {
        aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));
    }
    if(y * (uint)K0 + 14 < SRC_HEIGHT)
    {
        aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));
    }
    if(y * (uint)K0 + 15 < SRC_HEIGHT)
    {
        aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));
    }
#endif // K0 > 8

    // ---------------------------Store output values ------------------------------
    // Create 16 zero-initialized uint variables with the "zout" prefix for STORE_BLOCK
    REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);
    STORE_BLOCK(K0, N0, DATA_TYPE, a, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

#undef BLOCK_SIZE
#undef OUTPUT_OFFSET_X
#undef OUTPUT_STEP_X
}

#if defined(TRANSPOSE)

547

/** This OpenCL kernel reshapes the rhs input matrix. The kernel splits the input matrix in blocks of size K0xN0 and stores each one (transposed) in

548

* the output matrix unrolling the values.

549

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

550

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

551

* @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT (e.g. -DSRC_HEIGHT=16)

552

* @note The block's dimensions (K0 and N0) must be passed at compile time using -DK0 and -DN0 (e.g. -DK0=2, -DN0=2).

553

* @note The number of K0xN0 vertical blocks to store on the same output row must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

554

* @note If the K0xN0 blocks have to be interleaved, the option -DINTERLEAVE must be passed at compile time.

555

* @note The option -DTRANSPOSE must be passed at compile time.

556

* @note Only the following values for K0, N0 and H0 are supported:

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

557

* N0: 2,3,4,8,16

558

* K0: 2,3,4,8,16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

559

* H0: greater than 0

560

*

Michele Di Giorgio

2020-07-06 11:27:21 +0100

[diff] [blame]

561

* @param[in] src_ptr Pointer to the source RHS tensor. Supported data types: All

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

562

* @param[in] src_stride_x Stride of the source RHS tensor in X dimension (in bytes)

563

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

564

* @param[in] src_stride_y Stride of the source RHS tensor in Y dimension (in bytes)

565

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

566

* @param[in] src_stride_z Stride of the source RHS tensor in Z dimension (in bytes)

567

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

568

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source RHS tensor

569

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

570

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

571

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

572

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

573

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

574

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

575

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

576

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

577

*/

578

__kernel void gemm_reshape_rhs_matrix_t(TENSOR3D_DECLARATION(src),

579

TENSOR3D_DECLARATION(dst))

580

{

581

// Block size

582

#define BLOCK_SIZE ((K0) * (N0))

583

584

// Output offset X

585

#if defined(INTERLEAVE)

586

#define OUTPUT_OFFSET_X (K0)

587

#else // defined(INTERLEAVE)

588

#define OUTPUT_OFFSET_X (BLOCK_SIZE)

589

#endif // defined(INTERLEAVE)

590

591

// Output step X

592

#if defined(INTERLEAVE)

593

#define OUTPUT_STEP_X (K0) * (H0)

594

#else // Do not interleave

595

#define OUTPUT_STEP_X (K0)

596

#endif // defined(INTERLEAVE)

597

598

// Compute source and destination addresses

599

uint x = get_global_id(0);

600

uint y = get_global_id(1);

601

uint z = get_global_id(2);

602

603

// ------------------ Compute input/output addresses ---------------------------

604

605

// Compute the input address

606

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * (uint)N0 * sizeof(DATA_TYPE) + y * (uint)K0 * src_stride_y + z * (uint)src_stride_z;

607

608

// Compute the output address

609

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + (y * (uint)BLOCK_SIZE * (uint)H0 * sizeof(DATA_TYPE)) + ((x % H0) * (uint)OUTPUT_OFFSET_X * sizeof(DATA_TYPE)) + ((x /

610

(uint)H0) * (uint)dst_stride_y) + z * (uint)dst_stride_z;

611

612

// ---------------------------Load input values --------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

613

REPEAT_VAR_INIT_TO_CONST(K0, VEC_DATA_TYPE(DATA_TYPE, N0), a, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) a0=0, a1=0, ... a(K0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

614

615

// Load values from the RHS matrix

616

a0 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));

617

if(y * (uint)K0 + 1 < SRC_HEIGHT)

618

{

619

a1 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));

620

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

621

#if K0 > 2

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

622

if(y * (uint)K0 + 2 < SRC_HEIGHT)

623

{

624

a2 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 2 * src_stride_y));

625

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

626

#endif // K0 > 2

627

#if K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

628

if(y * (uint)K0 + 3 < SRC_HEIGHT)

629

{

630

a3 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 3 * src_stride_y));

631

}

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

632

#endif // K0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

633

#if K0 > 4

634

if(y * (uint)K0 + 4 < SRC_HEIGHT)

635

{

636

a4 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 4 * src_stride_y));

637

}

638

if(y * (uint)K0 + 5 < SRC_HEIGHT)

639

{

640

a5 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 5 * src_stride_y));

641

}

642

if(y * (uint)K0 + 6 < SRC_HEIGHT)

643

{

644

a6 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 6 * src_stride_y));

645

}

646

if(y * (uint)K0 + 7 < SRC_HEIGHT)

647

{

648

a7 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 7 * src_stride_y));

649

}

650

#endif // K0 > 4

651

#if K0 > 8

Gian Marco Iodice

8912434

2018-12-19 14:17:22 +0000

[diff] [blame]

652

if(y * (uint)K0 + 8 < SRC_HEIGHT)

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

653

{

654

a8 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 8 * src_stride_y));

655

}

656

if(y * (uint)K0 + 9 < SRC_HEIGHT)

657

{

658

a9 = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 9 * src_stride_y));

659

}

660

if(y * (uint)K0 + 10 < SRC_HEIGHT)

661

{

662

aA = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 10 * src_stride_y));

663

}

664

if(y * (uint)K0 + 11 < SRC_HEIGHT)

665

{

666

aB = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 11 * src_stride_y));

667

}

668

if(y * (uint)K0 + 12 < SRC_HEIGHT)

669

{

670

aC = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 12 * src_stride_y));

671

}

672

if(y * (uint)K0 + 13 < SRC_HEIGHT)

673

{

674

aD = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 13 * src_stride_y));

675

}

676

if(y * (uint)K0 + 14 < SRC_HEIGHT)

677

{

678

aE = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 14 * src_stride_y));

679

}

680

if(y * (uint)K0 + 15 < SRC_HEIGHT)

681

{

682

aF = VLOAD(N0)(0, (__global DATA_TYPE *)(input_ptr + 15 * src_stride_y));

}

#endif // K0 > 8

// ---------------------------Transpose the block ------------------------------

Vidhya Sudhan Loganathan

17b0f8b

2019-01-08 12:17:03 +0000

[diff] [blame]

687

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), res, 0); //VEC_DATA_TYPE(DATA_TYPE, K0) res0=0, res1=0, res2=0,... res(N0-1)=0;

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

688

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

689

#if K0 == 2

690

// This part computes the following transpositions:

// 2x2 -> 2x2

// 2x4 -> 4x2

// 2x8 -> 8x2

// 2x16 -> 16x2

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0);

696

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1);

697

#if N0 > 2

698

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2);

699

#endif // N0 > 2

700

#if N0 > 3

701

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3);

702

#endif // N0 > 3

703

#if N0 > 4

704

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4);

705

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5);

706

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6);

707

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7);

708

#endif // N0 > 4

709

#if N0 > 8

710

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8);

711

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9);

712

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA);

713

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB);

714

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC);

715

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD);

716

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE);

717

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF);

718

#endif // N0 > 8

719

720

#elif K0 == 3 // K0 == 2

721

// This part computes the following transpositions:

// 3x2 -> 2x3

// 3x4 -> 4x3

// 3x8 -> 8x3

// 3x16 -> 16x3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

726

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0);

727

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

728

#if N0 > 2

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

729

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

730

#endif // N0 > 2

731

#if N0 > 3

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

732

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

733

#endif // N0 > 3

734

#if N0 > 4

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

735

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4);

736

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5);

737

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6);

738

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

739

#endif // N0 > 4

740

#if N0 > 8

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

741

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8);

742

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9);

743

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA);

744

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB);

745

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC);

746

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD);

747

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE);

748

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

749

#endif // N0 > 8

750

751

#elif K0 == 4 // K0 == 4

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

752

// This part computes the following transpositions:

// 4x2 -> 2x4

// 4x4 -> 4x4

// 4x8 -> 8x4

// 4x16 -> 16x4

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0);

758

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1);

759

#if N0 > 2

760

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

761

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

762

#if N0 > 3

763

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3);

764

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

765

#if N0 > 4

766

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4);

767

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5);

768

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6);

769

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7);

770

#endif // N0 > 4

771

#if N0 > 8

772

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8);

773

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9);

774

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA);

775

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB);

776

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC);

777

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD);

778

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE);

779

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF);

780

#endif // N0 > 8

781

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

782

#elif K0 == 8 // K0 == 8

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

783

// This part computes the following transpositions:

// 8x2 -> 2x8

// 8x4 -> 4x8

// 8x8 -> 8x8

// 8x16 -> 16x8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

788

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0);

789

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

790

#if N0 > 2

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

791

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

792

#endif // N0 > 2

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

793

#if N0 > 3

794

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3);

795

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

796

#if N0 > 4

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

797

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4);

798

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5);

799

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6);

800

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

801

#endif // N0 > 4

802

#if N0 > 8

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

803

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8);

804

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9);

805

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA);

806

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB);

807

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC);

808

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD);

809

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE);

810

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

811

#endif // N0 > 8

812

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

813

#elif K0 == 16 // K0 == 16

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

814

815

// This part computes the following transpositions:

// 16x2 -> 2x16

// 16x4 -> 4x16

// 16x8 -> 8x16

// 16x16 -> 16x16

res0 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s0, a1.s0, a2.s0, a3.s0, a4.s0, a5.s0, a6.s0, a7.s0,

821

a8.s0, a9.s0, aA.s0, aB.s0, aC.s0, aD.s0, aE.s0, aF.s0);

822

res1 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s1, a1.s1, a2.s1, a3.s1, a4.s1, a5.s1, a6.s1, a7.s1,

823

a8.s1, a9.s1, aA.s1, aB.s1, aC.s1, aD.s1, aE.s1, aF.s1);

824

#if N0 > 2

825

res2 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s2, a1.s2, a2.s2, a3.s2, a4.s2, a5.s2, a6.s2, a7.s2,

826

a8.s2, a9.s2, aA.s2, aB.s2, aC.s2, aD.s2, aE.s2, aF.s2);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

827

#endif // N0 > 2

828

#if N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

829

res3 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s3, a1.s3, a2.s3, a3.s3, a4.s3, a5.s3, a6.s3, a7.s3,

830

a8.s3, a9.s3, aA.s3, aB.s3, aC.s3, aD.s3, aE.s3, aF.s3);

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

831

#endif // N0 > 3

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

832

#if N0 > 4

833

res4 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s4, a1.s4, a2.s4, a3.s4, a4.s4, a5.s4, a6.s4, a7.s4,

834

a8.s4, a9.s4, aA.s4, aB.s4, aC.s4, aD.s4, aE.s4, aF.s4);

835

res5 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s5, a1.s5, a2.s5, a3.s5, a4.s5, a5.s5, a6.s5, a7.s5,

836

a8.s5, a9.s5, aA.s5, aB.s5, aC.s5, aD.s5, aE.s5, aF.s5);

837

res6 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s6, a1.s6, a2.s6, a3.s6, a4.s6, a5.s6, a6.s6, a7.s6,

838

a8.s6, a9.s6, aA.s6, aB.s6, aC.s6, aD.s6, aE.s6, aF.s6);

839

res7 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s7, a1.s7, a2.s7, a3.s7, a4.s7, a5.s7, a6.s7, a7.s7,

840

a8.s7, a9.s7, aA.s7, aB.s7, aC.s7, aD.s7, aE.s7, aF.s7);

841

#endif // N0 > 4

842

#if N0 > 8

843

res8 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s8, a1.s8, a2.s8, a3.s8, a4.s8, a5.s8, a6.s8, a7.s8,

844

a8.s8, a9.s8, aA.s8, aB.s8, aC.s8, aD.s8, aE.s8, aF.s8);

845

res9 = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.s9, a1.s9, a2.s9, a3.s9, a4.s9, a5.s9, a6.s9, a7.s9,

846

a8.s9, a9.s9, aA.s9, aB.s9, aC.s9, aD.s9, aE.s9, aF.s9);

847

resA = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sA, a1.sA, a2.sA, a3.sA, a4.sA, a5.sA, a6.sA, a7.sA,

848

a8.sA, a9.sA, aA.sA, aB.sA, aC.sA, aD.sA, aE.sA, aF.sA);

849

resB = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sB, a1.sB, a2.sB, a3.sB, a4.sB, a5.sB, a6.sB, a7.sB,

850

a8.sB, a9.sB, aA.sB, aB.sB, aC.sB, aD.sB, aE.sB, aF.sB);

851

resC = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sC, a1.sC, a2.sC, a3.sC, a4.sC, a5.sC, a6.sC, a7.sC,

852

a8.sC, a9.sC, aA.sC, aB.sC, aC.sC, aD.sC, aE.sC, aF.sC);

853

resD = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sD, a1.sD, a2.sD, a3.sD, a4.sD, a5.sD, a6.sD, a7.sD,

854

a8.sD, a9.sD, aA.sD, aB.sD, aC.sD, aD.sD, aE.sD, aF.sD);

855

resE = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sE, a1.sE, a2.sE, a3.sE, a4.sE, a5.sE, a6.sE, a7.sE,

856

a8.sE, a9.sE, aA.sE, aB.sE, aC.sE, aD.sE, aE.sE, aF.sE);

857

resF = (VEC_DATA_TYPE(DATA_TYPE, K0))(a0.sF, a1.sF, a2.sF, a3.sF, a4.sF, a5.sF, a6.sF, a7.sF,

858

a8.sF, a9.sF, aA.sF, aB.sF, aC.sF, aD.sF, aE.sF, aF.sF);

#endif // N0 > 8

#else // N0 == 16

#error "Not supported N0 value"

863

#endif // N0 > 2

864

865

// ---------------------------Store the output values ------------------------------

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

866

REPEAT_VAR_INIT_TO_CONST(16, uint, zout, 0);

867

STORE_BLOCK(N0, K0, DATA_TYPE, res, output_ptr, OUTPUT_STEP_X * sizeof(DATA_TYPE), zout);

Gian Marco Iodice

2018-12-07 11:18:09 +0000

[diff] [blame]

868

869

#undef BLOCK_SIZE

870

#undef OUTPUT_OFFSET_X

871

#undef OUTPUT_STEP_X

872

}

873

#endif // defined(TRANSPOSE)

874

#endif // defined(K0) && defined(N0) && defined(H0) && defined(DATA_TYPE) && defined(SRC_HEIGHT)

875

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

876

#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

877

878

// Pastes the two tokens a and b together. Callers that need a macro argument (e.g. K0)
// expanded before pasting must reach CONCAT through another macro's parameter, which is
// how ARM_DOT_K0XN0 uses it.
#define CONCAT(a, b) a##b

// ARM_DOT<k>(a, b, c): accumulates the k-element dot product of a and b into c using fma.
//  - a, b: operands with k components (plain scalars for ARM_DOT1)
//  - c:    scalar accumulator; it is assigned to, so it must be an lvalue
// Every use of a and b is parenthesized as (a)/(b) so that an expression argument cannot
// bind incorrectly to the component selector or to the surrounding call.
#define ARM_DOT1(a, b, c)     \
    ({                        \
        c = fma((a), (b), c); \
    })
#define ARM_DOT2(a, b, c)           \
    ({                              \
        c = fma((a).s0, (b).s0, c); \
        c = fma((a).s1, (b).s1, c); \
    })
// ARM_DOT3/ARM_DOT4 extend the previous size by one component
#define ARM_DOT3(a, b, c)           \
    ({                              \
        ARM_DOT2(a, b, c);          \
        c = fma((a).s2, (b).s2, c); \
    })
#define ARM_DOT4(a, b, c)           \
    ({                              \
        ARM_DOT3(a, b, c);          \
        c = fma((a).s3, (b).s3, c); \
    })
// ARM_DOT8/ARM_DOT16 process the low half first and then the high half
#define ARM_DOT8(a, b, c)            \
    ({                               \
        ARM_DOT4((a).lo, (b).lo, c); \
        ARM_DOT4((a).hi, (b).hi, c); \
    })
#define ARM_DOT16(a, b, c)           \
    ({                               \
        ARM_DOT8((a).lo, (b).lo, c); \
        ARM_DOT8((a).hi, (b).hi, c); \
    })

// ARM_DOT_K0XN0(k0, a, b, c): for every i in [0, N0) accumulates the k0-element dot product
// of a and b<i> into component i of c.
//  - k0: dot-product length; selects ARM_DOT<k0> through CONCAT
//  - a:  left operand, shared by all N0 dot products
//  - b:  base name of the right operands; the macro pastes the index onto it (b0, b1, ..., bF)
//  - c:  accumulator vector with (at least) N0 components
// One variant is selected at preprocessing time from the value of N0.
#if N0 == 2
#define ARM_DOT_K0XN0(k0, a, b, c)                \
    ({                                            \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
    })
#elif N0 == 3
#define ARM_DOT_K0XN0(k0, a, b, c)                \
    ({                                            \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
    })
#elif N0 == 4
#define ARM_DOT_K0XN0(k0, a, b, c)                \
    ({                                            \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
    })
#elif N0 == 8
#define ARM_DOT_K0XN0(k0, a, b, c)                \
    ({                                            \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4)); \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5)); \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6)); \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7)); \
    })
#elif N0 == 16
#define ARM_DOT_K0XN0(k0, a, b, c)                \
    ({                                            \
        CONCAT(ARM_DOT, k0)((a), (b##0), (c.s0)); \
        CONCAT(ARM_DOT, k0)((a), (b##1), (c.s1)); \
        CONCAT(ARM_DOT, k0)((a), (b##2), (c.s2)); \
        CONCAT(ARM_DOT, k0)((a), (b##3), (c.s3)); \
        CONCAT(ARM_DOT, k0)((a), (b##4), (c.s4)); \
        CONCAT(ARM_DOT, k0)((a), (b##5), (c.s5)); \
        CONCAT(ARM_DOT, k0)((a), (b##6), (c.s6)); \
        CONCAT(ARM_DOT, k0)((a), (b##7), (c.s7)); \
        CONCAT(ARM_DOT, k0)((a), (b##8), (c.s8)); \
        CONCAT(ARM_DOT, k0)((a), (b##9), (c.s9)); \
        CONCAT(ARM_DOT, k0)((a), (b##A), (c.sA)); \
        CONCAT(ARM_DOT, k0)((a), (b##B), (c.sB)); \
        CONCAT(ARM_DOT, k0)((a), (b##C), (c.sC)); \
        CONCAT(ARM_DOT, k0)((a), (b##D), (c.sD)); \
        CONCAT(ARM_DOT, k0)((a), (b##E), (c.sE)); \
        CONCAT(ARM_DOT, k0)((a), (b##F), (c.sF)); \
    })
#else // N0 is none of 2, 3, 4, 8, 16
#error "N0 value not supported"
#endif // N0 dispatch

999

1000

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1001

* The LHS matrix is NOT reshaped

1002

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

1003

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1004

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1005

* @note The GEMM's dimensions (M, N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

1006

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

1007

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1008

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1009

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1010

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1011

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1012

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1013

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1014

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1015

* - N0 = 2, 3, 4, 8, 16

1016

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1017

* - H0 >= 1

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1018

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1019

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1020

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1021

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1022

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1023

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1024

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1025

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1026

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1027

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1028

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1029

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1030

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1031

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1032

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1033

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1034

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1035

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1036

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1037

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1038

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1039

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1040

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1041

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1042

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1043

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1044

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1045

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1046

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

1047

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1048

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1049

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1050

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1051

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1052

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1053

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1054

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1055

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1056

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1057

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1058

*/

1059

__kernel void gemm_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),

1060

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1061

#if defined(BETA)

1062

IMAGE_DECLARATION(bias),

1063

#endif // defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1064

IMAGE_DECLARATION(dst),

1065

uint lhs_stride_z,

1066

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1067

#if defined(BETA)

1068

uint bias_stride_z,

1069

#endif //defined(BETA)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1070

uint dst_stride_z

1071

#if defined(REINTERPRET_INPUT_AS_3D)

1072

,

1073

uint lhs_cross_plane_pad

1074

#endif // REINTERPRET_INPUT_AS_3D

1075

#if defined(REINTERPRET_OUTPUT_AS_3D)

1076

,

1077

uint dst_cross_plane_pad

1078

#endif // REINTERPRET_OUTPUT_AS_3D

1079

)

1080

{

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1081

// Block size

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1082

#define RHS_BLOCK_SIZE ((K0) * (N0))

1083

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1084

// RHS offset and step X

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1085

#if defined(RHS_INTERLEAVE)

1086

#define RHS_OFFSET_X (K0)

1087

#define RHS_STEP_X ((K0) * (H0))

1088

#define RHS_STEP_LOOP (1)

1089

#else // defined(RHS_INTERLEAVE)

1090

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1091

#define RHS_STEP_X (K0)

1092

#define RHS_STEP_LOOP (H0)

1093

#endif // defined(RHS_INTERLEAVE)

1094

1095

uint x = get_global_id(0);

1096

uint y = get_global_id(1);

1097

uint z = get_global_id(2);

1098

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1099

#if defined(DUMMY_WORK_ITEMS)

1100

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1105

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1106

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1107

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1108

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1109

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1110

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1111

1112

#if defined(MATRIX_B_DEPTH)

1113

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1114

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1115

#else // defined(MATRIX_B_DEPTH)

1116

rhs_offset += z * rhs_stride_z;

1117

#endif // defined(MATRIX_B_DEPTH)

1118

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1119

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1120

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1121

1122

#if defined(REINTERPRET_INPUT_AS_3D)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1123

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1124

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1125

1126

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1127

// multiply lhs_stride_z by DEPTH_GEMM3D

1128

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1129

1130

#else // defined(REINTERPRET_INPUT_AS_3D)

1131

1132

// Add offset for batched GEMM

1133

lhs_offset += z * lhs_stride_z;

1134

1135

#endif // defined(REINTERPRET_INPUT_AS_3D)

1136

1137

// Initialize the accumulators

1138

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1139

1140

int i = 0;

1141

for(; i <= (K - K0); i += K0)

1142

{

1143

// Supported cases (M0, K0):

1144

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1145

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1146

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1147

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1148

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1149

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1150

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1151

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1152

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1153

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1154

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1155

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1156

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1157

1158

// Accumulate

1159

ARM_DOT_K0XN0(K0, a0, b, c0);

1160

#if M0 > 1

1161

ARM_DOT_K0XN0(K0, a1, b, c1);

1162

#endif // M0 > 1

1163

#if M0 > 2

1164

ARM_DOT_K0XN0(K0, a2, b, c2);

1165

#endif // M0 > 2

1166

#if M0 > 3

1167

ARM_DOT_K0XN0(K0, a3, b, c3);

1168

#endif // M0 > 3

1169

#if M0 > 4

1170

ARM_DOT_K0XN0(K0, a4, b, c4);

1171

#endif // M0 > 4

1172

#if M0 > 5

1173

ARM_DOT_K0XN0(K0, a5, b, c5);

1174

#endif // M0 > 5

1175

#if M0 > 6

1176

ARM_DOT_K0XN0(K0, a6, b, c6);

1177

#endif // M0 > 6

1178

#if M0 > 7

1179

ARM_DOT_K0XN0(K0, a7, b, c7);

1180

#endif // M0 > 7

1181

1182

lhs_offset += K0 * sizeof(DATA_TYPE);

1183

rhs_offset += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

1184

}

1185

1186

// Left-over accumulations

1187

for(; i < K; ++i)

1188

{

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1189

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1190

LOAD_BLOCK(M0, 1, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1191

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1192

// Load values from RHS reshaped matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1193

LOAD_BLOCK(N0, 1, DATA_TYPE, b, rhs_ptr, rhs_offset, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1194

1195

// Accumulate

1196

ARM_DOT_K0XN0(1, a0, b, c0);

1197

#if M0 > 1

1198

ARM_DOT_K0XN0(1, a1, b, c1);

1199

#endif // M0 > 1

1200

#if M0 > 2

1201

ARM_DOT_K0XN0(1, a2, b, c2);

1202

#endif // M0 > 2

1203

#if M0 > 3

1204

ARM_DOT_K0XN0(1, a3, b, c3);

1205

#endif // M0 > 3

1206

#if M0 > 4

1207

ARM_DOT_K0XN0(1, a4, b, c4);

1208

#endif // M0 > 4

1209

#if M0 > 5

1210

ARM_DOT_K0XN0(1, a5, b, c5);

1211

#endif // M0 > 5

1212

#if M0 > 6

1213

ARM_DOT_K0XN0(1, a6, b, c6);

1214

#endif // M0 > 6

1215

#if M0 > 7

1216

ARM_DOT_K0XN0(1, a7, b, c7);

1217

#endif // M0 > 7

1218

1219

lhs_offset += sizeof(DATA_TYPE);

1220

rhs_offset += sizeof(DATA_TYPE);

1221

}

1222

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1223

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1224

1225

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1226

1227

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1228

1229

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1230

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1231

1232

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1233

// multiply dst_stride_z by DEPTH_GEMM3D

1234

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1235

1236

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1237

1238

// Add offset for batched GEMM

1239

dst_addr += z * dst_stride_z;

1240

1241

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1242

1243

// Multiply by the weight of matrix-matrix product and store the result

1244

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1245

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1246

#endif // defined(ALPHA)

1247

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1248

// Add beta*bias

1249

#if defined(BETA)

1250

#if defined(BROADCAST_BIAS)

1251

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1252

1253

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1254

1255

#ifndef UNIT_BETA

1256

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1257

#endif // UNIT_BIAS

1258

1259

// c = c + bias[broadcasted]

1260

ADD_BLOCK_BROADCAST(M0, c, bias0);

1261

1262

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1263

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1264

1265

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1266

1267

#ifndef UNIT_BETA

1268

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(M0, c, bias);

1273

1274

#endif // defined(BROADCAST_BIAS)

1275

#endif // defined(BETA)

1276

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1277

#if defined(ACTIVATION_TYPE)

1278

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

1279

#endif // defined(ACTIVATION_TYPE)

1280

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1281

// Store output block

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1282

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

1283

1284

#undef RHS_BLOCK_SIZE

1285

#undef RHS_OFFSET_X

1286

#undef RHS_STEP_X

1287

}

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1288

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1289

#if defined(OPENCL_IMAGE_SUPPORT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                  __read_only image2d_t rhs_img,
#if defined(BETA)
                                                  IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                  IMAGE_DECLARATION(dst),
                                                  uint lhs_stride_z,
                                                  uint rhs_stride_z,
#if defined(BETA)
                                                  uint bias_stride_z,
#endif //defined(BETA)
                                                  uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                  ,
                                                  uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                  ,
                                                  uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                 )
{
    // Kernel-local helper macros. All of them are #undef'd at the end of the kernel body.

    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // Number of K elements that do not fill a complete K0 block
#define LEFTOVER_K (K % K0)

    // Block size
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items that fall completely outside the MxN output have nothing to compute
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    // Main loop: consume K in complete blocks of K0 elements
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate
        ARM_DOT_K0XN0(K0, a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(K0, a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(K0, a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(K0, a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(K0, a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(K0, a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(K0, a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(K0, a7, b, c7);
#endif // M0 > 7

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

#if LEFTOVER_K != 0
    // Note: We cannot read out-of-bound elements from the RHS matrix because
    // the RHS width is always multiple of K0. This is not true for the LHS matrix,
    // so the remaining LEFTOVER_K LHS values are copied one scalar at a time into
    // zero-initialized K0-wide vectors before a last full-width accumulation.

    union UNION_VEC_TYPE
    {
        DATA_TYPE s[K0];
        VEC_DATA_TYPE(DATA_TYPE, K0)
        v;
    };

    union UNION_VEC_TYPE a0 = {.v = 0 };
#if M0 > 1
    union UNION_VEC_TYPE a1 = {.v = 0 };
#endif // M0 > 1
#if M0 > 2
    union UNION_VEC_TYPE a2 = {.v = 0 };
#endif // M0 > 2
#if M0 > 3
    union UNION_VEC_TYPE a3 = {.v = 0 };
#endif // M0 > 3
#if M0 > 4
    union UNION_VEC_TYPE a4 = {.v = 0 };
#endif // M0 > 4
#if M0 > 5
    union UNION_VEC_TYPE a5 = {.v = 0 };
#endif // M0 > 5
#if M0 > 6
    union UNION_VEC_TYPE a6 = {.v = 0 };
#endif // M0 > 6
#if M0 > 7
    union UNION_VEC_TYPE a7 = {.v = 0 };
#endif // M0 > 7

    REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

    // Load from RHS matrix
    LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

    // Load from LHS matrix
    for(int k = 0; k < LEFTOVER_K; ++k)
    {
        a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);
#if M0 > 1
        a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);
#endif // M0 > 1
#if M0 > 2
        a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);
#endif // M0 > 2
#if M0 > 3
        a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);
#endif // M0 > 3
#if M0 > 4
        a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);
#endif // M0 > 4
#if M0 > 5
        a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);
#endif // M0 > 5
#if M0 > 6
        a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);
#endif // M0 > 6
#if M0 > 7
        a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);
#endif // M0 > 7

        lhs_offset += sizeof(DATA_TYPE);
    }

    // Accumulate
    ARM_DOT_K0XN0(K0, a0.v, b, c0);
#if M0 > 1
    ARM_DOT_K0XN0(K0, a1.v, b, c1);
#endif // M0 > 1
#if M0 > 2
    ARM_DOT_K0XN0(K0, a2.v, b, c2);
#endif // M0 > 2
#if M0 > 3
    ARM_DOT_K0XN0(K0, a3.v, b, c3);
#endif // M0 > 3
#if M0 > 4
    ARM_DOT_K0XN0(K0, a4.v, b, c4);
#endif // M0 > 4
#if M0 > 5
    ARM_DOT_K0XN0(K0, a5.v, b, c5);
#endif // M0 > 5
#if M0 > 6
    ARM_DOT_K0XN0(K0, a6.v, b, c6);
#endif // M0 > 6
#if M0 > 7
    ARM_DOT_K0XN0(K0, a7.v, b, c7);
#endif // M0 > 7

#endif // LEFTOVER_K != 0

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout(M0-1)=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

    // Drop every kernel-local helper macro so none of them leaks past this kernel
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef LEFTOVER_K
#undef PIXEL_UNIT
}
#endif // defined(OPENCL_IMAGE_SUPPORT)

1634

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1635

/** Fused multiply-add helper: overwrites @p c with fma(a, b, c).
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator. It is assigned the result, so it must be a modifiable lvalue
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/** Row-wise fused multiply-add over the M0 rows handled by a work-item.
 *
 * For every row r in [0, M0) the macro expands to
 *   VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a<r>).s<i>), b, (c<r>));
 * i.e. component i of the LHS row vector a<r> is cast to an N0-wide vector, multiplied by @p b
 * and accumulated into c<r>.
 *
 * @param[in]      i Component suffix pasted after ".s" to select one element of each a<r>
 * @param[in]      a Basename of the LHS row vectors (a0, a1, ...)
 * @param[in]      b RHS vector used for every row
 * @param[in, out] c Basename of the accumulators (c0, c1, ...)
 */
#if M0 == 1
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define VFMA_M0xN0(i, a, b, c)                                        \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

1711

1712

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1713

* The LHS matrix is NOT reshaped

1714

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

1715

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1716

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1717

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

1718

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

1719

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

1720

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1721

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

1722

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1723

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1724

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1725

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1726

* - N0 = 2, 3, 4, 8, 16

1727

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1728

* - H0 >= 1

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1729

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1730

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1731

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1732

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1733

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1734

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1735

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1736

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1737

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1738

*

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1739

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

1740

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1741

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1742

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1743

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1744

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1745

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1746

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1747

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

1748

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1749

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

1750

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1751

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1752

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1753

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1754

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1755

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1756

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1757

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1758

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1759

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1760

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1761

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1762

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1763

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1764

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1765

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1766

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1767

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1768

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1769

*/

1770

__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),

1771

IMAGE_DECLARATION(rhs),

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1772

#if defined(BETA)

1773

IMAGE_DECLARATION(bias),

1774

#endif // defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1775

IMAGE_DECLARATION(dst),

1776

uint lhs_stride_z,

1777

uint rhs_stride_z,

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1778

#if defined(BETA)

1779

uint bias_stride_z,

1780

#endif //defined(BETA)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1781

uint dst_stride_z

1782

#if defined(REINTERPRET_INPUT_AS_3D)

1783

,

1784

uint lhs_cross_plane_pad

1785

#endif // REINTERPRET_INPUT_AS_3D

1786

#if defined(REINTERPRET_OUTPUT_AS_3D)

1787

,

1788

uint dst_cross_plane_pad

1789

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

1794

1795

// RHS offset and step X

1796

#if defined(RHS_INTERLEAVE)

1797

#define RHS_OFFSET_X (N0)

1798

#define RHS_STEP_X ((N0) * (H0))

1799

#define RHS_STEP_LOOP (1)

1800

#else // defined(RHS_INTERLEAVE)

1801

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1802

#define RHS_STEP_X (N0)

1803

#define RHS_STEP_LOOP (H0)

1804

#endif // defined(RHS_INTERLEAVE)

1805

1806

uint x = get_global_id(0);

1807

uint y = get_global_id(1);

1808

uint z = get_global_id(2);

1809

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1810

#if defined(DUMMY_WORK_ITEMS)

1811

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1816

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1817

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1818

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1819

Sheri Zhang

2020-04-30 12:59:39 +0100

[diff] [blame]

1820

// Compute RHS reshaped matrix address

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1821

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1822

1823

#if defined(MATRIX_B_DEPTH)

1824

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1825

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1826

#else // defined(MATRIX_B_DEPTH)

1827

rhs_offset += z * rhs_stride_z;

1828

#endif // defined(MATRIX_B_DEPTH)

1829

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1830

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;

1831

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... (16 variables in total)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1832

1833

#if defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1834

1835

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1836

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1837

1838

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1839

// multiply lhs_stride_z by DEPTH_GEMM3D

1840

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1841

1842

#else // defined(REINTERPRET_INPUT_AS_3D)

1843

1844

// Add offset for batched GEMM

1845

lhs_offset += z * lhs_stride_z;

1846

1847

#endif // defined(REINTERPRET_INPUT_AS_3D)

1848

1849

// Initialize the accumulators

1850

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1851

1852

int i = 0;

1853

for(; i <= (K - K0); i += K0)

1854

{

1855

// Supported cases (M0, K0):

1856

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1857

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1858

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1859

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1860

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1861

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1862

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1863

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1864

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1865

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1866

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1867

VEC_DATA_TYPE(DATA_TYPE, N0)

1868

b0;

1869

1870

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1871

VFMA_M0xN0(0, a, b0, c);

1872

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));

1873

VFMA_M0xN0(1, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1874

#if K0 > 2

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1875

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));

1876

VFMA_M0xN0(2, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1877

#endif // K0 > 2

1878

#if K0 > 3

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1879

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));

1880

VFMA_M0xN0(3, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1881

#endif // K0 > 3

1882

#if K0 > 4

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1883

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));

1884

VFMA_M0xN0(4, a, b0, c);

1885

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));

1886

VFMA_M0xN0(5, a, b0, c);

1887

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));

1888

VFMA_M0xN0(6, a, b0, c);

1889

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));

1890

VFMA_M0xN0(7, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1891

#endif // K0 > 4

1892

#if K0 > 8

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1893

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));

1894

VFMA_M0xN0(8, a, b0, c);

1895

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));

1896

VFMA_M0xN0(9, a, b0, c);

1897

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));

1898

VFMA_M0xN0(A, a, b0, c);

1899

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));

1900

VFMA_M0xN0(B, a, b0, c);

1901

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));

1902

VFMA_M0xN0(C, a, b0, c);

1903

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));

1904

VFMA_M0xN0(D, a, b0, c);

1905

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));

1906

VFMA_M0xN0(E, a, b0, c);

1907

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));

1908

VFMA_M0xN0(F, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1909

#endif // K0 > 8

1910

1911

lhs_offset += K0 * sizeof(DATA_TYPE);

1912

rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);

1913

}

1914

1915

// Left-over accumulations

1916

for(; i < K; ++i)

1917

{

1918

// Load values from LHS matrix

1919

VEC_DATA_TYPE(DATA_TYPE, 2)

1920

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

1921

#if M0 > 1

1922

VEC_DATA_TYPE(DATA_TYPE, 2)

1923

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

1924

#endif // M0 > 1

1925

#if M0 > 2

1926

VEC_DATA_TYPE(DATA_TYPE, 2)

1927

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

1928

#endif // M0 > 2

1929

#if M0 > 3

1930

VEC_DATA_TYPE(DATA_TYPE, 2)

1931

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

1932

#endif // M0 > 3

1933

#if M0 > 4

1934

VEC_DATA_TYPE(DATA_TYPE, 2)

1935

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

1936

#endif // M0 > 4

1937

#if M0 > 5

1938

VEC_DATA_TYPE(DATA_TYPE, 2)

1939

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

1940

#endif // M0 > 5

1941

#if M0 > 6

1942

VEC_DATA_TYPE(DATA_TYPE, 2)

1943

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

1944

#endif // M0 > 6

1945

#if M0 > 7

1946

VEC_DATA_TYPE(DATA_TYPE, 2)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

1947

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1948

#endif // M0 > 7

1949

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

1950

VEC_DATA_TYPE(DATA_TYPE, N0)

1951

b0;

1952

1953

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1954

VFMA_M0xN0(0, a, b0, c);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1955

1956

lhs_offset += sizeof(DATA_TYPE);

1957

rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);

1958

}

1959

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1960

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1961

1962

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1963

1964

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1965

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1966

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1967

1968

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1969

// multiply dst_stride_z by DEPTH_GEMM3D

1970

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1971

1972

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1973

1974

// Add offset for batched GEMM

1975

dst_addr += z * dst_stride_z;

1976

1977

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1978

1979

// Multiply by the weight of matrix-matrix product and store the result

1980

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

1981

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

1982

#endif // defined(ALPHA)

1983

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

1984

// Add beta*bias

1985

#if defined(BETA)

1986

#if defined(BROADCAST_BIAS)

1987

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1988

1989

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

1990

1991

#ifndef UNIT_BETA

1992

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1993

#endif // UNIT_BETA

1994

1995

// c = c + bias[broadcasted]

1996

ADD_BLOCK_BROADCAST(M0, c, bias0);

1997

1998

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

1999

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Georgios Pinitas

2019-05-21 13:32:43 +0100

[diff] [blame]

2000

2001

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2002

2003

#ifndef UNIT_BETA

2004

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

2009

2010

#endif // defined(BROADCAST_BIAS)

2011

#endif // defined(BETA)

2012

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2013

#if defined(ACTIVATION_TYPE)

2014

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2015

#endif // defined(ACTIVATION_TYPE)

2016

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2017

// Store output block

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2018

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

Gian Marco Iodice

2019-03-11 12:17:44 +0000

[diff] [blame]

2019

2020

#undef RHS_BLOCK_SIZE

2021

#undef RHS_OFFSET_X

2022

#undef RHS_STEP_X

2023

}

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2024

2025

#if defined(OPENCL_IMAGE_SUPPORT)

2026

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2027

* The LHS matrix is NOT reshaped

2028

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

2029

*

2030

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2031

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2032

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90).

2033

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2034

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2035

* could be different from the value returned by get_image_height(rhs_img).

2036

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

2037

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

2038

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2039

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

2040

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

2041

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2042

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2043

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

2049

* The activation function is performed after the bias addition

2050

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

2051

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

2052

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2053

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2054

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2055

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

2056

*

2057

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F32

2058

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

2059

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

2060

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

2061

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

2062

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

2063

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2064

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2065

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2066

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2067

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2068

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2069

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2070

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2071

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2072

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2073

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2074

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2075

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2076

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

2077

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2078

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2079

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2080

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

2081

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2082

*/

2083

__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),

2084

__read_only image2d_t rhs_img,

2085

#if defined(BETA)

2086

IMAGE_DECLARATION(bias),

2087

#endif // defined(BETA)

2088

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2094

uint dst_stride_z

2095

#if defined(REINTERPRET_INPUT_AS_3D)

2096

,

2097

uint lhs_cross_plane_pad

2098

#endif // REINTERPRET_INPUT_AS_3D

2099

#if defined(REINTERPRET_OUTPUT_AS_3D)

2100

,

2101

uint dst_cross_plane_pad

2102

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

2107

2108

// Block size

2109

#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

2110

2111

// RHS offset and step X

2112

// RHS addressing constants for this kernel:
//  - RHS_OFFSET_X : scales (x % H0) to obtain the starting x_rhs of the work-item
//  - RHS_STEP_X   : added k times to x_rhs to reach the k-th READ_IMAGE2D of a K0 iteration
//  - RHS_STEP_LOOP: extra factor applied when x_rhs is advanced by K0 * RHS_STEP_X after each K0 iteration
//
// RHS_STEP_LOOP is defined here explicitly. It used to be picked up only because
// gemm_mm_reshaped_only_rhs_nt defines it and does not #undef it, so this kernel
// silently depended on that leak. The replacement lists below are token-identical
// to the ones in that kernel, so redefining the macro while the leaked definition
// is still visible is a legal (benign) redefinition.
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

2119

2120

uint x = get_global_id(0);

2121

uint y = get_global_id(1);

2122

uint z = get_global_id(2);

2123

2124

#if defined(DUMMY_WORK_ITEMS)

2125

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2130

2131

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2132

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2133

2134

#if defined(MATRIX_B_DEPTH)

2135

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2136

const uint z_rhs = (z % MATRIX_B_DEPTH);

2137

#else // defined(MATRIX_B_DEPTH)

2138

const uint z_rhs = z;

2139

#endif // defined(MATRIX_B_DEPTH)

2140

2141

// Compute RHS matrix coordinates

2142

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

2143

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

2144

2145

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);

2146

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2147

2148

#if defined(REINTERPRET_INPUT_AS_3D)

2149

2150

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2151

CALCULATE_Z_OFFSET(M0, uint, zin, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

2152

2153

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2154

// multiply lhs_stride_z by DEPTH_GEMM3D

2155

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

2156

2157

#else // defined(REINTERPRET_INPUT_AS_3D)

2158

2159

// Add offset for batched GEMM

2160

lhs_offset += z * lhs_stride_z;

2161

2162

#endif // defined(REINTERPRET_INPUT_AS_3D)

2163

2164

// Initialize the accumulators

2165

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

2166

2167

int i = 0;

2168

for(; i <= (K - K0); i += K0)

2169

{

2170

// Load values from LHS matrix

2171

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

2172

2173

VEC_DATA_TYPE(DATA_TYPE, N0)

2174

b0;

2175

2176

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2177

VFMA_M0xN0(0, a, b0, c);

2178

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

2179

VFMA_M0xN0(1, a, b0, c);

2180

#if K0 > 2

2181

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

2182

VFMA_M0xN0(2, a, b0, c);

2183

#endif // K0 > 2

2184

#if K0 > 3

2185

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

2186

VFMA_M0xN0(3, a, b0, c);

2187

#endif // K0 > 3

2188

#if K0 > 4

2189

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

2190

VFMA_M0xN0(4, a, b0, c);

2191

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

2192

VFMA_M0xN0(5, a, b0, c);

2193

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

2194

VFMA_M0xN0(6, a, b0, c);

2195

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

2196

VFMA_M0xN0(7, a, b0, c);

2197

#endif // K0 > 4

2198

#if K0 > 8

2199

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

2200

VFMA_M0xN0(8, a, b0, c);

2201

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

2202

VFMA_M0xN0(9, a, b0, c);

2203

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

2204

VFMA_M0xN0(A, a, b0, c);

2205

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

2206

VFMA_M0xN0(B, a, b0, c);

2207

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

2208

VFMA_M0xN0(C, a, b0, c);

2209

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

2210

VFMA_M0xN0(D, a, b0, c);

2211

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

2212

VFMA_M0xN0(E, a, b0, c);

2213

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

2214

VFMA_M0xN0(F, a, b0, c);

2215

#endif // K0 > 8

2216

2217

lhs_offset += K0 * sizeof(DATA_TYPE);

2218

x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;

2219

}

2220

2221

// Left-over accumulations

2222

for(; i < K; ++i)

2223

{

2224

// Load values from LHS matrix

2225

VEC_DATA_TYPE(DATA_TYPE, 2)

2226

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

2227

#if M0 > 1

2228

VEC_DATA_TYPE(DATA_TYPE, 2)

2229

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

2230

#endif // M0 > 1

2231

#if M0 > 2

2232

VEC_DATA_TYPE(DATA_TYPE, 2)

2233

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

2234

#endif // M0 > 2

2235

#if M0 > 3

2236

VEC_DATA_TYPE(DATA_TYPE, 2)

2237

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

2238

#endif // M0 > 3

2239

#if M0 > 4

2240

VEC_DATA_TYPE(DATA_TYPE, 2)

2241

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

2242

#endif // M0 > 4

2243

#if M0 > 5

2244

VEC_DATA_TYPE(DATA_TYPE, 2)

2245

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

2246

#endif // M0 > 5

2247

#if M0 > 6

2248

VEC_DATA_TYPE(DATA_TYPE, 2)

2249

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

2250

#endif // M0 > 6

2251

#if M0 > 7

2252

VEC_DATA_TYPE(DATA_TYPE, 2)

2253

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

2254

#endif // M0 > 7

2255

2256

VEC_DATA_TYPE(DATA_TYPE, N0)

2257

b0;

2258

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2259

2260

VFMA_M0xN0(0, a, b0, c);

2261

2262

lhs_offset += sizeof(DATA_TYPE);

x_rhs += RHS_STEP_X;

}

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2266

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2267

2268

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

2269

2270

#if defined(REINTERPRET_OUTPUT_AS_3D)

2271

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2272

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2273

2274

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2275

// multiply dst_stride_z by DEPTH_GEMM3D

2276

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2277

2278

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2279

2280

// Add offset for batched GEMM

2281

dst_addr += z * dst_stride_z;

2282

2283

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2284

2285

// Multiply by the weight of matrix-matrix product and store the result

2286

#if defined(ALPHA)

2287

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2288

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2293

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2294

2295

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2296

2297

#ifndef UNIT_BETA

2298

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2299

#endif // UNIT_BETA

2300

2301

// c = c + bias[broadcasted]

2302

ADD_BLOCK_BROADCAST(M0, c, bias0);

2303

2304

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2305

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2306

2307

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2308

2309

#ifndef UNIT_BETA

2310

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

2315

2316

#endif // defined(BROADCAST_BIAS)

2317

#endif // defined(BETA)

2318

2319

#if defined(ACTIVATION_TYPE)

2320

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2321

#endif // defined(ACTIVATION_TYPE)

2322

2323

// Store output block

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

2324

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2325

2326

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2331

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(M) && defined(N) && defined(K)

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2332

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2333

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR) && defined(M) && defined(N)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2334

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2335

#if defined(MIXED_PRECISION)

2336

#if K0 == 2

2337

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

})

#elif K0 == 3 // K0 == 3

2343

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

})

#elif K0 == 4 // K0 == 4

2350

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

c += a.s3 * b.s3; \

})

#elif K0 == 8 // K0 == 8

2358

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

c += a.s3 * b.s3; \

c += a.s4 * b.s4; \

c += a.s5 * b.s5; \

c += a.s6 * b.s6; \

c += a.s7 * b.s7; \

})

#elif K0 == 16 // K0 == 16

2370

#define ARM_DOT_K0(a, b, c) \

({ \

c += a.s0 * b.s0; \

c += a.s1 * b.s1; \

c += a.s2 * b.s2; \

c += a.s3 * b.s3; \

c += a.s4 * b.s4; \

c += a.s5 * b.s5; \

c += a.s6 * b.s6; \

c += a.s7 * b.s7; \

c += a.s8 * b.s8; \

c += a.s9 * b.s9; \

c += a.sA * b.sA; \

c += a.sB * b.sB; \

c += a.sC * b.sC; \

c += a.sD * b.sD; \

c += a.sE * b.sE; \

c += a.sF * b.sF; \

})

#else // K0 not supported

2390

#error "K0 value not supported"

2391

#endif // K0 conditions

2392

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2393

/* ARM_DOT_K0(a, b, c): accumulate the K0-lane dot product of a and b into c.
 *
 * For each lane N (in ascending order) the macro performs c = fma(a.sN, b.sN, c).
 * K0 is a compile-time option; only 2, 3, 4, 8 and 16 are accepted, any other value
 * stops the build with an #error.
 *
 * a, b : vector expressions with at least K0 lanes (accessed through .s0 ... .sF)
 * c    : scalar lvalue used as the accumulator (read and written once per lane)
 *
 * The arguments are expanded once per lane, so they must not have side effects.
 */
#if K0 == 2
#define ARM_DOT_K0(a, b, c)     \
    ({                          \
        c = fma(a.s0, b.s0, c); \
        c = fma(a.s1, b.s1, c); \
    })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c)     \
    ({                          \
        c = fma(a.s0, b.s0, c); \
        c = fma(a.s1, b.s1, c); \
        c = fma(a.s2, b.s2, c); \
    })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c)     \
    ({                          \
        c = fma(a.s0, b.s0, c); \
        c = fma(a.s1, b.s1, c); \
        c = fma(a.s2, b.s2, c); \
        c = fma(a.s3, b.s3, c); \
    })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c)     \
    ({                          \
        c = fma(a.s0, b.s0, c); \
        c = fma(a.s1, b.s1, c); \
        c = fma(a.s2, b.s2, c); \
        c = fma(a.s3, b.s3, c); \
        c = fma(a.s4, b.s4, c); \
        c = fma(a.s5, b.s5, c); \
        c = fma(a.s6, b.s6, c); \
        c = fma(a.s7, b.s7, c); \
    })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c)     \
    ({                          \
        c = fma(a.s0, b.s0, c); \
        c = fma(a.s1, b.s1, c); \
        c = fma(a.s2, b.s2, c); \
        c = fma(a.s3, b.s3, c); \
        c = fma(a.s4, b.s4, c); \
        c = fma(a.s5, b.s5, c); \
        c = fma(a.s6, b.s6, c); \
        c = fma(a.s7, b.s7, c); \
        c = fma(a.s8, b.s8, c); \
        c = fma(a.s9, b.s9, c); \
        c = fma(a.sA, b.sA, c); \
        c = fma(a.sB, b.sB, c); \
        c = fma(a.sC, b.sC, c); \
        c = fma(a.sD, b.sD, c); \
        c = fma(a.sE, b.sE, c); \
        c = fma(a.sF, b.sF, c); \
    })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2449

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2450

2451

/* ARM_DOT_K0XN0(a, b, c): accumulate N0 dot products into the N0 lanes of c.
 *
 * For each n in [0, N0) the macro expands ARM_DOT_K0(a, b<n>, c.s<n>), i.e. lane n of c
 * receives the dot product of a with the n-th "b" vector.
 * N0 is a compile-time option; only 2, 3, 4, 8 and 16 are accepted, any other value
 * stops the build with an #error.
 *
 * a : vector expression forwarded unchanged to every ARM_DOT_K0 expansion
 * b : BASENAME of the right-hand vectors, not an expression; the lane suffix is
 *     token-pasted onto it (b##0 ... b##F), so variables named <b>0 ... <b>(N0-1)
 *     (hexadecimal suffix A-F for indices 10-15) must be in scope
 * c : vector lvalue with at least N0 lanes holding the accumulators
 */
#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
        ARM_DOT_K0((a), (b##4), (c.s4)); \
        ARM_DOT_K0((a), (b##5), (c.s5)); \
        ARM_DOT_K0((a), (b##6), (c.s6)); \
        ARM_DOT_K0((a), (b##7), (c.s7)); \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
        ARM_DOT_K0((a), (b##4), (c.s4)); \
        ARM_DOT_K0((a), (b##5), (c.s5)); \
        ARM_DOT_K0((a), (b##6), (c.s6)); \
        ARM_DOT_K0((a), (b##7), (c.s7)); \
        ARM_DOT_K0((a), (b##8), (c.s8)); \
        ARM_DOT_K0((a), (b##9), (c.s9)); \
        ARM_DOT_K0((a), (b##A), (c.sA)); \
        ARM_DOT_K0((a), (b##B), (c.sB)); \
        ARM_DOT_K0((a), (b##C), (c.sC)); \
        ARM_DOT_K0((a), (b##D), (c.sD)); \
        ARM_DOT_K0((a), (b##E), (c.sE)); \
        ARM_DOT_K0((a), (b##F), (c.sF)); \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

2507

2508

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

2509

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2510

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2511

*

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2512

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2513

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2514

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2515

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2516

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2517

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2518

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2519

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2520

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2521

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

2522

* @note Only the following configurations of M0, N0 and K0 are currently supported:

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2523

* - M0 = 2, 3, 4, 5, 6, 7, 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

2524

* - N0 = 2, 3, 4, 8, 16

2525

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

2526

* - V0 >= 1

2527

* - H0 >= 1

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2528

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2529

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2530

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

2531

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2532

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2533

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2534

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2535

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2536

*

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2537

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

2538

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2539

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

2540

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2541

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

2542

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2543

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

2544

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

2545

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem (in bytes)

2546

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

2547

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem (in bytes)

2548

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

2549

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2550

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2551

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2552

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2553

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2554

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2555

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2556

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2557

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2558

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2559

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2560

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2561

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2562

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2563

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2564

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2565

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2566

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2567

*/

2568

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

2569

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2570

#if defined(BETA)

2571

IMAGE_DECLARATION(bias),

2572

#endif // defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2573

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2574

uint k,

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2575

uint lhs_stride_z,

2576

uint rhs_stride_z,

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2577

#if defined(BETA)

2578

uint bias_stride_z,

2579

#endif //defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2580

uint dst_stride_z

2581

#if defined(REINTERPRET_OUTPUT_AS_3D)

2582

,

2583

uint dst_cross_plane_pad

2584

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

2589

2590

#if defined(LHS_INTERLEAVE)

2591

#define LHS_OFFSET_X (K0)

2592

#define LHS_STEP_X ((K0) * (V0))

2593

#define LHS_STEP_LOOP (1)

2594

#else // defined(INTERLEAVE)

2595

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2596

#define LHS_STEP_X (K0)

2597

#define LHS_STEP_LOOP (V0)

2598

#endif // defined(INTERLEAVE)

2599

2600

// Block size

2601

#define RHS_BLOCK_SIZE ((K0) * (N0))

2602

2603

// RHS offset and step X

2604

#if defined(RHS_INTERLEAVE)

2605

#define RHS_OFFSET_X (K0)

2606

#define RHS_STEP_X ((K0) * (H0))

2607

#define RHS_STEP_LOOP (1)

2608

#else // defined(RHS_INTERLEAVE)

2609

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2610

#define RHS_STEP_X (K0)

2611

#define RHS_STEP_LOOP (H0)

2612

#endif // defined(RHS_INTERLEAVE)

2613

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

2614

#if defined(DUMMY_WORK_ITEMS)

2615

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2620

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2621

// Compute LHS matrix address

2622

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2623

(get_global_id(2) * lhs_stride_z);

2624

2625

// Compute RHS matrix address

2626

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

2627

2628

#if defined(MATRIX_B_DEPTH)

2629

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2630

rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;

2631

#else // defined(MATRIX_B_DEPTH)

2632

rhs_addr += get_global_id(2) * rhs_stride_z;

2633

#endif // defined(MATRIX_B_DEPTH)

2634

2635

// Initialize the accumulators

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2636

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2637

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2638

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2639

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2640

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2641

for(int i = 0; i < k; i += K0)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2642

{

2643

// Supported cases (M0, K0):

Gian Marco Iodice

2019-02-15 11:10:31 +0000

[diff] [blame]

2644

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

2645

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

2646

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

2647

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

2648

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

2649

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

2650

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

2651

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2652

// Load values from LHS matrix

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2653

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2654

2655

// Load values from RHS matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2656

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2657

2658

// Accumulate

2659

ARM_DOT_K0XN0(a0, b, c0);

2660

#if M0 > 1

2661

ARM_DOT_K0XN0(a1, b, c1);

2662

#endif // M0 > 1

2663

#if M0 > 2

2664

ARM_DOT_K0XN0(a2, b, c2);

2665

#endif // M0 > 2

2666

#if M0 > 3

2667

ARM_DOT_K0XN0(a3, b, c3);

2668

#endif // M0 > 3

2669

#if M0 > 4

2670

ARM_DOT_K0XN0(a4, b, c4);

2671

#endif // M0 > 4

2672

#if M0 > 5

2673

ARM_DOT_K0XN0(a5, b, c5);

2674

#endif // M0 > 5

2675

#if M0 > 6

2676

ARM_DOT_K0XN0(a6, b, c6);

2677

#endif // M0 > 6

2678

#if M0 > 7

2679

ARM_DOT_K0XN0(a7, b, c7);

2680

#endif // M0 > 7

2681

2682

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2683

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

2684

}

2685

2686

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2687

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2688

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2689

2690

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2691

2692

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2693

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2694

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2695

// multiply dst_stride_z by DEPTH_GEMM3D

2696

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2697

2698

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2699

2700

// Add offset for batched GEMM

2701

dst_addr += get_global_id(2) * dst_stride_z;

2702

2703

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2704

2705

// Multiply by the weight of matrix-matrix product and store the result

2706

#if defined(ALPHA)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2707

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2708

#endif // defined(ALPHA)

2709

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2710

// Add beta*bias

2711

#if defined(BETA)

2712

#if defined(BROADCAST_BIAS)

2713

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2714

2715

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2716

2717

#ifndef UNIT_BETA

2718

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2719

#endif // UNIT_BIAS

2720

2721

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2722

#if defined(MIXED_PRECISION)

2723

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2724

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2725

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2726

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2727

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2728

2729

#else // defined(BROADCAST_BIAS)

2730

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2731

2) * bias_stride_z;

2732

2733

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2734

2735

#ifndef UNIT_BETA

2736

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

2737

#endif // UNIT_BIAS

2738

2739

// c = c + bias

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2740

#if defined(MIXED_PRECISION)

2741

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2742

ADD_BLOCK(M0, c, bias_hp);

2743

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2744

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2745

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2746

2747

#endif // defined(BROADCAST_BIAS)

2748

#endif // defined(BETA)

2749

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2750

#if defined(ACTIVATION_TYPE)

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2751

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2752

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

2753

#else // defined(MIXED_PRECISION)

2754

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

2755

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

2756

#endif // defined(ACTIVATION_TYPE)

2757

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2758

// Store output block

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2759

#if defined(MIXED_PRECISION)

2760

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

2761

#else // defined(MIXED_PRECISION)

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

2762

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

2763

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

2764

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2765

#undef LHS_BLOCK_SIZE

2766

#undef LHS_OFFSET_X

2767

#undef LHS_STEP_X

2768

#undef RHS_BLOCK_SIZE

2769

#undef RHS_OFFSET_X

2770

#undef RHS_STEP_X

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2771

#undef LHS_STEP_LOOP

2772

#undef RHS_STEP_LOOP

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

2773

}

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

2774

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2775

#if defined(OPENCL_IMAGE_SUPPORT)

2776

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

2777

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

2778

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

2779

*

2780

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2781

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

2782

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

2783

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

2784

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

2785

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

2786

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2787

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2788

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2789

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2790

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2791

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2792

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2793

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

2794

* @note Only the following configurations of M0, N0 and K0 are currently supported:

2795

* - M0 = 2, 3, 4, 5, 6, 7, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

2802

* The activation function is performed after the bias addition

2803

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

2804

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2805

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2806

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2807

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2808

*

2809

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

2810

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2811

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem (in bytes)

2812

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2813

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem (in bytes)

2814

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2815

* @param[in] rhs_img The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr

2816

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2817

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2818

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2819

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2820

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2821

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2822

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2823

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2824

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2825

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2826

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2827

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2828

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2829

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2830

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2831

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2832

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2833

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

2834

*/

2835

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),

2836

__read_only image2d_t rhs_img,

2837

#if defined(BETA)

2838

IMAGE_DECLARATION(bias),

2839

#endif // defined(BETA)

2840

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

2841

uint k,

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2847

uint dst_stride_z

2848

#if defined(REINTERPRET_OUTPUT_AS_3D)

2849

,

2850

uint dst_cross_plane_pad

2851

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

2856

2857

// Block size

2858

#define LHS_BLOCK_SIZE ((K0) * (M0))

2859

2860

#if defined(LHS_INTERLEAVE)

2861

#define LHS_OFFSET_X (K0)

2862

#define LHS_STEP_X ((K0) * (V0))

2863

#define LHS_STEP_LOOP (1)

2864

#else // defined(INTERLEAVE)

2865

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2866

#define LHS_STEP_X (K0)

2867

#define LHS_STEP_LOOP (V0)

2868

#endif // defined(INTERLEAVE)

2869

2870

// Block size

2871

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

2872

2873

// RHS offset and step X

2874

#if defined(RHS_INTERLEAVE)

2875

#define RHS_OFFSET_X (PIXEL_UNIT)

2876

#define RHS_STEP_X (PIXEL_UNIT * (H0))

2877

#define RHS_STEP_LOOP (1)

2878

#else // defined(RHS_INTERLEAVE)

2879

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2880

#define RHS_STEP_X PIXEL_UNIT

2881

#define RHS_STEP_LOOP (H0)

2882

#endif // defined(RHS_INTERLEAVE)

2883

2884

#if defined(DUMMY_WORK_ITEMS)

2885

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2890

2891

// Compute LHS matrix address

2892

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

2893

(get_global_id(2) * lhs_stride_z);

2894

2895

#if defined(MATRIX_B_DEPTH)

2896

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2897

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

2898

#else // defined(MATRIX_B_DEPTH)

2899

const uint z_rhs = get_global_id(2);

2900

#endif // defined(MATRIX_B_DEPTH)

2901

2902

// Compute RHS matrix coordinates

2903

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

2904

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

2905

2906

// Initialize the accumulators

2907

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

2908

2909

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

2910

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

2911

2912

for(int i = 0; i < K; i += K0)

2913

{

2914

// Load values from LHS matrix

2915

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

2916

2917

// Load values from RHS matrix stored in a cl_image

2918

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

2919

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

2920

2921

// Accumulate

2922

ARM_DOT_K0XN0(a0, b, c0);

2923

#if M0 > 1

2924

ARM_DOT_K0XN0(a1, b, c1);

2925

#endif // M0 > 1

2926

#if M0 > 2

2927

ARM_DOT_K0XN0(a2, b, c2);

2928

#endif // M0 > 2

2929

#if M0 > 3

2930

ARM_DOT_K0XN0(a3, b, c3);

2931

#endif // M0 > 3

2932

#if M0 > 4

2933

ARM_DOT_K0XN0(a4, b, c4);

2934

#endif // M0 > 4

2935

#if M0 > 5

2936

ARM_DOT_K0XN0(a5, b, c5);

2937

#endif // M0 > 5

2938

#if M0 > 6

2939

ARM_DOT_K0XN0(a6, b, c6);

2940

#endif // M0 > 6

2941

#if M0 > 7

2942

ARM_DOT_K0XN0(a7, b, c7);

2943

#endif // M0 > 7

2944

2945

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

2946

2947

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

2948

}

2949

2950

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

2951

2952

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

2953

2954

#if defined(REINTERPRET_OUTPUT_AS_3D)

2955

2956

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2957

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2958

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2959

// multiply dst_stride_z by DEPTH_GEMM3D

2960

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

2961

2962

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2963

2964

// Add offset for batched GEMM

2965

dst_addr += get_global_id(2) * dst_stride_z;

2966

2967

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2968

2969

// Multiply by the weight of matrix-matrix product and store the result

2970

#if defined(ALPHA)

2971

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

2972

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

2977

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

2978

2979

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2980

2981

#ifndef UNIT_BETA

2982

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

2983

#endif // UNIT_BETA

2984

2985

// c = c + bias[broadcasted]

2986

#if defined(MIXED_PRECISION)

2987

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

2988

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

2989

#else // defined(MIXED_PRECISION)

2990

ADD_BLOCK_BROADCAST(M0, c, bias0);

2991

#endif // defined(MIXED_PRECISION)

2992

2993

#else // defined(BROADCAST_BIAS)

2994

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

2995

2) * bias_stride_z;

2996

2997

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

2998

2999

#ifndef UNIT_BETA

3000

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

#if defined(MIXED_PRECISION)

3005

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3006

ADD_BLOCK(M0, c, bias_hp);

3007

#else // defined(MIXED_PRECISION)

3008

ADD_BLOCK(M0, c, bias);

3009

#endif // defined(MIXED_PRECISION)

3010

3011

#endif // defined(BROADCAST_BIAS)

3012

#endif // defined(BETA)

3013

3014

#if defined(ACTIVATION_TYPE)

3015

#if defined(MIXED_PRECISION)

3016

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

3017

#else // defined(MIXED_PRECISION)

3018

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

3019

#endif // defined(MIXED_PRECISION)

3020

#endif // defined(ACTIVATION_TYPE)

3021

3022

// Store output block

3023

#if defined(MIXED_PRECISION)

3024

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3025

#else // defined(MIXED_PRECISION)

3026

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3027

#endif // defined(MIXED_PRECISION)

3028

3029

#undef LHS_BLOCK_SIZE

3030

#undef LHS_OFFSET_X

3031

#undef LHS_STEP_X

3032

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3040

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3041

#if defined(LHS_TRANSPOSE)

3042

3043

#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

3044

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3045

#if defined(MIXED_PRECISION)

3046

3047

#if(GPU_ARCH == GPU_ARCH_MIDGARD)

3048

#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3049

#else // GPU_ARCH == GPU_ARCH_MIDGARD

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3050

#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3051

#endif // GPU_ARCH == GPU_ARCH_MIDGARD

3052

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3053

#else // defined(MIXED_PRECISION)

3054

3055

#if(GPU_ARCH == GPU_ARCH_MIDGARD)

3056

#define ARM_VFMA(N0, a, b, c) c += (a) * (b);

3057

#else // GPU_ARCH == GPU_ARCH_MIDGARD

3058

#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));

3059

#endif // GPU_ARCH == GPU_ARCH_MIDGARD

3060

3061

#endif // defined(MIXED_PRECISION)

3062

3063

#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C) \

3064

({ \

3065

ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3066

})

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3067

#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C) \

3068

({ \

3069

ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \

3070

ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3071

})

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3072

#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C) \

3073

({ \

3074

ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C); \

3075

ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3076

})

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3077

#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C) \

3078

({ \

3079

ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C); \

3080

ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3081

})

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3082

#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C) \

3083

({ \

3084

ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C); \

3085

ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \

3086

ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \

3087

ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \

3088

ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3089

})

3090

3091

// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1

3092

// a is the column-vector (transposed)

3093

// b is the row-vector (not transposed)

3094

// C is the output matrix

3095

// Lower case is a vector (a, b)

3096

// Upper case is a matrix (C)

3097

#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)

3098

3099

#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \

3100

({ \

3101

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \

3102

})

3103

#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \

3104

({ \

3105

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \

3106

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \

3107

})

3108

#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \

3109

({ \

3110

ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \

3111

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \

3112

})

3113

#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C) \

3114

({ \

3115

ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C); \

3116

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \

3117

})

3118

#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C) \

3119

({ \

3120

ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C); \

3121

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \

3122

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \

3123

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \

3124

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \

3125

})

3126

// Matrix (transposed) by matrix (not transposed) multiplication for K0 = 16.
// Steps 0..7 are delegated to the K0 = 8 variant. Steps 8..F are issued directly through the
// column-vector by row-vector macro, because their operands are already complete identifiers
// (e.g. (a8), (b8)) and must not go through another macro that appends a "0" suffix to them.
// The matrix parameters are intentionally NOT named A, B and C: a parameter that shares its name
// with a hexadecimal suffix would be substituted inside MAT_A##A / MAT_A##B / MAT_A##C and the
// paste would yield the wrong identifier instead of the vectors with suffix A, B and C.
// MAT_A and MAT_B are the base names of the input vectors, MAT_C the base name of the accumulators.
#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, MAT_A, MAT_B, MAT_C)            \
    ({                                                                     \
        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, MAT_A, MAT_B, MAT_C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (MAT_A##8), (MAT_B##8), MAT_C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (MAT_A##9), (MAT_B##9), MAT_C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (MAT_A##A), (MAT_B##A), MAT_C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (MAT_A##B), (MAT_B##B), MAT_C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (MAT_A##C), (MAT_B##C), MAT_C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (MAT_A##D), (MAT_B##D), MAT_C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (MAT_A##E), (MAT_B##E), MAT_C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (MAT_A##F), (MAT_B##F), MAT_C); \
    })

3138

3139

// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.

3140

// The dimensions for this matrix multiplications are defined through M0, N0 and K0

3141

// The dimensions supported are:

3142

// M0: 1, 2, 3, 4, 8

3143

// N0: 1, 2, 3, 4, 8, 16

3144

// K0: 1, 2, 3, 4, 8, 16

3145

// This macro calls the vector-by-matrix macro K0 times

3146

// A, B and C are matrices

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3147

#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \

3148

CONCAT(ARM_MM_T_NT_M0xN0x, K0) \

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3149

(M0, N0, TYPE, A, B, C)

3150

3151

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

3152

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

3153

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

3154

*

3155

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

3156

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3157

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3158

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

3159

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

3160

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

3161

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

3162

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

3163

* @note Only the following configurations of M0, N0 and K0 are currently supported:

3164

* - M0 = 2, 3, 4, 8

3165

* - N0 = 2, 3, 4, 8, 16

3166

* - K0 = 2, 3, 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3171

* The activation function is performed after the bias addition

3172

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

3173

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3174

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3175

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3176

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

3177

*

3178

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

3179

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

3180

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3181

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

3182

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

3183

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

3184

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

3185

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

3186

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3187

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

3188

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

3189

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

3190

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

3191

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3192

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

3193

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3194

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

3195

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

3196

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

3197

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3198

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3199

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3200

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3201

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3202

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3203

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

3204

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

3205

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

3206

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3207

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3208

*/

3209

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),

3210

IMAGE_DECLARATION(rhs),

3211

#if defined(BETA)

3212

IMAGE_DECLARATION(bias),

3213

#endif // defined(BETA)

3214

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3215

uint k,

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

3221

uint dst_stride_z

3222

#if defined(REINTERPRET_OUTPUT_AS_3D)

3223

,

3224

uint dst_cross_plane_pad

3225

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

3230

3231

#if defined(LHS_INTERLEAVE)

3232

#define LHS_OFFSET_X (M0)

3233

#define LHS_STEP_X ((M0) * (V0))

3234

#define LHS_STEP_LOOP (1)

3235

#else // defined(LHS_INTERLEAVE)

3236

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

3237

#define LHS_STEP_X (M0)

3238

#define LHS_STEP_LOOP (V0)

3239

#endif // defined(LHS_INTERLEAVE)

3240

3241

// Block size

3242

#define RHS_BLOCK_SIZE ((K0) * (N0))

3243

3244

// RHS offset and step X

3245

#if defined(RHS_INTERLEAVE)

3246

#define RHS_OFFSET_X (N0)

3247

#define RHS_STEP_X ((N0) * (H0))

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3248

#else // defined(RHS_INTERLEAVE)

3249

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

3250

#define RHS_STEP_X (N0)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3251

#endif // defined(RHS_INTERLEAVE)

3252

3253

const uint x = get_global_id(0);

3254

const uint y = get_global_id(1);

3255

const uint z = get_global_id(2);

3256

3257

#if defined(DUMMY_WORK_ITEMS)

3258

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

3263

3264

// Compute LHS matrix address

3265

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

3266

3267

// Compute RHS matrix address

3268

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

3269

3270

#if defined(MATRIX_B_DEPTH)

3271

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3272

rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;

3273

#else // defined(MATRIX_B_DEPTH)

3274

rhs_addr += z * rhs_stride_z;

3275

#endif // defined(MATRIX_B_DEPTH)

3276

3277

// Initialize the accumulators

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3278

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3279

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3280

REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

3281

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3282

__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

3283

__global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);

3284

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3285

for(int i = 0; i < k; i += K0)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3286

{

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3287

VEC_DATA_TYPE(DATA_TYPE, M0)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3288

a0;

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3289

VEC_DATA_TYPE(DATA_TYPE, N0)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3290

b0;

3291

3292

a0 = VLOAD(M0)(0, lhs);

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3293

b0 = VLOAD(N0)(0, rhs);

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3294

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3295

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3296

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3297

lhs += LHS_STEP_X;

3298

rhs += RHS_STEP_X;

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3299

Gian Marco Iodice

2019-09-24 12:05:06 +0100

[diff] [blame]

3300

#if K0 > 1

3301

a0 = VLOAD(M0)(0, lhs);

3302

b0 = VLOAD(N0)(0, rhs);

3303

3304

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 1

#if K0 > 2

a0 = VLOAD(M0)(0, lhs);

3312

b0 = VLOAD(N0)(0, rhs);

3313

3314

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 2

#if K0 > 3

a0 = VLOAD(M0)(0, lhs);

3322

b0 = VLOAD(N0)(0, rhs);

3323

3324

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 3

#if K0 > 4

a0 = VLOAD(M0)(0, lhs);

3332

b0 = VLOAD(N0)(0, rhs);

3333

3334

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3340

b0 = VLOAD(N0)(0, rhs);

3341

3342

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3348

b0 = VLOAD(N0)(0, rhs);

3349

3350

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3356

b0 = VLOAD(N0)(0, rhs);

3357

3358

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 4

#if K0 > 8

a0 = VLOAD(M0)(0, lhs);

3366

b0 = VLOAD(N0)(0, rhs);

3367

3368

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3374

b0 = VLOAD(N0)(0, rhs);

3375

3376

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3382

b0 = VLOAD(N0)(0, rhs);

3383

3384

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3390

b0 = VLOAD(N0)(0, rhs);

3391

3392

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3398

b0 = VLOAD(N0)(0, rhs);

3399

3400

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3406

b0 = VLOAD(N0)(0, rhs);

3407

3408

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3414

b0 = VLOAD(N0)(0, rhs);

3415

3416

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3422

b0 = VLOAD(N0)(0, rhs);

3423

3424

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

rhs += RHS_STEP_X;

#endif // K0 > 8

#ifndef LHS_INTERLEAVE

3431

lhs += (M0 * K0 * (V0 - 1));

3432

#endif // LHS_INTERLEAVE

3433

3434

#ifndef RHS_INTERLEAVE

3435

rhs += (N0 * K0 * (H0 - 1));

3436

#endif // RHS_INTERLEAVE

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3437

}

3438

3439

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

3440

3441

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

3442

3443

#if defined(REINTERPRET_OUTPUT_AS_3D)

3444

3445

// The plane (zout) is calculated by dividing M (y * M0) by HEIGHT_GEMM3D

3446

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

3447

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3448

// multiply dst_stride_z by DEPTH_GEMM3D

3449

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

3450

3451

#else // defined(REINTERPRET_OUTPUT_AS_3D)

3452

3453

// Add offset for batched GEMM

3454

dst_addr += z * dst_stride_z;

3455

3456

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3457

3458

// Multiply by the weight of matrix-matrix product and store the result

3459

#if defined(ALPHA)

3460

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

3461

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

3466

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

3467

3468

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3469

3470

#ifndef UNIT_BETA

3471

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

3472

#endif // UNIT_BETA

3473

3474

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3475

#if defined(MIXED_PRECISION)

3476

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3477

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

3478

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3479

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3480

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3481

3482

#else // defined(BROADCAST_BIAS)

3483

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

3484

3485

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3486

3487

#ifndef UNIT_BETA

3488

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

3489

#endif // UNIT_BETA

3490

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3491

#if defined(MIXED_PRECISION)

3492

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3493

ADD_BLOCK(M0, c, bias_hp);

3494

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3495

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3496

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3497

3498

#endif // defined(BROADCAST_BIAS)

3499

#endif // defined(BETA)

3500

3501

#if defined(ACTIVATION_TYPE)

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

3502

#if defined(MIXED_PRECISION)

3503

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

3504

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3505

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

3506

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3507

#endif // defined(ACTIVATION_TYPE)

3508

3509

// Store output block

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3510

#if defined(MIXED_PRECISION)

3511

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3512

#else // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3513

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

3514

#endif // defined(MIXED_PRECISION)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3515

3516

#undef LHS_BLOCK_SIZE

3517

#undef LHS_OFFSET_X

3518

#undef LHS_STEP_X

3519

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3524

#if defined(OPENCL_IMAGE_SUPPORT)

3525

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

3526

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

3527

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

3528

*

3529

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

3530

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3531

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

3532

* @note The GEMM's dimensions M, N and K must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=90 and -DK=24).

Gian Marco Iodice

2020-06-19 16:56:57 +0100

[diff] [blame]

3533

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

3534

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

3535

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3536

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

3537

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

3538

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

3539

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

3540

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

3541

* @note Only the following configurations of M0, N0 and K0 are currently supported:

* - M0 = 2, 3, 4, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

3549

* The activation function is performed after the bias addition

3550

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

3551

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3552

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3553

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3554

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

3555

*

3556

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

3557

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

3558

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3559

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

3560

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

3561

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

3562

* @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr

3563

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

3564

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

3565

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

3566

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

3567

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

3568

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

3569

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

3570

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3571

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3572

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3573

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

3574

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3575

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3576

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

3577

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

3578

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

3579

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3580

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

3581

*/

3582

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),

3583

__read_only image2d_t rhs_img,

3584

#if defined(BETA)

3585

IMAGE_DECLARATION(bias),

3586

#endif // defined(BETA)

3587

IMAGE_DECLARATION(dst),

Gian Marco Iodice

2020-06-25 17:18:36 +0100

[diff] [blame]

3588

uint k,

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

3594

uint dst_stride_z

3595

#if defined(REINTERPRET_OUTPUT_AS_3D)

3596

,

3597

uint dst_cross_plane_pad

3598

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Pixel unit

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

3603

3604

// Block size

3605

#define LHS_BLOCK_SIZE ((K0) * (M0))

3606

3607

#if defined(LHS_INTERLEAVE)

3608

#define LHS_OFFSET_X (M0)

3609

#define LHS_STEP_X ((M0) * (V0))

3610

#define LHS_STEP_LOOP (1)

3611

#else // defined(LHS_INTERLEAVE)

3612

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

3613

#define LHS_STEP_X (M0)

3614

#define LHS_STEP_LOOP (V0)

3615

#endif // defined(LHS_INTERLEAVE)

3616

3617

// Block size

3618

#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

3619

3620

// RHS offset and step X

3621

#if defined(RHS_INTERLEAVE)

3622

#define RHS_OFFSET_X (PIXEL_UNIT)

3623

#define RHS_STEP_X ((PIXEL_UNIT) * (H0))

3624

#else // defined(RHS_INTERLEAVE)

3625

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

3626

#define RHS_STEP_X (PIXEL_UNIT)

3627

#endif // defined(RHS_INTERLEAVE)

3628

3629

const uint x = get_global_id(0);

3630

const uint y = get_global_id(1);

3631

const uint z = get_global_id(2);

3632

3633

#if defined(DUMMY_WORK_ITEMS)

3634

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

3639

3640

// Compute LHS matrix address

3641

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

3642

3643

#if defined(MATRIX_B_DEPTH)

3644

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

3645

const uint z_rhs = (z % MATRIX_B_DEPTH);

3646

#else // defined(MATRIX_B_DEPTH)

3647

const uint z_rhs = z;

3648

#endif // defined(MATRIX_B_DEPTH)

3649

3650

// Compute RHS matrix coordinates

3651

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

3652

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

3653

3654

// Initialize the accumulators

3655

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

3656

3657

REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

3658

3659

__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

3660

3661

for(int i = 0; i < K; i += K0)

3662

{

3663

VEC_DATA_TYPE(DATA_TYPE, M0)

3664

a0;

3665

VEC_DATA_TYPE(DATA_TYPE, N0)

3666

b0;

3667

3668

a0 = VLOAD(M0)(0, lhs);

3669

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

3670

3671

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#if K0 > 1

a0 = VLOAD(M0)(0, lhs);

3677

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

3678

3679

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 1

#if K0 > 2

a0 = VLOAD(M0)(0, lhs);

3686

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

3687

3688

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 2

#if K0 > 3

a0 = VLOAD(M0)(0, lhs);

3695

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

3696

3697

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 3

#if K0 > 4

a0 = VLOAD(M0)(0, lhs);

3704

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

3705

3706

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3711

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

3712

3713

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3718

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

3719

3720

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3725

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

3726

3727

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 4

#if K0 > 8

a0 = VLOAD(M0)(0, lhs);

3734

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

3735

3736

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3741

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

3742

3743

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3748

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

3749

3750

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3755

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

3756

3757

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3762

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

3763

3764

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3769

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

3770

3771

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3776

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

3777

3778

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3783

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

3784

3785

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 8

#ifndef LHS_INTERLEAVE

3791

lhs += (M0 * K0 * (V0 - 1));

3792

#endif // LHS_INTERLEAVE

3793

3794

x_rhs += K0 * RHS_STEP_X;

3795

#ifndef RHS_INTERLEAVE

3796

x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));

3797

#endif // RHS_INTERLEAVE

3798

}

3799

3800

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

3801

3802

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

3803

3804

#if defined(REINTERPRET_OUTPUT_AS_3D)

3805

3806

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

3807

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

3808

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3809

// multiply dst_stride_z by DEPTH_GEMM3D

3810

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

3811

3812

#else // defined(REINTERPRET_OUTPUT_AS_3D)

3813

3814

// Add offset for batched GEMM

3815

dst_addr += z * dst_stride_z;

3816

3817

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3818

3819

// Multiply by the weight of matrix-matrix product and store the result

3820

#if defined(ALPHA)

3821

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

3822

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

3827

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

3828

3829

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3830

3831

#ifndef UNIT_BETA

3832

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

3833

#endif // UNIT_BETA

3834

3835

// c = c + bias[broadcasted]

3836

#if defined(MIXED_PRECISION)

3837

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3838

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

3839

#else // defined(MIXED_PRECISION)

3840

ADD_BLOCK_BROADCAST(M0, c, bias0);

3841

#endif // defined(MIXED_PRECISION)

3842

3843

#else // defined(BROADCAST_BIAS)

3844

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

3845

3846

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

3847

3848

#ifndef UNIT_BETA

3849

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

3850

#endif // UNIT_BETA

3851

3852

#if defined(MIXED_PRECISION)

3853

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3854

ADD_BLOCK(M0, c, bias_hp);

3855

#else // defined(MIXED_PRECISION)

3856

ADD_BLOCK(M0, c, bias);

3857

#endif // defined(MIXED_PRECISION)

3858

3859

#endif // defined(BROADCAST_BIAS)

3860

#endif // defined(BETA)

3861

3862

#if defined(ACTIVATION_TYPE)

3863

#if defined(MIXED_PRECISION)

3864

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, c, A_VAL, B_VAL);

3865

#else // defined(MIXED_PRECISION)

3866

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

3867

#endif // defined(MIXED_PRECISION)

3868

#endif // defined(ACTIVATION_TYPE)

3869

3870

// Store output block

3871

#if defined(MIXED_PRECISION)

3872

CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3873

#else // defined(MIXED_PRECISION)

3874

STORE_BLOCK(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout);

3875

#endif // defined(MIXED_PRECISION)

3876

3877

#undef LHS_BLOCK_SIZE

3878

#undef LHS_OFFSET_X

3879

#undef LHS_STEP_X

3880

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

#endif // defined(OPENCL_IMAGE_SUPPORT)

3888

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3889

#endif // defined(LHS_TRANSPOSE)

3890

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

3891

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K) && defined(DATA_TYPE)

3892

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3893

#if defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

3894

3895

/** VFMA(a, b, c): in-place fused multiply-add, c = fma(a, b, c).
 *
 * @note c appears on both sides of the assignment, so it is expanded twice and must be a modifiable lvalue.
 * @note The body is wrapped in a statement expression "({ ... })", which is a compiler extension.
 */
#define VFMA(a, b, c) \

({ \

c = fma(a, b, c); \

})

/** RHS_VFMA_M0xN0(i, a, b, c): accumulate one step into each of the M0 accumulator rows.
 *
 * For every row index r in [0, M0) the macro expands to
 *   VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##r).s##i), b, (c##r));
 * i.e. component .s##i of a##r is cast to VEC_DATA_TYPE(DATA_TYPE, N0), multiplied by b and added into c##r.
 *
 * @note i is token-pasted after ".s", so it must be a literal component suffix, not a runtime expression.
 * @note a and c are name prefixes: the row index (0 .. M0-1) is token-pasted onto them.
 * @note Exactly one variant is defined, selected by M0 (1 to 8); any other M0 stops compilation with #error.
 * @note VEC_DATA_TYPE is defined outside this section; presumably it names an N0-wide vector type, in which case the cast broadcasts the scalar - TODO confirm.
 */
#if M0 == 1

#define RHS_VFMA_M0xN0(i, a, b, c) \

3902

({ \

3903

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3904

})

3905

#elif M0 == 2 // M0 == 2

3906

#define RHS_VFMA_M0xN0(i, a, b, c) \

3907

({ \

3908

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3909

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3910

})

3911

#elif M0 == 3 // M0 == 3

3912

#define RHS_VFMA_M0xN0(i, a, b, c) \

3913

({ \

3914

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3915

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3916

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3917

})

3918

#elif M0 == 4 // M0 == 4

3919

#define RHS_VFMA_M0xN0(i, a, b, c) \

3920

({ \

3921

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3922

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3923

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3924

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

3925

})

3926

#elif M0 == 5 // M0 == 5

3927

#define RHS_VFMA_M0xN0(i, a, b, c) \

3928

({ \

3929

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3930

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3931

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3932

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

3933

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

3934

})

3935

#elif M0 == 6 // M0 == 6

3936

#define RHS_VFMA_M0xN0(i, a, b, c) \

3937

({ \

3938

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3939

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3940

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3941

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

3942

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

3943

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \

3944

})

3945

#elif M0 == 7 // M0 == 7

3946

#define RHS_VFMA_M0xN0(i, a, b, c) \

3947

({ \

3948

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3949

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3950

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3951

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

3952

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

3953

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \

3954

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \

3955

})

3956

#elif M0 == 8 // M0 == 8

3957

#define RHS_VFMA_M0xN0(i, a, b, c) \

3958

({ \

3959

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

3960

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

3961

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

3962

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

3963

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

3964

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \

3965

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \

3966

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \

3967

})

3968

#else // M0 not supported

3969

#error "M0 not supported"

3970

#endif // M0 not supported

3971

3972

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

3973

* The LHS matrix is NOT reshaped

3974

* The RHS matrix is NOT reshaped

3975

*

3976

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3977

* @note The GEMM's dimensions (M,N and K) must be passed at compile time using -DM, -DN and -DK (e.g. -DM=52, -DN=30 and -DK=90)

3978

* @note The number of columns of LHS matrix must be passed at compile time using -DK (e.g. -DK=64)

3979

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

3980

* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)

3981

* @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)

SiCong Li

2020-06-26 10:02:06 +0100

[diff] [blame]

3982

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

3983

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3984

* @note Only the following configurations of M0, N0 and K0 are currently supported:

3985

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

3986

* - N0 = 2, 3, 4, 8, 16

3987

* - K0 = 2, 3, 4, 8, 16

3988

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

3989

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

3990

* The activation function is performed after the bias addition

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3991

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

3992

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

3993

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

3994

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

3995

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

3996

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

3997

*

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

3998

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

3999

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

4000

* @param[in] lhs_step_x lhs_stride_x * number of elements along X processed per workitem(in bytes)

4001

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

4002

* @param[in] lhs_step_y lhs_stride_y * number of elements along Y processed per workitem(in bytes)

4003

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

4004

* @param[in] rhs_ptr Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr

4005

* @param[in] rhs_stride_x Stride of the RHS matrix in X dimension (in bytes)

4006

* @param[in] rhs_step_x rhs_stride_x * number of elements along X processed per workitem(in bytes)

4007

* @param[in] rhs_stride_y Stride of the RHS matrix in Y dimension (in bytes)

4008

* @param[in] rhs_step_y rhs_stride_y * number of elements along Y processed per workitem(in bytes)

4009

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4010

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

4011

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4012

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

4013

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4014

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

4015

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

4016

* @param[out] dst_ptr Pointer to the destination matrix. Supported data type: same as @p lhs_ptr

4017

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

4018

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

4019

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

4020

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

4021

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

4022

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

4023

* @param[in] rhs_stride_z Stride of the RHS matrix in Z dimension (in bytes)

4024

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

4025

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

4026

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

4027

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4028

*/

4029

__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),

4030

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4031

#if defined(BETA)

4032

IMAGE_DECLARATION(bias),

4033

#endif // defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4034

IMAGE_DECLARATION(dst),

4035

uint lhs_stride_z,

4036

uint rhs_stride_z,

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4037

#if defined(BETA)

4038

uint bias_stride_z,

4039

#endif //defined(BETA)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4040

uint dst_stride_z

4041

#if defined(REINTERPRET_INPUT_AS_3D)

4042

,

4043

uint lhs_cross_plane_pad

4044

#endif // REINTERPRET_INPUT_AS_3D

4045

#if defined(REINTERPRET_OUTPUT_AS_3D)

4046

,

4047

uint dst_cross_plane_pad

4048

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

4053

4054

// RHS offset and step X

4055

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

4056

4057

uint x = get_global_id(0);

4058

uint y = get_global_id(1);

4059

uint z = get_global_id(2);

4060

4061

#if defined(DUMMY_WORK_ITEMS)

4062

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

4067

4068

// Compute LHS matrix address

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

4069

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4070

4071

// Compute RHS matrix address

4072

uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

4073

4074

#if defined(MATRIX_B_DEPTH)

4075

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4076

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

4077

#else // defined(MATRIX_B_DEPTH)

4078

rhs_offset += z * rhs_stride_z;

4079

#endif // defined(MATRIX_B_DEPTH)

4080

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4081

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

4082

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4083

4084

#if defined(REINTERPRET_INPUT_AS_3D)

4085

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

4086

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

4087

4088

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4089

// multiply lhs_stride_z by DEPTH_GEMM3D

4090

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

4091

4092

#else // defined(REINTERPRET_INPUT_AS_3D)

4093

4094

// Add offset for batched GEMM

4095

lhs_offset += z * lhs_stride_z;

4096

4097

#endif // defined(REINTERPRET_INPUT_AS_3D)

4098

4099

// Initialize the accumulators

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4100

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4101

4102

int i = 0;

4103

for(; i <= (K - K0); i += K0)

4104

{

4105

// Supported cases (M0, K0):

4106

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

4107

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

4108

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

4109

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

4110

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

4111

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

4112

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

4113

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

4114

// Load values from LHS matrix

4115

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

4116

4117

// Load values from RHS matrix

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4118

LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4119

4120

RHS_VFMA_M0xN0(0, a, b0, c);

4121

RHS_VFMA_M0xN0(1, a, b1, c);

4122

#if K0 > 2

4123

RHS_VFMA_M0xN0(2, a, b2, c);

4124

#endif // K0 > 2

4125

#if K0 > 3

4126

RHS_VFMA_M0xN0(3, a, b3, c);

4127

#endif // K0 > 3

4128

#if K0 > 4

4129

RHS_VFMA_M0xN0(4, a, b4, c);

4130

RHS_VFMA_M0xN0(5, a, b5, c);

4131

RHS_VFMA_M0xN0(6, a, b6, c);

4132

RHS_VFMA_M0xN0(7, a, b7, c);

4133

#endif // K0 > 4

4134

#if K0 > 8

4135

RHS_VFMA_M0xN0(8, a, b8, c);

4136

RHS_VFMA_M0xN0(9, a, b9, c);

Gian Marco Iodice

7b9d7ca

2019-09-19 16:37:39 +0100

[diff] [blame]

4137

RHS_VFMA_M0xN0(A, a, bA, c);

4138

RHS_VFMA_M0xN0(B, a, bB, c);

4139

RHS_VFMA_M0xN0(C, a, bC, c);

4140

RHS_VFMA_M0xN0(D, a, bD, c);

4141

RHS_VFMA_M0xN0(E, a, bE, c);

4142

RHS_VFMA_M0xN0(F, a, bF, c);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4143

#endif // K0 > 8

4144

4145

lhs_offset += K0 * sizeof(DATA_TYPE);

4146

rhs_offset += K0 * rhs_stride_y;

4147

}

4148

4149

// Left-over accumulations

4150

for(; i < K; ++i)

4151

{

4152

// Load values from LHS matrix

4153

VEC_DATA_TYPE(DATA_TYPE, 2)

4154

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));

4155

#if M0 > 1

4156

VEC_DATA_TYPE(DATA_TYPE, 2)

4157

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));

4158

#endif // M0 > 1

4159

#if M0 > 2

4160

VEC_DATA_TYPE(DATA_TYPE, 2)

4161

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));

4162

#endif // M0 > 2

4163

#if M0 > 3

4164

VEC_DATA_TYPE(DATA_TYPE, 2)

4165

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));

4166

#endif // M0 > 3

4167

#if M0 > 4

4168

VEC_DATA_TYPE(DATA_TYPE, 2)

4169

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));

4170

#endif // M0 > 4

4171

#if M0 > 5

4172

VEC_DATA_TYPE(DATA_TYPE, 2)

4173

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));

4174

#endif // M0 > 5

4175

#if M0 > 6

4176

VEC_DATA_TYPE(DATA_TYPE, 2)

4177

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));

4178

#endif // M0 > 6

4179

#if M0 > 7

4180

VEC_DATA_TYPE(DATA_TYPE, 2)

4181

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));

4182

#endif // M0 > 7

4183

4184

VEC_DATA_TYPE(DATA_TYPE, N0)

4185

b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));

4186

RHS_VFMA_M0xN0(0, a, b, c);

4187

4188

lhs_offset += sizeof(DATA_TYPE);

4189

rhs_offset += rhs_stride_y;

4190

}

4191

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

4192

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4193

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4194

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4195

4196

#if defined(REINTERPRET_OUTPUT_AS_3D)

4197

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

4198

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

4199

4200

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4201

// multiply dst_stride_z by DEPTH_GEMM3D

4202

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

4203

4204

#else // defined(REINTERPRET_OUTPUT_AS_3D)

4205

4206

// Add offset for batched GEMM

4207

dst_addr += z * dst_stride_z;

4208

4209

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4210

4211

// Multiply by the weight of matrix-matrix product and store the result

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4212

#if defined(ALPHA)

4213

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

4214

#endif // defined(ALPHA)

4215

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4216

// Add beta*bias

4217

#if defined(BETA)

4218

#if defined(BROADCAST_BIAS)

4219

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

4220

4221

LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

4222

4223

#ifndef UNIT_BETA

4224

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

4225

#endif // UNIT_BETA

4226

4227

// c = c + bias[broadcasted]

4228

ADD_BLOCK_BROADCAST(M0, c, bias0);

4229

4230

#else // defined(BROADCAST_BIAS)

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

4231

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

Gian Marco Iodice

2019-06-24 14:40:30 +0100

[diff] [blame]

4232

4233

LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

4234

4235

#ifndef UNIT_BETA

4236

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

4241

4242

#endif // defined(BROADCAST_BIAS)

4243

#endif // defined(BETA)

4244

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

4245

#if defined(ACTIVATION_TYPE)

4246

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, c, A_VAL, B_VAL);

4247

#endif // defined(ACTIVATION_TYPE)

4248

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4249

// Store output block

SiCong Li

2020-07-15 12:09:58 +0100

[diff] [blame]

4250

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, N, y, x);

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

4251

4252

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(DATA_TYPE)

4257

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4258

#if defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4259

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4260

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4261

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4262

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4263

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4264

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4265

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4266

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4267

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

4268

* The activation function is performed after the bias addition

4269

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4270

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4271

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4272

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4273

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4274

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4275

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

4276

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4277

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4278

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4279

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4280

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4281

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4282

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4283

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4284

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4285

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4286

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4287

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4288

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4289

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4290

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4291

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4292

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4293

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4294

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4295

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4296

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4297

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4298

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4299

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4300

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4301

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4302

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4303

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4304

*/

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4305

__kernel void gemm_mm_interleaved_transposed_f32(IMAGE_DECLARATION(src0),

4306

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4307

#if defined(BETA)

4308

IMAGE_DECLARATION(src2),

4309

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

4310

IMAGE_DECLARATION(dst),

4311

uint src0_stride_z,

4312

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4313

#if defined(BETA)

4314

uint src2_stride_z,

4315

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4316

uint dst_stride_z

4317

#if defined(REINTERPRET_OUTPUT_AS_3D)

4318

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4319

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4320

#endif // REINTERPRET_OUTPUT_AS_3D

4321

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4322

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4323

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4324

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4325

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4326

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4327

// Offset

4328

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4329

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4330

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4331

// src_addr_a = address of matrix A

4332

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4333

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4334

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4335

4336

#if defined(MATRIX_B_DEPTH)

4337

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4338

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4339

#else // defined(MATRIX_B_DEPTH)

4340

src1_addr_in_bytes += z * src1_stride_z;

4341

#endif // defined(MATRIX_B_DEPTH)

4342

4343

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

4344

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4345

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4346

// Compute end row address for matrix B

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4347

__global float *src_end_addr_b = src_addr_b + COLS_B;

4348

4349

src_addr_a += offset_row_a;

4350

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4351

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4352

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4357

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4358

for(; src_addr_b <= (src_end_addr_b - (int)(8 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4359

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4360

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4361

float4 a0 = vload4(0, src_addr_a);

4362

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4363

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4364

c0 += (float4)a0.s0 * b0;

4365

c1 += (float4)a0.s1 * b0;

4366

c2 += (float4)a0.s2 * b0;

4367

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4368

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4369

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4370

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

4371

b0 = vload4(0, src_addr_b + 4 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4372

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4373

c0 += (float4)a0.s0 * b0;

4374

c1 += (float4)a0.s1 * b0;

4375

c2 += (float4)a0.s2 * b0;

4376

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4377

}

4378

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4379

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4380

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4381

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4382

float4 a0 = vload4(0, src_addr_a);

4383

float4 b0 = vload4(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4384

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4385

c0 += (float4)a0.s0 * b0;

4386

c1 += (float4)a0.s1 * b0;

4387

c2 += (float4)a0.s2 * b0;

4388

c3 += (float4)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4389

}

4390

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4391

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4392

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4393

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4394

// Compute dst address

4395

__global uchar *dst_addr = offset(&dst, 0, 0);

4396

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4397

uint4 zout = 0;

4398

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4399

#if defined(REINTERPRET_OUTPUT_AS_3D)

4400

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4401

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4402

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4407

// |******************|

4408

// | cross_plane_pad |

4409

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4414

4415

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4416

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4417

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4418

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4419

// Add offset due to the cross plane paddings

4420

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4421

4422

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4423

// multiply dst_stride_z by DEPTH_GEMM3D

4424

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4425

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4426

// Add offset for batched GEMM

4427

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4428

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4429

4430

// Multiply by the weight of matrix-matrix product and store the result

4431

#if defined(ALPHA)

4432

SCALE_BLOCK(4, float, c, ALPHA);

4433

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4438

4439

#if defined(BROADCAST_BIAS)

4440

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4441

4442

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4443

4444

#ifndef UNIT_BETA

4445

SCALE_BLOCK(1, float, bias, BETA);

4446

#endif // UNIT_BETA

4447

4448

// c = c + bias[broadcasted]

4449

ADD_BLOCK_BROADCAST(4, c, bias0);

4450

4451

#else // defined(BROADCAST_BIAS)

4452

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

4453

2) * src2_stride_z;

4454

4455

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4456

4457

#ifndef UNIT_BETA

4458

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

4463

4464

#endif // defined(BROADCAST_BIAS)

4465

#endif // defined(BETA)

4466

4467

#if defined(ACTIVATION_TYPE)

4468

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

4469

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4470

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4471

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4472

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

4473

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

4474

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

4475

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4476

}

4477

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4478

/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4479

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4480

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4481

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4482

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4483

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4484

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4485

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4486

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4487

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

4488

* The activation function is performed after the bias addition

4489

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4490

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4491

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4492

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4493

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4494

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4495

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

4496

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4497

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4498

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4499

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4500

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4501

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4502

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4503

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4504

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4505

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4506

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4507

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4508

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4509

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4510

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4511

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4512

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4513

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4514

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4515

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4516

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4517

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4518

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4519

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4520

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4521

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4522

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4523

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4524

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4525

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

4526

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4527

#if defined(BETA)

4528

IMAGE_DECLARATION(src2),

4529

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4530

IMAGE_DECLARATION(dst),

4531

uint src0_stride_z,

4532

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4533

#if defined(BETA)

4534

uint src2_stride_z,

4535

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4536

uint dst_stride_z

4537

#if defined(REINTERPRET_OUTPUT_AS_3D)

4538

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4539

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4540

#endif // REINTERPRET_OUTPUT_AS_3D

4541

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4542

{

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4543

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4544

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4545

int z = get_global_id(2);

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4546

4547

// Offset

4548

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4549

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 4;

4550

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4551

// src_addr_a = address of matrix A

4552

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4553

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4554

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4555

4556

#if defined(MATRIX_B_DEPTH)

4557

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4558

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4559

#else // defined(MATRIX_B_DEPTH)

4560

src1_addr_in_bytes += z * src1_stride_z;

4561

#endif // defined(MATRIX_B_DEPTH)

4562

4563

__global float *src_addr_a = (__global float *)(src0_ptr + src0_addr_in_bytes);

4564

__global float *src_addr_b = (__global float *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4565

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4566

src_addr_a += offset_row_a;

4567

src_addr_b += offset_row_b;

4568

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4569

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float4 c0 = 0.0f;

float4 c1 = 0.0f;

float4 c2 = 0.0f;

float4 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4574

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4575

#define COLS_MTX_B (COLS_B / (4 * MULT_TRANSPOSE1XW_WIDTH))

4576

4577

int i = 0;

4578

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4579

{

4580

// Load values from matrix A (interleaved) and matrix B (transposed)

4581

float4 a0 = vload4(0, src_addr_a);

4582

float4 b0 = vload4(0, src_addr_b);

4583

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4584

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4585

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4586

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4587

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4588

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4589

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4590

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4591

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4592

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4593

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4594

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4595

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4596

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4597

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4598

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4599

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4600

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4601

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4602

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4603

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4604

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4605

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4606

4607

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4608

a0 = vload4(0, src_addr_a);

4609

b0 = vload4(0, src_addr_b);

4610

4611

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4612

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4613

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4614

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4615

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4616

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4617

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4618

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4619

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4620

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4621

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4622

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4623

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4624

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4625

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4626

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4627

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4628

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4629

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4630

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4631

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4632

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4633

4634

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4635

a0 = vload4(0, src_addr_a);

4636

b0 = vload4(0, src_addr_b);

4637

4638

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4639

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

4640

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4641

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4642

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4643

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4644

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4645

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4646

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4647

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4648

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4649

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4650

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4651

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4652

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4653

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4654

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4655

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4656

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4657

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4658

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4659

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4660

4661

// Load values from matrix A (interleaved) and matrix B (transposed)

4662

a0 = vload4(0, src_addr_a);

4663

b0 = vload4(0, src_addr_b);

4664

4665

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4666

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4667

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4668

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4669

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4670

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4671

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4672

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4673

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4674

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4675

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4676

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4677

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4678

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4679

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4680

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4681

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4682

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4683

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4684

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4685

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4686

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4687

}

4688

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4689

for(; i < (int)(COLS_MTX_B); ++i)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4690

{

4691

// Load values from matrix A (interleaved) and matrix B (transposed)

4692

float4 a0 = vload4(0, src_addr_a);

4693

float4 b0 = vload4(0, src_addr_b);

4694

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

4695

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

4696

src_addr_b += 4 * MULT_TRANSPOSE1XW_WIDTH;

4697

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4698

c0.s0 = fma(a0.s0, b0.s0, c0.s0);

4699

c0.s1 = fma(a0.s0, b0.s1, c0.s1);

4700

c0.s2 = fma(a0.s0, b0.s2, c0.s2);

4701

c0.s3 = fma(a0.s0, b0.s3, c0.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4702

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4703

c1.s0 = fma(a0.s1, b0.s0, c1.s0);

4704

c1.s1 = fma(a0.s1, b0.s1, c1.s1);

4705

c1.s2 = fma(a0.s1, b0.s2, c1.s2);

4706

c1.s3 = fma(a0.s1, b0.s3, c1.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4707

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4708

c2.s0 = fma(a0.s2, b0.s0, c2.s0);

4709

c2.s1 = fma(a0.s2, b0.s1, c2.s1);

4710

c2.s2 = fma(a0.s2, b0.s2, c2.s2);

4711

c2.s3 = fma(a0.s2, b0.s3, c2.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4712

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4713

c3.s0 = fma(a0.s3, b0.s0, c3.s0);

4714

c3.s1 = fma(a0.s3, b0.s1, c3.s1);

4715

c3.s2 = fma(a0.s3, b0.s2, c3.s2);

4716

c3.s3 = fma(a0.s3, b0.s3, c3.s3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4717

}

4718

4719

// Compute destination address

4720

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4721

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4722

// Compute dst address

4723

__global uchar *dst_addr = offset(&dst, 0, 0);

4724

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4725

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4726

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4727

#if defined(REINTERPRET_OUTPUT_AS_3D)

4728

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4729

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4730

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4735

// |******************|

4736

// | cross_plane_pad |

4737

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4742

4743

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4744

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4745

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4746

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4747

// Add offset due to the cross plane paddings

4748

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4749

4750

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4751

// multiply dst_stride_z by DEPTH_GEMM3D

4752

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4753

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4754

// Add offset for batched GEMM

4755

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4756

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4757

4758

// Multiply by the weight of matrix-matrix product and store the result

4759

#if defined(ALPHA)

4760

SCALE_BLOCK(4, float, c, ALPHA);

4761

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4766

4767

#if defined(BROADCAST_BIAS)

4768

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

4769

4770

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4771

4772

#ifndef UNIT_BETA

4773

SCALE_BLOCK(1, float, bias, BETA);

4774

#endif // UNIT_BETA

4775

4776

// c = c + bias[broadcasted]

4777

ADD_BLOCK_BROADCAST(4, c, bias0);

4778

4779

#else // defined(BROADCAST_BIAS)

4780

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

4781

2) * src2_stride_z;

4782

4783

LOAD_BLOCK(4, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

4784

4785

#ifndef UNIT_BETA

4786

SCALE_BLOCK(4, float, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

4791

4792

#endif // defined(BROADCAST_BIAS)

4793

#endif // defined(BETA)

4794

4795

#if defined(ACTIVATION_TYPE)

4796

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, float, c, A_VAL, B_VAL);

4797

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4798

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4799

// Store 4x4 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4800

vstore4(c0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

4801

vstore4(c1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

4802

vstore4(c2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

4803

vstore4(c3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4804

}

4805

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

4806

// Undefine local defines

4807

#undef COLS_MTX_B

4808

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

4809

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4810

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

4811

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

4812

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4813

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

4814

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

4815

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

4816

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4817

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4818

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

4819

* The activation function is performed after the bias addition

4820

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4821

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

4822

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

4823

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

4824

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

4825

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4826

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

4827

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

4828

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

4829

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

4830

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

4831

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4832

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4833

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

4834

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

4835

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

4836

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

4837

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4838

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

4839

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

4840

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

4841

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

4842

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

4843

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

4844

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4845

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4846

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4847

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4848

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4849

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4850

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

4851

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4852

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4853

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

4854

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4855

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

4856

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

4857

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4858

#if defined(BETA)

4859

IMAGE_DECLARATION(src2),

4860

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4861

IMAGE_DECLARATION(dst),

4862

uint src0_stride_z,

4863

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4864

#if defined(BETA)

4865

uint src2_stride_z,

4866

#endif // defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4867

uint dst_stride_z

4868

#if defined(REINTERPRET_OUTPUT_AS_3D)

4869

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4870

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4871

#endif // REINTERPRET_OUTPUT_AS_3D

4872

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4873

{
// Each work-item accumulates a 4x8 tile (c0..c3, one half8 per row) and stores it to dst.

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4874

// x / y pick the row of src1 / src0 to read (they scale src1_stride_y / src0_stride_y below);
// z scales the Z strides of src0, src1 and dst.
int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

4875

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4876

int z = get_global_id(2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4877

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4878

// Offset (in half elements) of this work-item's first vector inside the selected src0 / src1 row

4879

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

4880

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4881

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4882

// src_addr_a = address of matrix A

4883

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

4884

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

4885

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

4886

4887

#if defined(MATRIX_B_DEPTH)

4888

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

4889

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

4890

#else // defined(MATRIX_B_DEPTH)

4891

src1_addr_in_bytes += z * src1_stride_z;

4892

#endif // defined(MATRIX_B_DEPTH)

4893

4894

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

4895

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4896

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4897

// Compute end row address for matrix B (taken before offset_row_b is applied)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4898

__global half *src_end_addr_b = src_addr_b + COLS_B;

4899

4900

src_addr_a += offset_row_a;

4901

src_addr_b += offset_row_b;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4902

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4903

// Reset accumulators (4 rows x 8 columns, accumulated in half precision)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4908

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4909

// Main loop, unrolled by 2: every iteration reads two half4 vectors of A and two half8 vectors of B
for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4910

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4911

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4912

half4 a0 = vload4(0, src_addr_a);

4913

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4914

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4915

// Outer-product step: row i of the tile += a0.s<i> (broadcast to 8 lanes) * b0
c0 += (half8)a0.s0 * b0;

4916

c1 += (half8)a0.s1 * b0;

4917

c2 += (half8)a0.s2 * b0;

4918

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4919

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4920

// Load the next values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4921

a0 = vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT);

4922

b0 = vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4923

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4924

c0 += (half8)a0.s0 * b0;

4925

c1 += (half8)a0.s1 * b0;

4926

c2 += (half8)a0.s2 * b0;

4927

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4928

}

4929

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4930

// Left-over loop: one half4 of A and one half8 of B per iteration
for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4931

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4932

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

4933

half4 a0 = vload4(0, src_addr_a);

4934

half8 b0 = vload8(0, src_addr_b);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4935

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4936

c0 += (half8)a0.s0 * b0;

4937

c1 += (half8)a0.s1 * b0;

4938

c2 += (half8)a0.s2 * b0;

4939

c3 += (half8)a0.s3 * b0;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4940

}

4941

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

4942

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

4943

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

4944

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4945

// Compute dst address

4946

__global uchar *dst_addr = offset(&dst, 0, 0);

4947

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4948

// Per-row byte offset added to dst_addr at store time; remains 0 unless REINTERPRET_OUTPUT_AS_3D is defined
uint4 zout = 0;

4949

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4950

#if defined(REINTERPRET_OUTPUT_AS_3D)

4951

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4952

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4953

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

4958

// |******************|

4959

// | cross_plane_pad |

4960

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4965

4966

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4967

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

4968

// Clamp the plane index to DEPTH_GEMM3D - 1
zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4969

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

4970

// Add offset due to the cross plane paddings

4971

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4972

4973

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

4974

// multiply dst_stride_z by DEPTH_GEMM3D

4975

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

4976

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

4977

// Add offset for batched GEMM

4978

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

4979

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

4980

4981

// Multiply by the weight of matrix-matrix product (ALPHA), if provided

4982

#if defined(ALPHA)

4983

SCALE_BLOCK(4, half, c, ALPHA);

4984

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

4989

4990

#if defined(BROADCAST_BIAS)

4991

// Broadcast case: only the X offset of this work-item (8 half values) is applied to the bias address
__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

4992

4993

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

4994

4995

#ifndef UNIT_BETA

4996

SCALE_BLOCK(1, half, bias, BETA);

4997

#endif // UNIT_BETA

4998

4999

// c = c + bias[broadcasted]

5000

ADD_BLOCK_BROADCAST(4, c, bias0);

5001

5002

#else // defined(BROADCAST_BIAS)

5003

5004

// Non-broadcast case: the bias address is offset in X (8 half values), Y (4 rows) and Z (get_global_id(2))
__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5005

2) * src2_stride_z;

5006

5007

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5008

5009

#ifndef UNIT_BETA

5010

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(4, c, bias);

5015

5016

#endif // defined(BROADCAST_BIAS)

5017

#endif // defined(BETA)

5018

5019

#if defined(ACTIVATION_TYPE)

5020

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

5021

#endif // defined(ACTIVATION_TYPE)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5022

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5023

// Store 4x8 block: one half8 row per vstore8, each row shifted by its own zout byte offset

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5024

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5025

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5026

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5027

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5028

}

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5029

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5030

/** This OpenCL kernel computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1) while accumulating the result in 32-bit floating point variables.

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5031

*

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5032

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5033

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

5034

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

5035

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5036

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5037

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5038

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5039

* The activation function is performed after the bias addition

5040

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5041

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5042

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5043

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5044

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5045

*

5046

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5047

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5048

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem(in bytes)

5049

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5050

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem(in bytes)

5051

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5052

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5053

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5054

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem(in bytes)

5055

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5056

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem(in bytes)

5057

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5058

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5059

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5060

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5061

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5062

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5063

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5064

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

5065

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5066

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5067

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5068

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5069

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

5070

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5071

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5072

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5073

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

5074

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

5075

*/

5076

__kernel void gemm_mm_interleaved_transposed_f16_acc32(IMAGE_DECLARATION(src0),

5077

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5078

#if defined(BETA)

5079

IMAGE_DECLARATION(src2),

5080

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5081

IMAGE_DECLARATION(dst),

5082

uint src0_stride_z,

5083

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5084

#if defined(BETA)

5085

uint src2_stride_z,

5086

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5087

uint dst_stride_z

5088

#if defined(REINTERPRET_OUTPUT_AS_3D)

5089

,

5090

uint cross_plane_pad

5091

#endif // REINTERPRET_OUTPUT_AS_3D

5092

)

5093

{

5094

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

5095

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

5096

int z = get_global_id(2);

5097

5098

// Offset

5099

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

5100

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

5101

5102

// src_addr_a = address of matrix A

5103

// src_addr_b = address of matrix B

5104

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

5105

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

5106

5107

#if defined(MATRIX_B_DEPTH)

5108

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5109

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

5110

#else // defined(MATRIX_B_DEPTH)

5111

src1_addr_in_bytes += z * src1_stride_z;

5112

#endif // defined(MATRIX_B_DEPTH)

5113

5114

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

5115

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

5116

5117

// Compute end row address for matrix B

5118

__global half *src_end_addr_b = src_addr_b + COLS_B;

5119

5120

src_addr_a += offset_row_a;

5121

src_addr_b += offset_row_b;

5122

5123

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

float8 c0 = 0.0f;

float8 c1 = 0.0f;

float8 c2 = 0.0f;

float8 c3 = 0.0f;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5128

5129

for(; src_addr_b <= (src_end_addr_b - (int)(16 * MULT_TRANSPOSE1XW_WIDTH)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 16 * MULT_TRANSPOSE1XW_WIDTH)

5130

{

5131

// Load values from matrix A (interleaved) and matrix B (transposed)

5132

float4 a0 = convert_float4(vload4(0, src_addr_a));

5133

float8 b0 = convert_float8(vload8(0, src_addr_b));

5134

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5135

c0 += (float8)a0.s0 * b0;

5136

c1 += (float8)a0.s1 * b0;

5137

c2 += (float8)a0.s2 * b0;

5138

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5139

5140

// Load values from matrix A (interleaved) and matrix B (transposed)

5141

a0 = convert_float4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));

5142

b0 = convert_float8(vload8(0, src_addr_b + 8 * MULT_TRANSPOSE1XW_WIDTH));

5143

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5144

c0 += (float8)a0.s0 * b0;

5145

c1 += (float8)a0.s1 * b0;

5146

c2 += (float8)a0.s2 * b0;

5147

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5148

}

5149

5150

for(; src_addr_b < src_end_addr_b; src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH)

5151

{

5152

// Load values from matrix A (interleaved) and matrix B (transposed)

5153

float4 a0 = convert_float4(vload4(0, src_addr_a));

5154

float8 b0 = convert_float8(vload8(0, src_addr_b));

5155

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5156

c0 += (float8)a0.s0 * b0;

5157

c1 += (float8)a0.s1 * b0;

5158

c2 += (float8)a0.s2 * b0;

5159

c3 += (float8)a0.s3 * b0;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5160

}

5161

5162

// Compute destination address

5163

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5164

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5165

// Compute dst address

5166

__global uchar *dst_addr = offset(&dst, 0, 0);

5167

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5168

uint4 zout = 0;

5169

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5170

#if defined(REINTERPRET_OUTPUT_AS_3D)

5171

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

5172

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5178

// |******************|

5179

// | cross_plane_pad |

5180

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5185

5186

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5187

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

5188

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5189

5190

// Add offset due to the cross plane paddings

5191

zout *= (cross_plane_pad * dst_stride_y);

5192

5193

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5194

// multiply dst_stride_z by DEPTH_GEMM3D

5195

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5196

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5197

// Add offset for batched GEMM

5198

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5199

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5200

5201

// Multiply by the weight of matrix-matrix product and store the result

5202

#if defined(ALPHA)

5203

SCALE_BLOCK(4, float, c, ALPHA);

5204

#endif // defined(ALPHA)

5205

5206

#if defined(BETA)

5207

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5208

5209

#if defined(BROADCAST_BIAS)

5210

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5211

5212

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5213

5214

float8 bias_f0 = convert_float8(bias0);

5215

5216

#ifndef UNIT_BETA

5217

SCALE_BLOCK(1, float, bias_f, BETA);

5218

#endif // UNIT_BIAS

5219

5220

// c = c + bias[broadcasted]

5221

ADD_BLOCK_BROADCAST(4, c, bias_f0);

5222

5223

#else // defined(BROADCAST_BIAS)

5224

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5225

2) * src2_stride_z;

5226

5227

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5228

5229

float8 bias_f0 = convert_float8(bias0);

5230

float8 bias_f1 = convert_float8(bias1);

5231

float8 bias_f2 = convert_float8(bias2);

5232

float8 bias_f3 = convert_float8(bias3);

5233

5234

#ifndef UNIT_BETA

5235

SCALE_BLOCK(4, float, bias_f, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias_f);

5240

5241

#endif // defined(BROADCAST_BIAS)

5242

#endif // defined(BETA)

5243

5244

half8 c_h0 = convert_half8(c0);

5245

half8 c_h1 = convert_half8(c1);

5246

half8 c_h2 = convert_half8(c2);

5247

half8 c_h3 = convert_half8(c3);

5248

5249

#if defined(ACTIVATION_TYPE)

5250

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c_h, A_VAL, B_VAL);

5251

#endif // defined(ACTIVATION_TYPE)

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5252

5253

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5254

vstore8(c_h0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5255

vstore8(c_h1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5256

vstore8(c_h2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5257

vstore8(c_h3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Vidhya Sudhan Loganathan

2018-11-20 15:38:13 +0000

[diff] [blame]

5258

}

5259

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5260

/** This OpenCL kernel optimized for Bifrost architectures computes the matrix multiplication between matrix A reshaped (src0) and matrix B reshaped (src1)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5261

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5262

* @note The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5263

* @note The multiplication factor for the transposition width (mult_transpose1xW_width) must be passed at compile time using -DMULT_TRANSPOSE1XW_WIDTH (e.g. -DMULT_TRANSPOSE1XW_WIDTH=2)

5264

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)

5265

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5266

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5267

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5268

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

5269

* The activation function is performed after the bias addition

5270

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5271

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5272

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5273

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5274

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5275

*

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5276

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

5277

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5278

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5279

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5280

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5281

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5282

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5283

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5284

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5285

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5286

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5287

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5288

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

5289

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5290

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5291

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5292

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5293

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5294

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

5295

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5296

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5297

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5298

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5299

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5300

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5301

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

5302

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5303

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5304

*/

5305

__kernel void gemm_mm_interleaved_transposed_f16_bifrost(IMAGE_DECLARATION(src0),

5306

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5307

#if defined(BETA)

5308

IMAGE_DECLARATION(src2),

5309

#endif // defined(BETA)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5310

IMAGE_DECLARATION(dst),

5311

uint src0_stride_z,

5312

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5313

#if defined(BETA)

5314

uint src2_stride_z,

5315

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5316

uint dst_stride_z

5317

#if defined(REINTERPRET_OUTPUT_AS_3D)

5318

,

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5319

uint cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5320

#endif // REINTERPRET_OUTPUT_AS_3D

5321

)

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5322

{

5323

int x = get_global_id(0) / MULT_TRANSPOSE1XW_WIDTH;

5324

int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

5325

int z = get_global_id(2);

5326

5327

// Offset

5328

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

5329

const int offset_row_b = (get_global_id(0) % MULT_TRANSPOSE1XW_WIDTH) * 8;

5330

5331

// src_addr_a = address of matrix A

5332

// src_addr_b = address of matrix B

5333

int src0_addr_in_bytes = z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes;

5334

int src1_addr_in_bytes = x * src1_stride_y + src1_offset_first_element_in_bytes;

5335

5336

#if defined(MATRIX_B_DEPTH)

5337

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5338

src1_addr_in_bytes += (z % MATRIX_B_DEPTH) * src1_stride_z;

5339

#else // defined(MATRIX_B_DEPTH)

5340

src1_addr_in_bytes += z * src1_stride_z;

5341

#endif // defined(MATRIX_B_DEPTH)

5342

5343

__global half *src_addr_a = (__global half *)(src0_ptr + src0_addr_in_bytes);

5344

__global half *src_addr_b = (__global half *)(src1_ptr + src1_addr_in_bytes);

5345

5346

// Compute end row address for matrix B

5347

__global half *src_end_addr_b = src_addr_b + COLS_B;

5348

5349

src_addr_a += offset_row_a;

5350

src_addr_b += offset_row_b;

5351

5352

// Reset accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

half8 c0 = 0.0f;

half8 c1 = 0.0f;

half8 c2 = 0.0f;

half8 c3 = 0.0f;

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5357

5358

#define COLS_MTX_B (COLS_B / (8 * MULT_TRANSPOSE1XW_WIDTH))

5359

5360

int i = 0;

5361

for(; i <= (int)(COLS_MTX_B - 4); i += 4)

5362

{

5363

#if MULT_INTERLEAVE4X4_HEIGHT == 1

5364

// Load values from matrix A (interleaved) and matrix B (transposed)

5365

half8 a0 = vload8(0, src_addr_a);

5366

half8 b0 = vload8(0, src_addr_b);

5367

5368

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

5369

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5370

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5371

c0 = fma((half8)a0.s0, b0, c0);

5372

c1 = fma((half8)a0.s1, b0, c1);

5373

c2 = fma((half8)a0.s2, b0, c2);

5374

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5375

5376

// Load values from matrix B (transposed)

5377

b0 = vload8(0, src_addr_b);

5378

5379

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5380

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5381

c0 = fma((half8)a0.s4, b0, c0);

5382

c1 = fma((half8)a0.s5, b0, c1);

5383

c2 = fma((half8)a0.s6, b0, c2);

5384

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5385

5386

// Load values from matrix A (interleaved) and matrix B (transposed)

5387

a0 = vload8(0, src_addr_a);

5388

b0 = vload8(0, src_addr_b);

5389

5390

src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT;

5391

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5392

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5393

c0 = fma((half8)a0.s0, b0, c0);

5394

c1 = fma((half8)a0.s1, b0, c1);

5395

c2 = fma((half8)a0.s2, b0, c2);

5396

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5397

5398

// Load values from matrix B (transposed)

5399

b0 = vload8(0, src_addr_b);

5400

5401

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5402

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5403

c0 = fma((half8)a0.s4, b0, c0);

5404

c1 = fma((half8)a0.s5, b0, c1);

5405

c2 = fma((half8)a0.s6, b0, c2);

5406

c3 = fma((half8)a0.s7, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5407

#else // MULT_INTERLEAVE4X4_HEIGHT == 1

5408

// Load values from matrix A (interleaved) and matrix B (transposed)

5409

half4 a0 = vload4(0, src_addr_a);

5410

half8 b0 = vload8(0, src_addr_b);

5411

5412

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5413

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5414

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5415

c0 = fma((half8)a0.s0, b0, c0);

5416

c1 = fma((half8)a0.s1, b0, c1);

5417

c2 = fma((half8)a0.s2, b0, c2);

5418

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5419

5420

// Load values from matrix A (interleaved) and matrix B (transposed)

5421

a0 = vload4(0, src_addr_a);

5422

b0 = vload8(0, src_addr_b);

5423

5424

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5425

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5426

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5427

c0 = fma((half8)a0.s0, b0, c0);

5428

c1 = fma((half8)a0.s1, b0, c1);

5429

c2 = fma((half8)a0.s2, b0, c2);

5430

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5431

5432

// Load values from matrix A (interleaved) and matrix B (transposed)

5433

a0 = vload4(0, src_addr_a);

5434

b0 = vload8(0, src_addr_b);

5435

5436

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5437

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5438

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5439

c0 = fma((half8)a0.s0, b0, c0);

5440

c1 = fma((half8)a0.s1, b0, c1);

5441

c2 = fma((half8)a0.s2, b0, c2);

5442

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5443

5444

// Load values from matrix A (interleaved) and matrix B (transposed)

5445

a0 = vload4(0, src_addr_a);

5446

b0 = vload8(0, src_addr_b);

5447

5448

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5449

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5450

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5451

c0 = fma((half8)a0.s0, b0, c0);

5452

c1 = fma((half8)a0.s1, b0, c1);

5453

c2 = fma((half8)a0.s2, b0, c2);

5454

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5455

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

5456

}

5457

5458

for(; i < (int)(COLS_MTX_B); ++i)

5459

{

5460

// Load values from matrix A (interleaved) and matrix B (transposed)

5461

half4 a0 = vload4(0, src_addr_a);

5462

half8 b0 = vload8(0, src_addr_b);

5463

5464

src_addr_a += 4 * MULT_INTERLEAVE4X4_HEIGHT;

5465

src_addr_b += 8 * MULT_TRANSPOSE1XW_WIDTH;

5466

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5467

c0 = fma((half8)a0.s0, b0, c0);

5468

c1 = fma((half8)a0.s1, b0, c1);

5469

c2 = fma((half8)a0.s2, b0, c2);

5470

c3 = fma((half8)a0.s3, b0, c3);

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5471

}

5472

5473

// Compute destination address

5474

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5475

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5476

// Compute dst address

5477

__global uchar *dst_addr = offset(&dst, 0, 0);

5478

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5479

uint4 zout = 0;

5480

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5481

#if defined(REINTERPRET_OUTPUT_AS_3D)

5482

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5483

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5484

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5489

// |******************|

5490

// | cross_plane_pad |

5491

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5496

5497

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5498

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

5499

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5500

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5501

// Add offset due to the cross plane paddings

5502

zout *= (cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5503

5504

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5505

// multiply dst_stride_z by DEPTH_GEMM3D

5506

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5507

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5508

// Add offset for batched GEMM

5509

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5510

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5511

5512

// Multiply by the weight of matrix-matrix product and store the result

5513

#if defined(ALPHA)

5514

SCALE_BLOCK(4, half, c, ALPHA);

5515

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(4, uint, zero, 0);

5520

5521

#if defined(BROADCAST_BIAS)

5522

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

5523

5524

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5525

5526

#ifndef UNIT_BETA

5527

SCALE_BLOCK(1, half, bias, BETA);

5528

#endif // UNIT_BIAS

5529

5530

// c = c + bias[broadcasted]

5531

ADD_BLOCK_BROADCAST(4, c, bias0);

5532

5533

#else // defined(BROADCAST_BIAS)

5534

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) * (uint)4 * src2_stride_y) + get_global_id(

5535

2) * src2_stride_z;

5536

5537

LOAD_BLOCK(4, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

5538

5539

#ifndef UNIT_BETA

5540

SCALE_BLOCK(4, half, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(4, c, bias);

5545

5546

#endif // defined(BROADCAST_BIAS)

5547

#endif // defined(BETA)

5548

5549

#if defined(ACTIVATION_TYPE)

5550

ACTIVATION_BLOCK(4, ACTIVATION_TYPE, half, c, A_VAL, B_VAL);

5551

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5552

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5553

// Store 4x8 block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5554

vstore8(c0, 0, (__global half *)(dst_addr + 0 * dst_stride_y + zout.s0));

5555

vstore8(c1, 0, (__global half *)(dst_addr + 1 * dst_stride_y + zout.s1));

5556

vstore8(c2, 0, (__global half *)(dst_addr + 2 * dst_stride_y + zout.s2));

5557

vstore8(c3, 0, (__global half *)(dst_addr + 3 * dst_stride_y + zout.s3));

Gian Marco Iodice

2018-04-19 12:05:08 +0100

[diff] [blame]

5558

}

Georgios Pinitas

8422558

2018-05-14 12:00:05 +0100

[diff] [blame]

5559

5560

// Undefine local defines

5561

#undef COLS_MTX_B

5562

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

5563

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5564

Gian Marco

2018-01-12 10:21:40 +0000

[diff] [blame]

5565

#endif // defined(COLS_B) && defined(MULT_TRANSPOSE1XW_WIDTH) && defined(MULT_INTERLEAVE4X4_HEIGHT)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5566

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5567

#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

5568

#if defined(DATA_TYPE)

5569

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

5570

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.

5571

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5572

* @note This OpenCL kernel works with floating point data types (F16/F32)

5573

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

5574

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5575

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5576

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5577

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5578

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5579

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

5580

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5581

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5582

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5583

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5584

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5585

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5586

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

5587

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5588

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5589

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5590

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5591

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5592

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5593

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5594

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5595

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5596

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

5597

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5598

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

5599

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5600

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5601

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5602

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5603

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5604

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5605

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5606

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5607

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5608

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

5609

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5610

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

5611

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5612

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5613

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5614

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5615

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5616

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5617

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5618

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5619

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

5620

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5621

#if defined(BETA)

5622

IMAGE_DECLARATION(src2),

5623

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5624

IMAGE_DECLARATION(dst),

5625

uint src0_stride_z,

5626

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5627

#if defined(BETA)

5628

uint src2_stride_z,

5629

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5630

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5631

#if defined(REINTERPRET_INPUT_AS_3D)

5632

,

5633

uint src_cross_plane_pad

5634

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5635

#if defined(REINTERPRET_OUTPUT_AS_3D)

5636

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5637

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5638

#endif // REINTERPRET_OUTPUT_AS_3D

5639

)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5640

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5641

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5642

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5643

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5644

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5645

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5646

// Update address for the matrix A

5647

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5648

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5649

// Update address for the matrix B

5650

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5651

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5652

#if defined(REINTERPRET_INPUT_AS_3D)

5653

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5654

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5660

// |******************|

5661

// | cross_plane_pad |

5662

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5667

5668

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5669

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5670

zin = min(DEPTH_GEMM3D - 1, zin);

5671

5672

// Add offset due to the cross plane paddings

5673

zin *= (src_cross_plane_pad * src0_stride_y);

5674

5675

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5676

// multiply src0_stride_z by DEPTH_GEMM3D

5677

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5678

5679

#else // defined(REINTERPRET_INPUT_AS_3D)

5680

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5681

// Add offset for batched GEMM

5682

src_addr.s0 += get_global_id(2) * src0_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5683

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5684

#endif // defined(REINTERPRET_INPUT_AS_3D)

5685

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5686

#if defined(MATRIX_B_DEPTH)

5687

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

5688

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

5689

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5690

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5691

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5692

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5693

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

5694

5695

VECTOR_TYPE acc0 = 0.0f;

5696

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5697

VECTOR_TYPE acc1 = 0.0f;

5698

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5699

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5700

VECTOR_TYPE acc2 = 0.0f;

5701

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5702

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5703

VECTOR_TYPE acc3 = 0.0f;

5704

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5705

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

5706

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5707

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5708

#if defined(REINTERPRET_INPUT_AS_3D)

5709

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5710

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, DATA_TYPE, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

5711

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5712

// Load values from matrix A

5713

VEC_DATA_TYPE(DATA_TYPE, 2)

5714

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5715

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5716

VEC_DATA_TYPE(DATA_TYPE, 2)

5717

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5718

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5719

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5720

VEC_DATA_TYPE(DATA_TYPE, 2)

5721

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5722

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5723

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5724

VEC_DATA_TYPE(DATA_TYPE, 2)

5725

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5726

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5727

#endif // defined(REINTERPRET_INPUT_AS_3D)

5728

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5729

// Load values from matrix B

5730

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

5731

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5732

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5733

// Accumulate

5734

acc0 += b0 * (VECTOR_TYPE)a0.s0;

5735

acc0 += b1 * (VECTOR_TYPE)a0.s1;

5736

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5737

acc1 += b0 * (VECTOR_TYPE)a1.s0;

5738

acc1 += b1 * (VECTOR_TYPE)a1.s1;

5739

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5740

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5741

acc2 += b0 * (VECTOR_TYPE)a2.s0;

5742

acc2 += b1 * (VECTOR_TYPE)a2.s1;

5743

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5744

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5745

acc3 += b0 * (VECTOR_TYPE)a3.s0;

5746

acc3 += b1 * (VECTOR_TYPE)a3.s1;

5747

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5748

}

5749

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5750

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5751

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5752

#if defined(REINTERPRET_INPUT_AS_3D)

5753

// Load values from matrix A

5754

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

5755

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5756

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

5757

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5758

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5759

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

5760

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5761

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5762

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

5763

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5764

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5765

// Load values from matrix A

5766

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

5767

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5768

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

5769

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5770

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5771

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

5772

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5773

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5774

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

5775

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5776

#endif // defined(REINTERPRET_INPUT_AS_3D)

5777

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5778

// Load values from matrix B

5779

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5780

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5781

// Accumulate

5782

acc0 += b0 * (VECTOR_TYPE)a0;

5783

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5784

acc1 += b0 * (VECTOR_TYPE)a1;

5785

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

5786

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5787

acc2 += b0 * (VECTOR_TYPE)a2;

5788

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

5789

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

5790

acc3 += b0 * (VECTOR_TYPE)a3;

5791

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5792

}

5793

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5794

int z = get_global_id(2);

5795

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5796

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5797

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

5798

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5799

// Compute dst address

5800

__global uchar *dst_addr = offset(&dst, 0, 0);

5801

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5802

uint4 zout = 0;

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5803

5804

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5805

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5806

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5807

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5808

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

5813

// |******************|

5814

// | cross_plane_pad |

5815

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5820

5821

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5822

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5823

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5824

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

5825

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5826

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5827

5828

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5829

// multiply dst_stride_z by DEPTH_GEMM3D

5830

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5831

#else // defined(REINTERPRET_OUTPUT_AS_3D)

5832

// Add offset for batched GEMM

5833

dst_addr += z * dst_stride_z;

5834

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

5835

5836

// Multiply by the weight of matrix-matrix product and store the result

5837

#if defined(ALPHA)

5838

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, acc, ALPHA);

5839

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

5844

5845

#if defined(BROADCAST_BIAS)

5846

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE));

5847

5848

LOAD_BLOCK(1, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

5849

5850

#ifndef UNIT_BETA

5851

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

5852

#endif // UNIT_BIAS

5853

5854

// c = c + bias[broadcasted]

5855

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

5856

5857

#else // defined(BROADCAST_BIAS)

5858

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)NUM_ELEMS_PROCESSED_PER_THREAD_X * sizeof(DATA_TYPE)) + (get_global_id(1) *

5859

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

5860

5861

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, bias, src2_addr, 0, src2_stride_y, zero);

5862

5863

#ifndef UNIT_BETA

5864

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, DATA_TYPE, bias, BETA);

#endif // UNIT_BIAS

// c = c + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

5869

5870

#endif // defined(BROADCAST_BIAS)

5871

#endif // defined(BETA)

5872

5873

#if defined(ACTIVATION_TYPE)

5874

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, DATA_TYPE, acc, A_VAL, B_VAL);

5875

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5876

5877

// Store output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

5878

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, NUM_ELEMS_PROCESSED_PER_THREAD_X, DATA_TYPE, acc, dst_addr, dst_stride_y, zout.s);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

5879

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

5880

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

5881

Michele Di Giorgio

f6f08da

2018-04-26 10:24:30 +0100

[diff] [blame]

5882

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5883

*

5884

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

5885

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

5886

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

5887

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

5888

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5889

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using -DMATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

5890

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5891

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5892

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

5893

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5894

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

5895

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5896

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

5897

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

5898

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

5899

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped

5900

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5901

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5902

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

5903

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

5904

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

5905

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

5906

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

5907

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

5908

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

5909

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

5910

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

5911

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

5912

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5913

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

5914

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

5915

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

5916

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

5917

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

5918

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5919

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

5920

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

5921

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

5922

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

5923

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

5924

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5925

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

5926

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5927

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5928

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5929

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

5930

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5931

*/

5932

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

5933

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5934

#if defined(BETA)

5935

IMAGE_DECLARATION(src2),

5936

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5937

IMAGE_DECLARATION(dst),

5938

uint src0_stride_z,

5939

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

5940

#if defined(BETA)

5941

uint src2_stride_z,

5942

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5943

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5944

#if defined(REINTERPRET_INPUT_AS_3D)

5945

,

5946

uint src_cross_plane_pad

5947

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5948

#if defined(REINTERPRET_OUTPUT_AS_3D)

5949

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5950

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

5951

#endif // REINTERPRET_OUTPUT_AS_3D

5952

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

5953

{

5954

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

5955

5956

// Compute starting address for matrix A and matrix B

5957

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

5958

5959

// Update address for matrix A

5960

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

5961

5962

// Update address for matrix B

5963

src_addr.s1 += idx * sizeof(float);

5964

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5965

#if defined(REINTERPRET_INPUT_AS_3D)

5966

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

5967

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

5973

// |******************|

5974

// | cross_plane_pad |

5975

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

5980

5981

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

5982

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

5983

zin = min(DEPTH_GEMM3D - 1, zin);

5984

5985

// Add offset due to the cross plane paddings

5986

zin *= (src_cross_plane_pad * src0_stride_y);

5987

5988

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

5989

// multiply src0_stride_z by DEPTH_GEMM3D

5990

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

5991

5992

#else // defined(REINTERPRET_INPUT_AS_3D)

5993

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

5994

// Add offset for batched GEMM

5995

src_addr.s0 += get_global_id(2) * src0_stride_z;

5996

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

5997

#endif // defined(REINTERPRET_INPUT_AS_3D)

5998

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

5999

#if defined(MATRIX_B_DEPTH)

6000

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6001

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6002

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6003

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6004

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6005

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6006

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6007

float4 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6008

6009

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6010

float4 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6011

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6012

6013

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6014

float4 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6015

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6016

6017

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6018

float4 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6019

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6020

6021

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6022

int i = 0;

6023

for(; i <= ((int)COLS_A - 4); i += 4)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6024

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6025

#if defined(REINTERPRET_INPUT_AS_3D)

6026

// Load values from matrix A and matrix B

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6027

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

6028

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6029

// Load values from matrix A and matrix B

6030

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6031

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6032

float4 a1 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6033

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6034

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6035

float4 a2 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6036

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6037

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6038

float4 a3 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6039

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6040

#endif // defined(REINTERPRET_INPUT_AS_3D)

6041

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6042

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6043

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6044

6045

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6046

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

6047

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

6048

acc0.s2 = fma(a0.s0, b0.s2, acc0.s2);

6049

acc0.s3 = fma(a0.s0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6050

6051

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6052

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6053

acc1.s0 = fma(a1.s0, b0.s0, acc1.s0);

6054

acc1.s1 = fma(a1.s0, b0.s1, acc1.s1);

6055

acc1.s2 = fma(a1.s0, b0.s2, acc1.s2);

6056

acc1.s3 = fma(a1.s0, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6057

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6058

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6059

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6060

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6061

acc2.s0 = fma(a2.s0, b0.s0, acc2.s0);

6062

acc2.s1 = fma(a2.s0, b0.s1, acc2.s1);

6063

acc2.s2 = fma(a2.s0, b0.s2, acc2.s2);

6064

acc2.s3 = fma(a2.s0, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6065

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6066

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6067

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6068

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6069

acc3.s0 = fma(a3.s0, b0.s0, acc3.s0);

6070

acc3.s1 = fma(a3.s0, b0.s1, acc3.s1);

6071

acc3.s2 = fma(a3.s0, b0.s2, acc3.s2);

6072

acc3.s3 = fma(a3.s0, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6073

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6074

6075

// Load values from matrix A and matrix B

6076

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6077

src_addr.s1 += src1_stride_y;

6078

6079

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6080

acc0.s0 = fma(a0.s1, b0.s0, acc0.s0);

6081

acc0.s1 = fma(a0.s1, b0.s1, acc0.s1);

6082

acc0.s2 = fma(a0.s1, b0.s2, acc0.s2);

6083

acc0.s3 = fma(a0.s1, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6084

6085

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6086

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6087

acc1.s0 = fma(a1.s1, b0.s0, acc1.s0);

6088

acc1.s1 = fma(a1.s1, b0.s1, acc1.s1);

6089

acc1.s2 = fma(a1.s1, b0.s2, acc1.s2);

6090

acc1.s3 = fma(a1.s1, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6091

6092

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6093

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6094

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6095

acc2.s0 = fma(a2.s1, b0.s0, acc2.s0);

6096

acc2.s1 = fma(a2.s1, b0.s1, acc2.s1);

6097

acc2.s2 = fma(a2.s1, b0.s2, acc2.s2);

6098

acc2.s3 = fma(a2.s1, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6099

6100

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6101

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6102

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6103

acc3.s0 = fma(a3.s1, b0.s0, acc3.s0);

6104

acc3.s1 = fma(a3.s1, b0.s1, acc3.s1);

6105

acc3.s2 = fma(a3.s1, b0.s2, acc3.s2);

6106

acc3.s3 = fma(a3.s1, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6107

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6108

6109

// Load values from matrix A and matrix B

6110

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6111

src_addr.s1 += src1_stride_y;

6112

6113

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6114

acc0.s0 = fma(a0.s2, b0.s0, acc0.s0);

6115

acc0.s1 = fma(a0.s2, b0.s1, acc0.s1);

6116

acc0.s2 = fma(a0.s2, b0.s2, acc0.s2);

6117

acc0.s3 = fma(a0.s2, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6118

6119

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6120

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6121

acc1.s0 = fma(a1.s2, b0.s0, acc1.s0);

6122

acc1.s1 = fma(a1.s2, b0.s1, acc1.s1);

6123

acc1.s2 = fma(a1.s2, b0.s2, acc1.s2);

6124

acc1.s3 = fma(a1.s2, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6125

6126

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6127

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6128

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6129

acc2.s0 = fma(a2.s2, b0.s0, acc2.s0);

6130

acc2.s1 = fma(a2.s2, b0.s1, acc2.s1);

6131

acc2.s2 = fma(a2.s2, b0.s2, acc2.s2);

6132

acc2.s3 = fma(a2.s2, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6133

6134

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6135

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6136

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6137

acc3.s0 = fma(a3.s2, b0.s0, acc3.s0);

6138

acc3.s1 = fma(a3.s2, b0.s1, acc3.s1);

6139

acc3.s2 = fma(a3.s2, b0.s2, acc3.s2);

6140

acc3.s3 = fma(a3.s2, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6141

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6142

6143

// Load values from matrix A and matrix B

6144

b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

6145

src_addr.s1 += src1_stride_y;

6146

6147

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6148

acc0.s0 = fma(a0.s3, b0.s0, acc0.s0);

6149

acc0.s1 = fma(a0.s3, b0.s1, acc0.s1);

6150

acc0.s2 = fma(a0.s3, b0.s2, acc0.s2);

6151

acc0.s3 = fma(a0.s3, b0.s3, acc0.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6152

6153

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6154

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6155

acc1.s0 = fma(a1.s3, b0.s0, acc1.s0);

6156

acc1.s1 = fma(a1.s3, b0.s1, acc1.s1);

6157

acc1.s2 = fma(a1.s3, b0.s2, acc1.s2);

6158

acc1.s3 = fma(a1.s3, b0.s3, acc1.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6159

6160

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6161

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6162

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6163

acc2.s0 = fma(a2.s3, b0.s0, acc2.s0);

6164

acc2.s1 = fma(a2.s3, b0.s1, acc2.s1);

6165

acc2.s2 = fma(a2.s3, b0.s2, acc2.s2);

6166

acc2.s3 = fma(a2.s3, b0.s3, acc2.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6167

6168

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6169

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6170

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6171

acc3.s0 = fma(a3.s3, b0.s0, acc3.s0);

6172

acc3.s1 = fma(a3.s3, b0.s1, acc3.s1);

6173

acc3.s2 = fma(a3.s3, b0.s2, acc3.s2);

6174

acc3.s3 = fma(a3.s3, b0.s3, acc3.s3);

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6175

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6176

6177

src_addr.s0 += 4 * sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6178

}

6179

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6180

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6181

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6182

#if defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6183

// Load values from matrix A

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6184

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6185

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6186

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6187

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6188

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6189

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6190

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6191

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6192

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6193

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6194

#else // defined(REINTERPRET_INPUT_AS_3D)

6195

// Load values from matrix A

6196

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6197

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6198

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6199

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6200

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6201

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6202

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6203

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6204

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6205

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6206

#endif // defined(REINTERPRET_INPUT_AS_3D)

6207

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6208

// Load values from matrix B

6209

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6210

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6211

6212

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6213

acc0.s0 = fma(a0, b0.s0, acc0.s0);

6214

acc0.s1 = fma(a0, b0.s1, acc0.s1);

6215

acc0.s2 = fma(a0, b0.s2, acc0.s2);

6216

acc0.s3 = fma(a0, b0.s3, acc0.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6217

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6218

acc1.s0 = fma(a1, b0.s0, acc1.s0);

6219

acc1.s1 = fma(a1, b0.s1, acc1.s1);

6220

acc1.s2 = fma(a1, b0.s2, acc1.s2);

6221

acc1.s3 = fma(a1, b0.s3, acc1.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6222

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6223

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6224

acc2.s0 = fma(a2, b0.s0, acc2.s0);

6225

acc2.s1 = fma(a2, b0.s1, acc2.s1);

6226

acc2.s2 = fma(a2, b0.s2, acc2.s2);

6227

acc2.s3 = fma(a2, b0.s3, acc2.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6228

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6229

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6230

acc3.s0 = fma(a3, b0.s0, acc3.s0);

6231

acc3.s1 = fma(a3, b0.s1, acc3.s1);

6232

acc3.s2 = fma(a3, b0.s2, acc3.s2);

6233

acc3.s3 = fma(a3, b0.s3, acc3.s3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6234

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6235

6236

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6237

}

6238

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6239

int z = get_global_id(2);

6240

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6241

// Compute destination address

6242

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6243

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6244

// Compute dst address

6245

__global uchar *dst_addr = offset(&dst, 0, 0);

6246

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6247

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

6248

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6249

#if defined(REINTERPRET_OUTPUT_AS_3D)

6250

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6251

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6252

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

6257

// |******************|

6258

// | cross_plane_pad |

6259

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6264

6265

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6266

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6267

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6268

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6269

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6270

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6271

6272

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6273

// multiply dst_stride_z by DEPTH_GEMM3D

6274

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6275

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6276

// Add offset for batched GEMM

6277

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6278

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6279

6280

// Multiply by the weight of matrix-matrix product and store the result

6281

#if defined(ALPHA)

6282

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

6283

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6288

6289

#if defined(BROADCAST_BIAS)

6290

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float));

6291

6292

LOAD_BLOCK(1, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

6293

6294

#ifndef UNIT_BETA

6295

SCALE_BLOCK(1, float, bias, BETA);

6296

#endif // UNIT_BETA

6297

6298

// acc = acc + bias[broadcasted]

6299

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6300

6301

#else // defined(BROADCAST_BIAS)

6302

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)4 * sizeof(float)) + (get_global_id(1) *

6303

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6304

6305

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, float, bias, src2_addr, 0, src2_stride_y, zero);

6306

6307

#ifndef UNIT_BETA

6308

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6313

6314

#endif // defined(BROADCAST_BIAS)

6315

#endif // defined(BETA)

6316

6317

#if defined(ACTIVATION_TYPE)

6318

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

6319

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6320

6321

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6322

vstore4(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6323

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6324

vstore4(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6325

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6326

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6327

vstore4(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6328

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6329

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6330

vstore4(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6331

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6332

}

6333

6334

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

6335

*

6336

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

6337

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less than or equal to 1000.

6338

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

6339

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

6340

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

6341

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6342

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

6343

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6344

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6345

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B variables required by some activation functions must be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

6346

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6347

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

6348

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6349

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

6350

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

6351

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

6352

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = rows of matrix A NOT reshaped

6353

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6354

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6355

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6356

* @param[in] src0_step_x src0_stride_x * number of elements along X processed per workitem (in bytes)

6357

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6358

* @param[in] src0_step_y src0_stride_y * number of elements along Y processed per workitem (in bytes)

6359

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

6360

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

6361

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6362

* @param[in] src1_step_x src1_stride_x * number of elements along X processed per workitem (in bytes)

6363

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6364

* @param[in] src1_step_y src1_stride_y * number of elements along Y processed per workitem (in bytes)

6365

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6366

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

6367

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

6368

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

6369

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

6370

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

6371

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6372

* @param[out] dst_ptr Pointer to the destination matrix. Supported data types: same as @p src0_ptr

6373

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6374

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem (in bytes)

6375

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6376

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem (in bytes)

6377

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6378

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

6379

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6380

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6381

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6382

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

6383

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6384

*/

6385

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

6386

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6387

#if defined(BETA)

6388

IMAGE_DECLARATION(src2),

6389

#endif // defined(BETA)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6390

IMAGE_DECLARATION(dst),

6391

uint src0_stride_z,

6392

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6393

#if defined(BETA)

6394

uint src2_stride_z,

6395

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6396

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6397

#if defined(REINTERPRET_INPUT_AS_3D)

6398

,

6399

uint src_cross_plane_pad

6400

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6401

#if defined(REINTERPRET_OUTPUT_AS_3D)

6402

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6403

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6404

#endif // REINTERPRET_OUTPUT_AS_3D

6405

)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6406

{

6407

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6408

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

6409

6410

// Compute starting address for matrix A and Matrix B

6411

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

6412

6413

// Update address for the matrix A

6414

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6415

6416

// Update address for the matrix B

6417

src_addr.s1 += idx * sizeof(float);

6418

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6419

#if defined(REINTERPRET_INPUT_AS_3D)

6420

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6421

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

6427

// |******************|

6428

// | cross_plane_pad |

6429

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

6434

6435

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6436

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6437

zin = min(DEPTH_GEMM3D - 1, zin);

6438

6439

// Add offset due to the cross plane paddings

6440

zin *= (src_cross_plane_pad * src0_stride_y);

6441

6442

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6443

// multiply src0_stride_z by DEPTH_GEMM3D

6444

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6445

6446

#else // defined(REINTERPRET_INPUT_AS_3D)

6447

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6448

// Add offset for batched GEMM

6449

src_addr.s0 += get_global_id(2) * src0_stride_z;

6450

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6451

#endif // defined(REINTERPRET_INPUT_AS_3D)

6452

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6453

#if defined(MATRIX_B_DEPTH)

6454

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6455

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6456

#else // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6457

src_addr.s1 += get_global_id(2) * src1_stride_z;

Gian Marco Iodice

2018-03-02 11:18:12 +0000

[diff] [blame]

6458

#endif // defined(MATRIX_B_DEPTH)

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6459

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6460

// Initialize accumulators

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6461

float2 acc0 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6462

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6463

float2 acc1 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6464

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6465

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6466

float2 acc2 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6467

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6468

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6469

float2 acc3 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6470

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6471

6472

// A and B src indices get incremented at the same time.

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6473

int i = 0;

6474

for(; i <= ((int)COLS_A - 8); i += 8)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6475

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6476

#if defined(REINTERPRET_INPUT_AS_3D)

6477

// Load values from matrix A

6478

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + zin.s0));

6479

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6480

// Load values from matrix A

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6481

float8 a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6482

#endif // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6483

6484

// Load values from matrix B

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6485

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6486

src_addr.s1 += src1_stride_y;

6487

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6488

src_addr.s1 += src1_stride_y;

6489

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6490

src_addr.s1 += src1_stride_y;

6491

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6492

src_addr.s1 += src1_stride_y;

6493

float2 b4 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6494

src_addr.s1 += src1_stride_y;

6495

float2 b5 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6496

src_addr.s1 += src1_stride_y;

6497

float2 b6 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6498

src_addr.s1 += src1_stride_y;

6499

float2 b7 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

6500

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6501

6502

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6503

acc0.s0 = fma(a0.s0, b0.s0, acc0.s0);

6504

acc0.s0 = fma(a0.s1, b1.s0, acc0.s0);

6505

acc0.s0 = fma(a0.s2, b2.s0, acc0.s0);

6506

acc0.s0 = fma(a0.s3, b3.s0, acc0.s0);

6507

acc0.s0 = fma(a0.s4, b4.s0, acc0.s0);

6508

acc0.s0 = fma(a0.s5, b5.s0, acc0.s0);

6509

acc0.s0 = fma(a0.s6, b6.s0, acc0.s0);

6510

acc0.s0 = fma(a0.s7, b7.s0, acc0.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6511

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6512

acc0.s1 = fma(a0.s0, b0.s1, acc0.s1);

6513

acc0.s1 = fma(a0.s1, b1.s1, acc0.s1);

6514

acc0.s1 = fma(a0.s2, b2.s1, acc0.s1);

6515

acc0.s1 = fma(a0.s3, b3.s1, acc0.s1);

6516

acc0.s1 = fma(a0.s4, b4.s1, acc0.s1);

6517

acc0.s1 = fma(a0.s5, b5.s1, acc0.s1);

6518

acc0.s1 = fma(a0.s6, b6.s1, acc0.s1);

6519

acc0.s1 = fma(a0.s7, b7.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6520

6521

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6522

#if defined(REINTERPRET_INPUT_AS_3D)

6523

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6524

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6525

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6526

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6527

acc1.s0 = fma(a0.s0, b0.s0, acc1.s0);

6528

acc1.s0 = fma(a0.s1, b1.s0, acc1.s0);

6529

acc1.s0 = fma(a0.s2, b2.s0, acc1.s0);

6530

acc1.s0 = fma(a0.s3, b3.s0, acc1.s0);

6531

acc1.s0 = fma(a0.s4, b4.s0, acc1.s0);

6532

acc1.s0 = fma(a0.s5, b5.s0, acc1.s0);

6533

acc1.s0 = fma(a0.s6, b6.s0, acc1.s0);

6534

acc1.s0 = fma(a0.s7, b7.s0, acc1.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6535

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6536

acc1.s1 = fma(a0.s0, b0.s1, acc1.s1);

6537

acc1.s1 = fma(a0.s1, b1.s1, acc1.s1);

6538

acc1.s1 = fma(a0.s2, b2.s1, acc1.s1);

6539

acc1.s1 = fma(a0.s3, b3.s1, acc1.s1);

6540

acc1.s1 = fma(a0.s4, b4.s1, acc1.s1);

6541

acc1.s1 = fma(a0.s5, b5.s1, acc1.s1);

6542

acc1.s1 = fma(a0.s6, b6.s1, acc1.s1);

6543

acc1.s1 = fma(a0.s7, b7.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6544

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6545

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6546

#if defined(REINTERPRET_INPUT_AS_3D)

6547

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6548

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6549

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6550

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6551

acc2.s0 = fma(a0.s0, b0.s0, acc2.s0);

6552

acc2.s0 = fma(a0.s1, b1.s0, acc2.s0);

6553

acc2.s0 = fma(a0.s2, b2.s0, acc2.s0);

6554

acc2.s0 = fma(a0.s3, b3.s0, acc2.s0);

6555

acc2.s0 = fma(a0.s4, b4.s0, acc2.s0);

6556

acc2.s0 = fma(a0.s5, b5.s0, acc2.s0);

6557

acc2.s0 = fma(a0.s6, b6.s0, acc2.s0);

6558

acc2.s0 = fma(a0.s7, b7.s0, acc2.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6559

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6560

acc2.s1 = fma(a0.s0, b0.s1, acc2.s1);

6561

acc2.s1 = fma(a0.s1, b1.s1, acc2.s1);

6562

acc2.s1 = fma(a0.s2, b2.s1, acc2.s1);

6563

acc2.s1 = fma(a0.s3, b3.s1, acc2.s1);

6564

acc2.s1 = fma(a0.s4, b4.s1, acc2.s1);

6565

acc2.s1 = fma(a0.s5, b5.s1, acc2.s1);

6566

acc2.s1 = fma(a0.s6, b6.s1, acc2.s1);

6567

acc2.s1 = fma(a0.s7, b7.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6568

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6569

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6570

#if defined(REINTERPRET_INPUT_AS_3D)

6571

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6572

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6573

a0 = vload8(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6574

#endif // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6575

acc3.s0 = fma(a0.s0, b0.s0, acc3.s0);

6576

acc3.s0 = fma(a0.s1, b1.s0, acc3.s0);

6577

acc3.s0 = fma(a0.s2, b2.s0, acc3.s0);

6578

acc3.s0 = fma(a0.s3, b3.s0, acc3.s0);

6579

acc3.s0 = fma(a0.s4, b4.s0, acc3.s0);

6580

acc3.s0 = fma(a0.s5, b5.s0, acc3.s0);

6581

acc3.s0 = fma(a0.s6, b6.s0, acc3.s0);

6582

acc3.s0 = fma(a0.s7, b7.s0, acc3.s0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6583

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6584

acc3.s1 = fma(a0.s0, b0.s1, acc3.s1);

6585

acc3.s1 = fma(a0.s1, b1.s1, acc3.s1);

6586

acc3.s1 = fma(a0.s2, b2.s1, acc3.s1);

6587

acc3.s1 = fma(a0.s3, b3.s1, acc3.s1);

6588

acc3.s1 = fma(a0.s4, b4.s1, acc3.s1);

6589

acc3.s1 = fma(a0.s5, b5.s1, acc3.s1);

6590

acc3.s1 = fma(a0.s6, b6.s1, acc3.s1);

6591

acc3.s1 = fma(a0.s7, b7.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6592

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6593

6594

src_addr.s0 += sizeof(float) * 8;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6595

}

6596

// float size increment

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6597

for(; i < (int)COLS_A; ++i)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6598

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6599

#if defined(REINTERPRET_INPUT_AS_3D)

6600

// Load values from matrix A

6601

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6602

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6603

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6604

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6605

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6606

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6607

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6608

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6609

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6610

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6611

#else // defined(REINTERPRET_INPUT_AS_3D)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6612

// Load values from matrix A

6613

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6614

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6615

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6616

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6617

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6618

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6619

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6620

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6621

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6622

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6623

#endif // defined(REINTERPRET_INPUT_AS_3D)

6624

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6625

// Load values from matrix B

6626

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6627

src_addr.s1 += src1_stride_y;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6628

6629

// Multiply and accumulate

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6630

acc0.s0 = fma(a0, b0.s0, acc0.s0);

6631

acc0.s1 = fma(a0, b0.s1, acc0.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6632

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6633

acc1.s0 = fma(a1, b0.s0, acc1.s0);

6634

acc1.s1 = fma(a1, b0.s1, acc1.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6635

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6636

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6637

acc2.s0 = fma(a2, b0.s0, acc2.s0);

6638

acc2.s1 = fma(a2, b0.s1, acc2.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6639

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6640

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6641

acc3.s0 = fma(a3, b0.s0, acc3.s0);

6642

acc3.s1 = fma(a3, b0.s1, acc3.s1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6643

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-04-06 10:00:10 +0100

[diff] [blame]

6644

6645

src_addr.s0 += sizeof(float);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6646

}

6647

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6648

int z = get_global_id(2);

6649

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6650

// Compute destination address

6651

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

6652

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6653

// Compute dst address

6654

__global uchar *dst_addr = offset(&dst, 0, 0);

6655

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6656

uint4 zout = 0;

Michele Di Giorgio

2018-11-16 16:04:25 +0000

[diff] [blame]

6657

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6658

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6659

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6660

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6661

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6662

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

6667

// |******************|

6668

// | cross_plane_pad |

6669

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco

2018-02-15 12:35:44 +0000

[diff] [blame]

6674

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6675

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6676

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6677

zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6678

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

6679

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

6680

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6681

6682

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6683

// multiply dst_stride_z by DEPTH_GEMM3D

6684

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6685

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6686

// Add offset for batched GEMM

6687

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6688

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

6689

6690

// Multiply by the weight of matrix-matrix product and store the result

6691

#if defined(ALPHA)

6692

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

6693

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

6698

6699

#if defined(BROADCAST_BIAS)

6700

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float));

6701

6702

LOAD_BLOCK(1, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

6703

6704

#ifndef UNIT_BETA

6705

SCALE_BLOCK(1, float, bias, BETA);

6706

#endif // UNIT_BETA

6707

6708

// acc = acc + bias[broadcasted]

6709

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

6710

6711

#else // defined(BROADCAST_BIAS)

6712

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)2 * sizeof(float)) + (get_global_id(1) *

6713

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

6714

6715

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 2, float, bias, src2_addr, 0, src2_stride_y, zero);

6716

6717

#ifndef UNIT_BETA

6718

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

6723

6724

#endif // defined(BROADCAST_BIAS)

6725

#endif // defined(BETA)

6726

6727

#if defined(ACTIVATION_TYPE)

6728

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, float, acc, A_VAL, B_VAL);

6729

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6730

6731

// Store the output block

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6732

vstore2(acc0, 0, (__global float *)(dst_addr + 0 * dst_stride_y + zout.s0));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6733

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6734

vstore2(acc1, 0, (__global float *)(dst_addr + 1 * dst_stride_y + zout.s1));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6735

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6736

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6737

vstore2(acc2, 0, (__global float *)(dst_addr + 2 * dst_stride_y + zout.s2));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6738

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6739

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6740

vstore2(acc3, 0, (__global float *)(dst_addr + 3 * dst_stride_y + zout.s3));

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

6741

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

6742

}

6743

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

6744

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

6745

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

6746

*

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6747

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and accumulates the result in a 32-bit floating point variable.

6748

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

6749

* This kernel loads, accumulates and stores 8 elements along x per work-item, so it is expected to be built with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=8.

6750

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

6751

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6752

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

6753

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6754

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6755

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

6756

* The activation function is performed after the bias addition

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6757

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

6758

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

6759

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

6760

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

6761

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

6762

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

6763

*

6764

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

6765

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

6766

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

6767

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

6768

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

6769

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

6770

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

6771

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

6772

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

6773

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

6774

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

6775

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6776

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

6777

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

6778

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

6779

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

6780

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

6781

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6782

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

6783

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

6784

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

6785

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

6786

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

6787

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

6788

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

6789

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6790

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6791

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

6792

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

6793

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

6794

*/

6795

__kernel void gemm_mm_floating_point_f16_bifrost_acc32(IMAGE_DECLARATION(src0),

6796

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6797

#if defined(BETA)

6798

IMAGE_DECLARATION(src2),

6799

#endif // defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6800

IMAGE_DECLARATION(dst),

6801

uint src0_stride_z,

6802

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

6803

#if defined(BETA)

6804

uint src2_stride_z,

6805

#endif //defined(BETA)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6806

uint dst_stride_z

6807

#if defined(REINTERPRET_INPUT_AS_3D)

6808

,

6809

uint src_cross_plane_pad

6810

#endif // REINTERPRET_INPUT_AS_3D

6811

#if defined(REINTERPRET_OUTPUT_AS_3D)

6812

,

6813

uint dst_cross_plane_pad

6814

#endif // REINTERPRET_OUTPUT_AS_3D

6815

)

6816

{

6817

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

6818

6819

// Compute starting address for matrix A and Matrix B

6820

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

6821

6822

// Update address for the matrix A

6823

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

6824

6825

// Update address for the matrix B

6826

src_addr.s1 += idx * sizeof(half);

6827

6828

#if defined(REINTERPRET_INPUT_AS_3D)

6829

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

6830

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

6836

// |******************|

6837

// | cross_plane_pad |

6838

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

6843

6844

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

6845

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

6846

zin = min(DEPTH_GEMM3D - 1, zin);

6847

6848

// Add offset due to the cross plane paddings

6849

zin *= (src_cross_plane_pad * src0_stride_y);

6850

6851

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

6852

// multiply src0_stride_z by DEPTH_GEMM3D

6853

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

6854

6855

#else // defined(REINTERPRET_INPUT_AS_3D)

6856

6857

// Add offset for batched GEMM

6858

src_addr.s0 += get_global_id(2) * src0_stride_z;

6859

6860

#endif // defined(REINTERPRET_INPUT_AS_3D)

6861

6862

#if defined(MATRIX_B_DEPTH)

6863

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

6864

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

6865

#else // defined(MATRIX_B_DEPTH)

6866

src_addr.s1 += get_global_id(2) * src1_stride_z;

6867

#endif // defined(MATRIX_B_DEPTH)

6868

6869

float8 acc0 = 0.0h;

6870

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6871

float8 acc1 = 0.0h;

6872

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6873

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6874

float8 acc2 = 0.0h;

6875

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6876

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6877

float8 acc3 = 0.0h;

6878

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6879

6880

int i = 0;

6881

for(; i <= ((int)COLS_A - 4); i += 4)

6882

{

6883

#if defined(REINTERPRET_INPUT_AS_3D)

6884

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

6885

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

6886

#else // defined(REINTERPRET_INPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

6887

// Load values from matrix A

6888

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6889

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6890

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6891

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6892

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6893

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6894

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6895

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6896

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6897

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6898

#endif // defined(REINTERPRET_INPUT_AS_3D)

6899

6900

// Load values from matrix B

6901

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6902

src_addr.s1 += src1_stride_y;

6903

6904

// Accumulate

6905

acc0 = fma(b0, (float8)a0.s0, acc0);

6906

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6907

acc1 = fma(b0, (float8)a1.s0, acc1);

6908

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6909

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6910

acc2 = fma(b0, (float8)a2.s0, acc2);

6911

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6912

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6913

acc3 = fma(b0, (float8)a3.s0, acc3);

6914

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6915

6916

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6917

src_addr.s1 += src1_stride_y;

6918

acc0 = fma(b0, (float8)a0.s1, acc0);

6919

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6920

acc1 = fma(b0, (float8)a1.s1, acc1);

6921

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6922

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6923

acc2 = fma(b0, (float8)a2.s1, acc2);

6924

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6925

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6926

acc3 = fma(b0, (float8)a3.s1, acc3);

6927

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6928

6929

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6930

src_addr.s1 += src1_stride_y;

6931

acc0 = fma(b0, (float8)a0.s2, acc0);

6932

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6933

acc1 = fma(b0, (float8)a1.s2, acc1);

6934

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6935

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6936

acc2 = fma(b0, (float8)a2.s2, acc2);

6937

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6938

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6939

acc3 = fma(b0, (float8)a3.s2, acc3);

6940

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6941

6942

b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6943

src_addr.s1 += src1_stride_y;

6944

acc0 = fma(b0, (float8)a0.s3, acc0);

6945

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6946

acc1 = fma(b0, (float8)a1.s3, acc1);

6947

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6948

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6949

acc2 = fma(b0, (float8)a2.s3, acc2);

6950

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6951

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6952

acc3 = fma(b0, (float8)a3.s3, acc3);

6953

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6954

6955

src_addr.s0 += 4 * sizeof(half);

6956

}

6957

6958

for(; i < (int)COLS_A; ++i)

6959

{

6960

#if defined(REINTERPRET_INPUT_AS_3D)

6961

// Load values from matrix A

6962

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

6963

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6964

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

6965

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6966

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6967

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

6968

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6969

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6970

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

6971

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6972

#else // defined(REINTERPRET_INPUT_AS_3D)

6973

// Load values from matrix A

6974

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

6975

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6976

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

6977

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6978

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6979

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

6980

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6981

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6982

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

6983

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

6984

#endif // defined(REINTERPRET_INPUT_AS_3D)

6985

6986

// Load values from matrix B

6987

float8 b0 = convert_float8(vload8(0, (__global half *)(src1_ptr + src_addr.s1)));

6988

6989

src_addr += (int2)(sizeof(half), src1_stride_y);

6990

6991

// Accumulate

6992

acc0 = fma(b0, (float8)a0, acc0); // b0 * (half8)a0;

6993

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6994

acc1 = fma(b0, (float8)a1, acc1); // b0 * (half8)a1;

6995

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

6996

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6997

acc2 = fma(b0, (float8)a2, acc2); // b0 * (half8)a2;

6998

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

6999

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7000

acc3 = fma(b0, (float8)a3, acc3); // b0 * (half8)a3;

7001

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7002

}

7003

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7004

int z = get_global_id(2);

7005

7006

// Compute destination address

7007

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

7008

7009

// Compute dst address

7010

__global uchar *dst_addr = offset(&dst, 0, 0);

7011

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7012

uint4 zout = 0;

7013

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7014

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7015

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7016

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

7017

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

7023

// |******************|

7024

// | cross_plane_pad |

7025

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

7030

7031

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7032

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

7033

zout = min(DEPTH_GEMM3D - 1, zout);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7034

7035

// Add offset due to the cross plane paddings

7036

zout *= (dst_cross_plane_pad * dst_stride_y);

7037

7038

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

7039

// multiply dst_stride_z by DEPTH_GEMM3D

7040

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7041

#else // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7042

// Add offset for batched GEMM

7043

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7044

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7045

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7046

// Multiply by the weight of matrix-matrix product and store the result

7047

#if defined(ALPHA)

7048

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, acc, ALPHA);

7049

#endif // defined(ALPHA)

7050

7051

#if defined(BETA)

7052

REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

7053

7054

#if defined(BROADCAST_BIAS)

7055

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

7056

7057

LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7058

7059

float8 bias_f0 = convert_float8(bias0);

7060

7061

#ifndef UNIT_BETA

7062

SCALE_BLOCK(1, float, bias_f, BETA);

7063

#endif // UNIT_BETA

7064

7065

// acc = acc + bias[broadcasted]

7066

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f0);

7067

7068

#else // defined(BROADCAST_BIAS)

7069

__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

7070

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

7071

7072

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7073

7074

float8 bias_f0 = convert_float8(bias0);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7075

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7076

float8 bias_f1 = convert_float8(bias1);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7077

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7078

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7079

float8 bias_f2 = convert_float8(bias2);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7080

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7081

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7082

float8 bias_f3 = convert_float8(bias3);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7083

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7084

7085

#ifndef UNIT_BETA

7086

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, float, bias_f, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias_f);

7091

7092

#endif // defined(BROADCAST_BIAS)

7093

#endif // defined(BETA)

7094

7095

half8 acc_h0 = convert_half8(acc0);

7096

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7097

half8 acc_h1 = convert_half8(acc1);

7098

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7099

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7100

half8 acc_h2 = convert_half8(acc2);

7101

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7102

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7103

half8 acc_h3 = convert_half8(acc3);

7104

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7105

7106

#if defined(ACTIVATION_TYPE)

7107

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc_h, A_VAL, B_VAL);

7108

#endif // defined(ACTIVATION_TYPE)

7109

7110

// Store the output block

7111

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc_h, dst_addr, dst_stride_y, zout.s);

Vidhya Sudhan Loganathan

2018-11-16 11:33:12 +0000

[diff] [blame]

7112

}

7113

7114

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

7115

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7116

* @note This OpenCL kernel works with the 16-bit floating point data type (half) and uses the fma units.

7117

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

7118

* This kernel loads and stores 8 half elements per work-item along x, so it is meant to be used with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=8.

7119

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

7120

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7121

* @note In case the matrix B has 3 dimensions and the matrix A more than 3, in order to avoid out-of-bounds reads, the number of channels of matrix B must be passed at compile time using MATRIX_B_DEPTH (e.g. -DMATRIX_B_DEPTH=16)

7122

* This case can happen when GEMM is used to perform the element-wise multiplication through a batched matrix multiplication (2D Winograd) and we have multiple inputs (e.g. a = [K, M, 16, Batches], b = [N, K, 16])

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7123

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7124

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

7125

* The activation function is performed after the bias addition

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7126

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

7127

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7128

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

7129

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

7130

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

7131

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

7132

*

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7133

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

7134

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

7135

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7136

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

7137

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7138

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

7139

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

7140

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

7141

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7142

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

7143

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7144

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7145

* @param[in] src2_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p src0_ptr

7146

* @param[in] src2_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

7147

* @param[in] src2_step_x (Optional) src2_stride_x * number of elements along X processed per workitem(in bytes)

7148

* @param[in] src2_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

7149

* @param[in] src2_step_y (Optional) src2_stride_y * number of elements along Y processed per workitem(in bytes)

7150

* @param[in] src2_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7151

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

7152

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7153

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7154

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7155

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

7156

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7157

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

7158

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7159

* @param[in] src2_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7160

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7161

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

7162

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7163

*/

7164

__kernel void gemm_mm_floating_point_f16_bifrost(IMAGE_DECLARATION(src0),

7165

IMAGE_DECLARATION(src1),

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7166

#if defined(BETA)

7167

IMAGE_DECLARATION(src2),

7168

#endif // defined(BETA)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7169

IMAGE_DECLARATION(dst),

7170

uint src0_stride_z,

7171

uint src1_stride_z,

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7172

#if defined(BETA)

7173

uint src2_stride_z,

7174

#endif //defined(BETA)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7175

uint dst_stride_z

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7176

#if defined(REINTERPRET_INPUT_AS_3D)

7177

,

7178

uint src_cross_plane_pad

7179

#endif // REINTERPRET_INPUT_AS_3D

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7180

#if defined(REINTERPRET_OUTPUT_AS_3D)

7181

,

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7182

uint dst_cross_plane_pad

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7183

#endif // REINTERPRET_OUTPUT_AS_3D

7184

)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7185

{

7186

// Index (in elements) of the first matrix B column read by this work-item
int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

7187

7188

// Compute starting address for matrix A and Matrix B

7189

// src_addr.s0 is the byte offset into matrix A, src_addr.s1 the byte offset into matrix B
int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

7190

7191

// Update address for the matrix A

7192

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

7193

7194

// Update address for the matrix B

7195

src_addr.s1 += idx * sizeof(half);

7196

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7197

#if defined(REINTERPRET_INPUT_AS_3D)

7198

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

7199

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

7205

// |******************|

7206

// | cross_plane_pad |

7207

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

7212

7213

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

7214

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

7215

// Clamp the plane index to the last plane (DEPTH_GEMM3D - 1)
zin = min(DEPTH_GEMM3D - 1, zin);

7216

7217

// Add offset due to the cross plane paddings

7218

zin *= (src_cross_plane_pad * src0_stride_y);

7219

7220

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

7221

// multiply src0_stride_z by DEPTH_GEMM3D

7222

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

7223

7224

#else // defined(REINTERPRET_INPUT_AS_3D)

7225

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7226

// Add offset for batched GEMM

7227

src_addr.s0 += get_global_id(2) * src0_stride_z;

7228

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7229

#endif // defined(REINTERPRET_INPUT_AS_3D)

7230

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7231

#if defined(MATRIX_B_DEPTH)

7232

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

7233

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

7234

#else // defined(MATRIX_B_DEPTH)

7235

src_addr.s1 += get_global_id(2) * src1_stride_z;

7236

#endif // defined(MATRIX_B_DEPTH)

7237

7238

// One half8 accumulator per row handled by this work-item (up to 4 rows)
half8 acc0 = 0.0h;

7239

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7240

half8 acc1 = 0.0h;

7241

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7242

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7243

half8 acc2 = 0.0h;

7244

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7245

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7246

half8 acc3 = 0.0h;

7247

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7248

7249

int i = 0;

7250

// Main loop: each iteration consumes 4 elements of every A row and 4 consecutive rows of B
for(; i <= ((int)COLS_A - 4); i += 4)

7251

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7252

#if defined(REINTERPRET_INPUT_AS_3D)

7253

// Load values from matrix A

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

7254

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 4, half, a, src0_ptr, src_addr.s0, src0_stride_y, zin.s);

7255

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7256

// Load values from matrix A

7257

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

7258

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7259

half4 a1 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

7260

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7261

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7262

half4 a2 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

7263

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7264

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7265

half4 a3 = vload4(0, (__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

7266

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7267

#endif // defined(REINTERPRET_INPUT_AS_3D)

7268

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7269

// Load values from matrix B

7270

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

7271

src_addr.s1 += src1_stride_y;

7272

7273

// Accumulate

7274

// Step 1 of 4: first loaded element of each A row (.s0) times the current B row
acc0 = fma(b0, (half8)a0.s0, acc0);

7275

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7276

acc1 = fma(b0, (half8)a1.s0, acc1);

7277

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7278

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7279

acc2 = fma(b0, (half8)a2.s0, acc2);

7280

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7281

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7282

acc3 = fma(b0, (half8)a3.s0, acc3);

7283

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7284

7285

// Step 2 of 4: next B row, second loaded element of each A row (.s1)
b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

7286

src_addr.s1 += src1_stride_y;

7287

acc0 = fma(b0, (half8)a0.s1, acc0);

7288

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7289

acc1 = fma(b0, (half8)a1.s1, acc1);

7290

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7291

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7292

acc2 = fma(b0, (half8)a2.s1, acc2);

7293

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7294

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7295

acc3 = fma(b0, (half8)a3.s1, acc3);

7296

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7297

7298

// Step 3 of 4: next B row, third loaded element of each A row (.s2)
b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

7299

src_addr.s1 += src1_stride_y;

7300

acc0 = fma(b0, (half8)a0.s2, acc0);

7301

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7302

acc1 = fma(b0, (half8)a1.s2, acc1);

7303

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7304

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7305

acc2 = fma(b0, (half8)a2.s2, acc2);

7306

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7307

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7308

acc3 = fma(b0, (half8)a3.s2, acc3);

7309

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7310

7311

// Step 4 of 4: next B row, fourth loaded element of each A row (.s3)
b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

7312

src_addr.s1 += src1_stride_y;

7313

acc0 = fma(b0, (half8)a0.s3, acc0);

7314

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7315

acc1 = fma(b0, (half8)a1.s3, acc1);

7316

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7317

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7318

acc2 = fma(b0, (half8)a2.s3, acc2);

7319

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7320

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7321

acc3 = fma(b0, (half8)a3.s3, acc3);

7322

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7323

7324

// Advance matrix A by the 4 elements consumed in this iteration
src_addr.s0 += 4 * sizeof(half);

7325

}

7326

7327

// Left-over loop: handles the columns of A not covered by the main loop, one at a time
for(; i < (int)COLS_A; ++i)

7328

{

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7329

#if defined(REINTERPRET_INPUT_AS_3D)

7330

// Load values from matrix A

7331

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y + zin.s0));

7332

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7333

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y + zin.s1));

7334

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7335

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7336

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y + zin.s2));

7337

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7338

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7339

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y + zin.s3));

7340

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7341

#else // defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7342

// Load values from matrix A

7343

half a0 = *((__global half *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

7344

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7345

half a1 = *((__global half *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

7346

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7347

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7348

half a2 = *((__global half *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

7349

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7350

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7351

half a3 = *((__global half *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

7352

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7353

#endif // defined(REINTERPRET_INPUT_AS_3D)

7354

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7355

// Load values from matrix B

7356

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

7357

7358

// Advance matrix A by one element and matrix B by one row
src_addr += (int2)(sizeof(half), src1_stride_y);

7359

7360

// Accumulate

7361

acc0 = fma(b0, (half8)a0, acc0); // b0 * (half8)a0;

7362

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7363

acc1 = fma(b0, (half8)a1, acc1); // b0 * (half8)a1;

7364

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

7365

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7366

acc2 = fma(b0, (half8)a2, acc2); // b0 * (half8)a2;

7367

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

7368

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7369

acc3 = fma(b0, (half8)a3, acc3); // b0 * (half8)a3;

7370

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

7371

}

7372

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7373

// Batch index, used below to offset the destination address
int z = get_global_id(2);

7374

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7375

// Compute destination address

7376

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

7377

7378

// Compute dst address

7379

__global uchar *dst_addr = offset(&dst, 0, 0);

7380

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7381

// Per-row offsets handed to STORE_BLOCK; left at 0 unless REINTERPRET_OUTPUT_AS_3D is defined
uint4 zout = 0;

7382

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7383

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7384

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7385

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

7386

// in order to take into account the presence of possible cross plane paddings

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7387

//

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

// | |

// | plane0 |

// | |

// |__________________|

7392

// |******************|

7393

// | cross_plane_pad |

7394

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7399

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7400

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7401

zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

7402

// Clamp the plane index to the last plane (DEPTH_GEMM3D - 1)
zout = min(DEPTH_GEMM3D - 1, zout);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7403

Georgios Pinitas

2018-07-11 15:54:56 +0100

[diff] [blame]

7404

// Add offset due to the cross plane paddings

Gian Marco Iodice

2018-07-26 11:44:03 +0100

[diff] [blame]

7405

zout *= (dst_cross_plane_pad * dst_stride_y);

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7406

7407

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

7408

// multiply dst_stride_z by DEPTH_GEMM3D

7409

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

7410

#else // defined(REINTERPRET_OUTPUT_AS_3D)

7411

// Add offset for batched GEMM

7412

dst_addr += z * dst_stride_z;

7413

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

7414

7415

// Multiply by the weight of matrix-matrix product and store the result

7416

#if defined(ALPHA)

7417

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, acc, ALPHA);

7418

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

// Zero offsets (zero0, zero1, ...) passed to LOAD_BLOCK for the bias loads below
REPEAT_VAR_INIT_TO_CONST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, uint, zero, 0);

7423

7424

#if defined(BROADCAST_BIAS)

7425

// NOTE(review): the x offset hard-codes 8 elements per work-item while matrix B uses NUM_ELEMS_PROCESSED_PER_THREAD_X - presumably both are 8; confirm
__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half));

7426

7427

// Only one bias row is loaded; it is added to every accumulator row below
LOAD_BLOCK(1, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7428

7429

#ifndef UNIT_BETA

7430

SCALE_BLOCK(1, half, bias, BETA);

7431

#endif // UNIT_BETA

7432

7433

// acc = acc + bias[broadcasted]

7434

ADD_BLOCK_BROADCAST(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias0);

7435

7436

#else // defined(BROADCAST_BIAS)

7437

// Bias tile address: same x/y/z position as the output tile of this work-item
__global uchar *src2_addr = src2_ptr + src2_offset_first_element_in_bytes + (get_global_id(0) * (uint)8 * sizeof(half)) + (get_global_id(1) *

7438

(uint)NUM_ELEMS_PROCESSED_PER_THREAD_Y * src2_stride_y) + get_global_id(2) * src2_stride_z;

7439

7440

LOAD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, bias, src2_addr, 0, src2_stride_y, zero);

7441

7442

#ifndef UNIT_BETA

7443

SCALE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, half, bias, BETA);

#endif // UNIT_BETA

// acc = acc + bias

ADD_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, acc, bias);

7448

7449

#endif // defined(BROADCAST_BIAS)

7450

#endif // defined(BETA)

7451

7452

#if defined(ACTIVATION_TYPE)

7453

ACTIVATION_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, ACTIVATION_TYPE, half, acc, A_VAL, B_VAL);

7454

#endif // defined(ACTIVATION_TYPE)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7455

7456

// Store the output block

Usama Arif

2019-04-25 14:28:07 +0100

[diff] [blame]

7457

STORE_BLOCK(NUM_ELEMS_PROCESSED_PER_THREAD_Y, 8, half, acc, dst_addr, dst_stride_y, zout.s);

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7458

}

Vidhya Sudhan Loganathan

bdff491

2018-05-22 15:03:09 +0100

[diff] [blame]

7459

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Gian Marco Iodice

2018-04-17 09:52:44 +0100

[diff] [blame]

7460

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

7461

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7462

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7463

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7464

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

7465

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7466

* @note The value of beta needs to be passed at compile time using -DBETA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7467

*

7468

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

7469

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

7470

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7471

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

7472

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7473

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

7474

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7475

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7476

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7477

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7478

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7479

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7480

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7481

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

7482

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7483

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7484

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7485

// In-place update of dst: each work-item reads 4 consecutive floats from dst and src,
// computes dst + BETA * src and writes the result back to dst.
__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),

7486

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7487

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7488

// Compute source and destination addresses

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7489

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

7490

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7491

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7492

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7493

// dst is read here and overwritten below; presumably it already holds alpha * (A x B) - confirm against the caller
float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

7494

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7495

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7496

float4 c = vload4(0, (__global float *)src.ptr);

7497

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7498

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7499

// Only the beta scaling is applied here; alpha_ab is used as loaded
float4 out = alpha_ab + (float4)BETA * c;

7500

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7501

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7502

vstore4(out, 0, (__global float *)dst.ptr);

7503

}

7504

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

7505

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7506

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

7507

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7508

* @note The value of beta needs to be passed at compile time using -DBETA

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7509

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7510

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

7511

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

7512

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7513

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

7514

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7515

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

7516

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7517

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7518

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7519

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7520

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7521

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7522

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7523

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

7524

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7525

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7526

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7527

// In-place update of dst: each work-item reads 8 consecutive halves from dst and src,
// computes dst + BETA * src and writes the result back to dst.
__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),

7528

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7529

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7530

// Compute source and destination addresses

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

7531

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

7532

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7533

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7534

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7535

// dst is read here and overwritten below; presumably it already holds alpha * (A x B) - confirm against the caller
half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

7536

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7537

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7538

half8 c = vload8(0, (__global half *)src.ptr);

7539

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7540

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7541

// Only the beta scaling is applied here; alpha_ab is used as loaded
half8 out = alpha_ab + (half8)BETA * c;

7542

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7543

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7544

vstore8(out, 0, (__global half *)dst.ptr);

7545

}

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

7546

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7547

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7548

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7549

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7550

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

7551

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7552

* @note The width of A needs to be passed at compile time using -DWIDTH_VECTOR_A

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7553

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

7554

* @note The input A and matrix B must not be reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7555

*

7556

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

7557

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

7558

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7559

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

7560

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7561

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7562

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7563

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

7564

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

7565

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

7566

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

7567

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

7568

* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

7569

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

7570

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7571

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

7572

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

7573

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

7574

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

7575

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

7576

*/

7577

// Each work-item multiplies row idy of src0 by plane idy of src1 and stores 4 consecutive floats of the result.
__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),

7578

TENSOR3D_DECLARATION(src1),

7579

IMAGE_DECLARATION(dst))

7580

{

7581

// First matrix B column (in elements) handled by this work-item: 4 floats per work-item
int idx = get_global_id(0) * 4;

7582

int idy = get_global_id(1);

7583

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7584

// Compute the address for the vector A and matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7585

// s0: byte offset of row idy in src0; s1: byte offset of plane idy in src1
int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));

7586

src_addr.s1 += idx * sizeof(float);

7587

7588

// Byte offset one past the last element of the current row of A
int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

float4 acc = 0.0f;

Georgios Pinitas

2017-10-20 18:52:20 +0100

[diff] [blame]

7592

// Main loop: consumes 2 elements of A and 2 rows of B per iteration
for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7593

{

7594

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));

7595

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

7596

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

7597

7598

acc += b0 * (float4)a0.s0;

7599

acc += b1 * (float4)a0.s1;

7600

}

7601

7602

// Left-over loop: handles the last element of A when the main loop stops one element short
for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

7603

{

7604

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

7605

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

7606

7607

acc += b0 * (float4)a0;

7608

}

7609

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

7610

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

7611

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

7612

7613

vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));

7614

}

Anton Lokhmotov