Blame - src/core/CL/cl_kernels/gemmlowp.cl - ml/ComputeLibrary

2017-11-21 10:57:50 +0000

[diff] [blame]

38

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

39

* @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024

40

* @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)

41

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

42

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

43

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

44

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

45

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

46

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

47

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

48

*

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

49

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

50

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

51

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

52

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

53

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

54

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

55

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

56

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

57

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

58

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

59

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

60

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

61

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

62

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

63

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

64

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

65

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

66

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

67

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

68

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

69

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

70

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

71

*/

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

72

__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),

73

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

74

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_OUTPUT_AS_3D)

79

,

80

uint cross_plane_pad

81

#endif // REINTERPRET_OUTPUT_AS_3D

82

)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

83

{

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

84

const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;

85

const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

86

const int z = get_global_id(2);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

87

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

88

// Offset

89

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

90

const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

91

92

// src_addr_a = address of matrix A

93

// src_addr_b = address of matrix B

Isabella Gottardi

b92805b

2018-09-28 18:24:27 +0100

[diff] [blame]

94

__global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

95

__global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

96

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

97

#if defined(MATRIX_B_DEPTH)

98

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

99

src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;

100

#else // defined(MATRIX_B_DEPTH)

101

src_addr_b += z * src1_stride_z;

102

#endif // defined(MATRIX_B_DEPTH)

103

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

104

// Compute end row address for matrix B

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

105

__global uchar *src_end_addr_b = src_addr_b + COLS_B;

106

107

src_addr_a += offset_row_a;

108

src_addr_b += offset_row_b;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

109

110

// Reset accumulators

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

int4 c00 = 0;

int4 c10 = 0;

int4 c20 = 0;

int4 c30 = 0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

115

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

116

for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

117

{

118

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

119

int4 a0 = convert_int4(vload4(0, src_addr_a));

120

int4 b0 = convert_int4(vload4(0, src_addr_b));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

121

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

122

c00 += (int4)a0.s0 * b0;

123

c10 += (int4)a0.s1 * b0;

124

c20 += (int4)a0.s2 * b0;

125

c30 += (int4)a0.s3 * b0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

126

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

127

a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));

128

b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

129

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

130

c00 += (int4)a0.s0 * b0;

131

c10 += (int4)a0.s1 * b0;

132

c20 += (int4)a0.s2 * b0;

133

c30 += (int4)a0.s3 * b0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

134

}

135

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

136

for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

137

{

138

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

139

int4 a0 = convert_int4(vload4(0, src_addr_a));

140

int4 b0 = convert_int4(vload4(0, src_addr_b));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

141

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

142

c00 += (int4)a0.s0 * b0;

143

c10 += (int4)a0.s1 * b0;

144

c20 += (int4)a0.s2 * b0;

145

c30 += (int4)a0.s3 * b0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

146

}

147

148

// Compute destination address

149

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

150

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

151

#if defined(REINTERPRET_OUTPUT_AS_3D)

152

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

153

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

159

// |******************|

160

// | cross_plane_pad |

161

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

166

167

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

168

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

169

zout = min(DEPTH_GEMM3D - 1, zout);

170

171

// Add offset due to the cross plane paddings

172

zout *= (cross_plane_pad * dst_stride_y);

173

174

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

175

// multiply dst_stride_z by DEPTH_GEMM3D

176

dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;

177

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

178

// Store 4x4 block

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

179

vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

180

vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

181

vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

182

vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

183

184

#else // defined(REINTERPRET_OUTPUT_AS_3D)

185

// Add offset for batched GEMM

186

dst.ptr += z * dst_stride_z;

187

188

// Store 4x4 block

189

vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

190

vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

191

vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

192

vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

193

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

194

}

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

195

196

/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)

197

* Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication

198

*

199

* @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B

200

* @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)

201

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

202

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

203

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

204

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

205

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

206

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

207

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

208

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

209

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

210

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

211

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

212

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

213

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

214

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

215

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

216

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

217

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

218

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

219

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

220

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

221

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

222

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

223

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

224

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

225

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

226

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

227

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

228

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

229

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

230

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

231

*/

232

__kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0),

233

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

234

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_OUTPUT_AS_3D)

239

,

240

uint cross_plane_pad

241

#endif // REINTERPRET_OUTPUT_AS_3D

242

)

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

243

{

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

244

const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;

245

const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

246

const int z = get_global_id(2);

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

247

248

// Offset

249

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

250

const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

251

252

// src_addr_a = address of matrix A

253

// src_addr_b = address of matrix B

Isabella Gottardi

b92805b

2018-09-28 18:24:27 +0100

[diff] [blame]

254

__global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

255

__global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

256

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

257

#if defined(MATRIX_B_DEPTH)

258

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

259

src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;

260

#else // defined(MATRIX_B_DEPTH)

261

src_addr_b += z * src1_stride_z;

262

#endif // defined(MATRIX_B_DEPTH)

263

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

264

// Compute end row address for matrix B

265

__global uchar *src_end_addr_b = src_addr_b + COLS_B;

266

267

src_addr_a += offset_row_a;

268

src_addr_b += offset_row_b;

269

270

// Reset accumulators

uint c00 = 0;

uint c01 = 0;

uint c02 = 0;

uint c03 = 0;

uint c10 = 0;

uint c11 = 0;

uint c12 = 0;

uint c13 = 0;

uint c20 = 0;

uint c21 = 0;

uint c22 = 0;

uint c23 = 0;

uint c30 = 0;

uint c31 = 0;

uint c32 = 0;

uint c33 = 0;

#if MULT_INTERLEAVE4X4_HEIGHT == 1

289

for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))

290

{

291

// Load values from matrix A (interleaved) and matrix B (transposed)

292

uchar16 a0 = vload16(0, src_addr_a);

293

uchar4 b0 = vload4(0, src_addr_b);

294

295

c00 += (ushort)a0.s0 * b0.s0;

296

c01 += (ushort)a0.s0 * b0.s1;

297

c02 += (ushort)a0.s0 * b0.s2;

298

c03 += (ushort)a0.s0 * b0.s3;

299

300

c10 += (ushort)a0.s1 * b0.s0;

301

c11 += (ushort)a0.s1 * b0.s1;

302

c12 += (ushort)a0.s1 * b0.s2;

303

c13 += (ushort)a0.s1 * b0.s3;

304

305

c20 += (ushort)a0.s2 * b0.s0;

306

c21 += (ushort)a0.s2 * b0.s1;

307

c22 += (ushort)a0.s2 * b0.s2;

308

c23 += (ushort)a0.s2 * b0.s3;

309

310

c30 += (ushort)a0.s3 * b0.s0;

311

c31 += (ushort)a0.s3 * b0.s1;

312

c32 += (ushort)a0.s3 * b0.s2;

313

c33 += (ushort)a0.s3 * b0.s3;

314

315

// Load values from matrix B (transposed)

316

b0 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);

317

318

c00 += (ushort)a0.s4 * b0.s0;

319

c01 += (ushort)a0.s4 * b0.s1;

320

c02 += (ushort)a0.s4 * b0.s2;

321

c03 += (ushort)a0.s4 * b0.s3;

322

323

c10 += (ushort)a0.s5 * b0.s0;

324

c11 += (ushort)a0.s5 * b0.s1;

325

c12 += (ushort)a0.s5 * b0.s2;

326

c13 += (ushort)a0.s5 * b0.s3;

327

328

c20 += (ushort)a0.s6 * b0.s0;

329

c21 += (ushort)a0.s6 * b0.s1;

330

c22 += (ushort)a0.s6 * b0.s2;

331

c23 += (ushort)a0.s6 * b0.s3;

332

333

c30 += (ushort)a0.s7 * b0.s0;

334

c31 += (ushort)a0.s7 * b0.s1;

335

c32 += (ushort)a0.s7 * b0.s2;

336

c33 += (ushort)a0.s7 * b0.s3;

337

338

// Load values from matrix B (transposed)

339

b0 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);

340

341

c00 += (ushort)a0.s8 * b0.s0;

342

c01 += (ushort)a0.s8 * b0.s1;

343

c02 += (ushort)a0.s8 * b0.s2;

344

c03 += (ushort)a0.s8 * b0.s3;

345

346

c10 += (ushort)a0.s9 * b0.s0;

347

c11 += (ushort)a0.s9 * b0.s1;

348

c12 += (ushort)a0.s9 * b0.s2;

349

c13 += (ushort)a0.s9 * b0.s3;

350

351

c20 += (ushort)a0.sA * b0.s0;

352

c21 += (ushort)a0.sA * b0.s1;

353

c22 += (ushort)a0.sA * b0.s2;

354

c23 += (ushort)a0.sA * b0.s3;

355

356

c30 += (ushort)a0.sB * b0.s0;

357

c31 += (ushort)a0.sB * b0.s1;

358

c32 += (ushort)a0.sB * b0.s2;

359

c33 += (ushort)a0.sB * b0.s3;

360

361

// Load values from matrix B (transposed)

362

b0 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

363

364

c00 += (ushort)a0.sC * b0.s0;

365

c01 += (ushort)a0.sC * b0.s1;

366

c02 += (ushort)a0.sC * b0.s2;

367

c03 += (ushort)a0.sC * b0.s3;

368

369

c10 += (ushort)a0.sD * b0.s0;

370

c11 += (ushort)a0.sD * b0.s1;

371

c12 += (ushort)a0.sD * b0.s2;

372

c13 += (ushort)a0.sD * b0.s3;

373

374

c20 += (ushort)a0.sE * b0.s0;

375

c21 += (ushort)a0.sE * b0.s1;

376

c22 += (ushort)a0.sE * b0.s2;

377

c23 += (ushort)a0.sE * b0.s3;

378

379

c30 += (ushort)a0.sF * b0.s0;

380

c31 += (ushort)a0.sF * b0.s1;

381

c32 += (ushort)a0.sF * b0.s2;

382

c33 += (ushort)a0.sF * b0.s3;

383

384

// Load values from matrix A (interleaved) and matrix B (transposed)

385

a0 = vload16(0, src_addr_a + 16);

386

b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);

387

388

c00 += (ushort)a0.s0 * b0.s0;

389

c01 += (ushort)a0.s0 * b0.s1;

390

c02 += (ushort)a0.s0 * b0.s2;

391

c03 += (ushort)a0.s0 * b0.s3;

392

393

c10 += (ushort)a0.s1 * b0.s0;

394

c11 += (ushort)a0.s1 * b0.s1;

395

c12 += (ushort)a0.s1 * b0.s2;

396

c13 += (ushort)a0.s1 * b0.s3;

397

398

c20 += (ushort)a0.s2 * b0.s0;

399

c21 += (ushort)a0.s2 * b0.s1;

400

c22 += (ushort)a0.s2 * b0.s2;

401

c23 += (ushort)a0.s2 * b0.s3;

402

403

c30 += (ushort)a0.s3 * b0.s0;

404

c31 += (ushort)a0.s3 * b0.s1;

405

c32 += (ushort)a0.s3 * b0.s2;

406

c33 += (ushort)a0.s3 * b0.s3;

407

408

// Load values from matrix B (transposed)

409

b0 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);

410

411

c00 += (ushort)a0.s4 * b0.s0;

412

c01 += (ushort)a0.s4 * b0.s1;

413

c02 += (ushort)a0.s4 * b0.s2;

414

c03 += (ushort)a0.s4 * b0.s3;

415

416

c10 += (ushort)a0.s5 * b0.s0;

417

c11 += (ushort)a0.s5 * b0.s1;

418

c12 += (ushort)a0.s5 * b0.s2;

419

c13 += (ushort)a0.s5 * b0.s3;

420

421

c20 += (ushort)a0.s6 * b0.s0;

422

c21 += (ushort)a0.s6 * b0.s1;

423

c22 += (ushort)a0.s6 * b0.s2;

424

c23 += (ushort)a0.s6 * b0.s3;

425

426

c30 += (ushort)a0.s7 * b0.s0;

427

c31 += (ushort)a0.s7 * b0.s1;

428

c32 += (ushort)a0.s7 * b0.s2;

429

c33 += (ushort)a0.s7 * b0.s3;

430

431

// Load values from matrix B (transposed)

432

b0 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);

433

434

c00 += (ushort)a0.s8 * b0.s0;

435

c01 += (ushort)a0.s8 * b0.s1;

436

c02 += (ushort)a0.s8 * b0.s2;

437

c03 += (ushort)a0.s8 * b0.s3;

438

439

c10 += (ushort)a0.s9 * b0.s0;

440

c11 += (ushort)a0.s9 * b0.s1;

441

c12 += (ushort)a0.s9 * b0.s2;

442

c13 += (ushort)a0.s9 * b0.s3;

443

444

c20 += (ushort)a0.sA * b0.s0;

445

c21 += (ushort)a0.sA * b0.s1;

446

c22 += (ushort)a0.sA * b0.s2;

447

c23 += (ushort)a0.sA * b0.s3;

448

449

c30 += (ushort)a0.sB * b0.s0;

450

c31 += (ushort)a0.sB * b0.s1;

451

c32 += (ushort)a0.sB * b0.s2;

452

c33 += (ushort)a0.sB * b0.s3;

453

454

// Load values from matrix B (transposed)

455

b0 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

456

457

c00 += (ushort)a0.sC * b0.s0;

458

c01 += (ushort)a0.sC * b0.s1;

459

c02 += (ushort)a0.sC * b0.s2;

460

c03 += (ushort)a0.sC * b0.s3;

461

462

c10 += (ushort)a0.sD * b0.s0;

463

c11 += (ushort)a0.sD * b0.s1;

464

c12 += (ushort)a0.sD * b0.s2;

465

c13 += (ushort)a0.sD * b0.s3;

466

467

c20 += (ushort)a0.sE * b0.s0;

468

c21 += (ushort)a0.sE * b0.s1;

469

c22 += (ushort)a0.sE * b0.s2;

470

c23 += (ushort)a0.sE * b0.s3;

471

472

c30 += (ushort)a0.sF * b0.s0;

473

c31 += (ushort)a0.sF * b0.s1;

474

c32 += (ushort)a0.sF * b0.s2;

475

c33 += (ushort)a0.sF * b0.s3;

476

}

477

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

478

479

for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))

480

{

481

// Load values from matrix A (interleaved) and matrix B (transposed)

482

uchar4 a0 = vload4(0, src_addr_a);

483

uchar4 b0 = vload4(0, src_addr_b);

484

485

c00 += (ushort)a0.s0 * b0.s0;

486

c01 += (ushort)a0.s0 * b0.s1;

487

c02 += (ushort)a0.s0 * b0.s2;

488

c03 += (ushort)a0.s0 * b0.s3;

489

490

c10 += (ushort)a0.s1 * b0.s0;

491

c11 += (ushort)a0.s1 * b0.s1;

492

c12 += (ushort)a0.s1 * b0.s2;

493

c13 += (ushort)a0.s1 * b0.s3;

494

495

c20 += (ushort)a0.s2 * b0.s0;

496

c21 += (ushort)a0.s2 * b0.s1;

497

c22 += (ushort)a0.s2 * b0.s2;

498

c23 += (ushort)a0.s2 * b0.s3;

499

500

c30 += (ushort)a0.s3 * b0.s0;

501

c31 += (ushort)a0.s3 * b0.s1;

502

c32 += (ushort)a0.s3 * b0.s2;

503

c33 += (ushort)a0.s3 * b0.s3;

504

}

505

506

// Compute destination address

507

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

508

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

509

#if defined(REINTERPRET_OUTPUT_AS_3D)

510

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

511

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

517

// |******************|

518

// | cross_plane_pad |

519

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

524

525

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

526

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

527

zout = min(DEPTH_GEMM3D - 1, zout);

528

529

// Add offset due to the cross plane paddings

530

zout *= (cross_plane_pad * dst_stride_y);

531

532

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

533

// multiply dst_stride_z by DEPTH_GEMM3D

534

dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;

535

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

536

// Store 4x4 block

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

537

vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

538

vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

539

vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

540

vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

541

542

#else // defined(REINTERPRET_OUTPUT_AS_3D)

543

// Add offset for batched GEMM

544

dst.ptr += z * dst_stride_z;

545

546

// Store 4x4 block

547

vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

548

vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

549

vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

550

vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

551

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

552

}

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

553

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

554

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

555

/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1)

556

* Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication

557

*

558

* @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B

559

* @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)

560

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

561

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

562

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

563

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

564

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

565

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

566

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

567

*

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

568

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

569

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

570

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

571

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

572

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

573

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

574

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

575

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

576

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

577

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

578

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

579

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

580

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

581

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

582

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

583

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

584

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

585

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

586

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

587

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

588

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

589

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

590

*/

591

__kernel void gemmlowp_mm_interleaved_transposed_bifrost_dot8(IMAGE_DECLARATION(src0),

592

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

593

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_OUTPUT_AS_3D)

598

,

599

uint cross_plane_pad

600

#endif // REINTERPRET_OUTPUT_AS_3D

601

)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

602

{

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

603

// Offset

604

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

605

const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

606

607

// src_addr_a = address of matrix A

608

// src_addr_b = address of matrix B

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

609

__global uchar *src_addr_a = (__global uchar *)(src0_ptr + (get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT) * src0_stride_y + get_global_id(2) * src0_stride_z + src0_offset_first_element_in_bytes);

610

__global uchar *src_addr_b = (__global uchar *)(src1_ptr + (get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP) * src1_stride_y + src1_offset_first_element_in_bytes);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

611

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

612

#if defined(MATRIX_B_DEPTH)

613

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

614

src_addr_b += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

615

#else // defined(MATRIX_B_DEPTH)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

616

src_addr_b += get_global_id(2) * src1_stride_z;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

617

#endif // defined(MATRIX_B_DEPTH)

618

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

619

src_addr_a += offset_row_a;

620

src_addr_b += offset_row_b;

621

622

// Reset accumulators

uint c00 = 0;

uint c01 = 0;

uint c02 = 0;

uint c03 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

627

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

uint c10 = 0;

uint c11 = 0;

uint c12 = 0;

uint c13 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

632

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

uint c20 = 0;

uint c21 = 0;

uint c22 = 0;

uint c23 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

637

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

uint c30 = 0;

uint c31 = 0;

uint c32 = 0;

uint c33 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

643

#define COLS_MTX_B (COLS_B / (16 * MULT_TRANSPOSE1XW_WIDTH))

644

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

645

#if MULT_INTERLEAVE4X4_HEIGHT == 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

646

int i = 0;

647

for(; i <= (int)(COLS_MTX_B - 8); i += 8)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

648

{

649

// Load values from matrix A (interleaved) and matrix B (transposed)

650

uchar16 a0 = vload16(0, src_addr_a);

651

uchar4 b0 = vload4(0, src_addr_b);

652

uchar4 b1 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);

653

uchar4 b2 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);

654

uchar4 b3 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

655

uchar4 b4 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);

656

uchar4 b5 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);

657

uchar4 b6 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);

658

uchar4 b7 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

659

660

// Accumulate

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

661

ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c00);

662

ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c01);

663

ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c02);

664

ARM_DOT((uchar4)(a0.s0123), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c03);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

665

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

666

ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c10);

667

ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c11);

668

ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c12);

669

ARM_DOT((uchar4)(a0.s4567), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c13);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

670

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

671

ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c20);

672

ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c21);

673

ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c22);

674

ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c23);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

675

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

676

ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), c30);

677

ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), c31);

678

ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), c32);

679

ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), c33);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

680

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

681

// Accumulate

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

682

a0 = vload16(0, src_addr_a + 16);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

683

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

684

ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c00);

685

ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c01);

686

ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c02);

687

ARM_DOT((uchar4)(a0.s0123), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c03);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

688

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

689

ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c10);

690

ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c11);

691

ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c12);

692

ARM_DOT((uchar4)(a0.s4567), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c13);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

693

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

694

ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c20);

695

ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c21);

696

ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c22);

697

ARM_DOT((uchar4)(a0.s89AB), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c23);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

698

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

699

ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s0, b5.s0, b6.s0, b7.s0), c30);

700

ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s1, b5.s1, b6.s1, b7.s1), c31);

701

ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s2, b5.s2, b6.s2, b7.s2), c32);

702

ARM_DOT((uchar4)(a0.sCDEF), (uchar4)(b4.s3, b5.s3, b6.s3, b7.s3), c33);

703

704

src_addr_a += 32;

705

src_addr_b += 32 * TRANSPOSE1XW_WIDTH_STEP;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

706

}

707

#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

708

int i_left_over = 0;

709

for(; i < (int)(COLS_MTX_B); ++i)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

710

{

711

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

712

uchar16 a0 = vload16(0, src_addr_a + (i_left_over % 4) + ((i_left_over / 4) * 16));

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

713

uchar4 b0 = vload4(0, src_addr_b);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

714

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

715

c00 += a0.s0 * b0.s0;

716

c01 += a0.s0 * b0.s1;

717

c02 += a0.s0 * b0.s2;

718

c03 += a0.s0 * b0.s3;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

719

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

720

c10 += a0.s4 * b0.s0;

721

c11 += a0.s4 * b0.s1;

722

c12 += a0.s4 * b0.s2;

723

c13 += a0.s4 * b0.s3;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

724

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

725

c20 += a0.s8 * b0.s0;

726

c21 += a0.s8 * b0.s1;

727

c22 += a0.s8 * b0.s2;

728

c23 += a0.s8 * b0.s3;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

729

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

730

c30 += a0.sC * b0.s0;

731

c31 += a0.sC * b0.s1;

732

c32 += a0.sC * b0.s2;

733

c33 += a0.sC * b0.s3;

734

735

i_left_over++;

736

src_addr_b += 4 * TRANSPOSE1XW_WIDTH_STEP;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

737

}

738

739

// Compute destination address

740

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

741

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

742

#if defined(REINTERPRET_OUTPUT_AS_3D)

743

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

744

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

750

// |******************|

751

// | cross_plane_pad |

752

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

757

758

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

759

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

760

zout = min(DEPTH_GEMM3D - 1, zout);

761

762

// Add offset due to the cross plane paddings

763

zout *= (cross_plane_pad * dst_stride_y);

764

765

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

766

// multiply dst_stride_z by DEPTH_GEMM3D

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

767

dst.ptr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

768

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

769

// Store 4x4 block

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

770

vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

771

vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

772

vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

773

vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

774

775

#else // defined(REINTERPRET_OUTPUT_AS_3D)

776

// Add offset for batched GEMM

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

777

dst.ptr += get_global_id(2) * dst_stride_z;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

778

779

// Store 4x4 block

780

vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

781

vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

782

vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

783

vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

784

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

785

}

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

786

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

787

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

788

#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

789

790

#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)

791

#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)

792

#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)

793

#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)

794

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

795

*

796

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

797

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

798

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

799

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

800

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

801

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

802

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

803

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

804

*

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

805

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

806

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

807

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

808

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

809

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

810

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

811

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

812

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

813

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

814

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

815

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

816

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

817

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

818

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

819

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

820

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

821

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

822

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

823

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

824

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

825

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

826

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

827

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

828

*/

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

829

__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),

830

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

831

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

836

,

837

uint src_cross_plane_pad

838

#endif // REINTERPRET_INPUT_AS_3D

839

#if defined(REINTERPRET_OUTPUT_AS_3D)

840

,

841

uint dst_cross_plane_pad

842

#endif // REINTERPRET_OUTPUT_AS_3D

843

)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

844

{

845

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

846

847

// Compute starting address for matrix A and Matrix B

848

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

849

850

// Update address for the matrix A

851

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

852

853

// Update address for the matrix B

854

src_addr.s1 += idx;

855

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

856

#if defined(REINTERPRET_INPUT_AS_3D)

857

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

858

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

864

// |******************|

865

// | cross_plane_pad |

866

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

871

872

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

873

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

874

zin = min(DEPTH_GEMM3D - 1, zin);

875

876

// Add offset due to the cross plane paddings

877

zin *= (src_cross_plane_pad * src0_stride_y);

878

879

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

880

// multiply src0_stride_z by DEPTH_GEMM3D

881

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

882

883

#else // defined(REINTERPRET_INPUT_AS_3D)

884

885

// Add offset for batched GEMM

886

src_addr.s0 += get_global_id(2) * src0_stride_z;

887

888

#endif // defined(REINTERPRET_INPUT_AS_3D)

889

890

#if defined(MATRIX_B_DEPTH)

891

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

892

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

893

#else // defined(MATRIX_B_DEPTH)

894

src_addr.s1 += get_global_id(2) * src1_stride_z;

895

#endif // defined(MATRIX_B_DEPTH)

896

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

897

int end_row_vec_a = src_addr.s0 + COLS_A;

898

899

VECTOR_UINT acc0 = 0;

900

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

901

VECTOR_UINT acc1 = 0;

902

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

903

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

904

VECTOR_UINT acc2 = 0;

905

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

906

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

907

VECTOR_UINT acc3 = 0;

908

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

909

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

910

VECTOR_UINT acc4 = 0;

911

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

912

913

for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))

914

{

915

// Load values from matrix A

916

uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);

917

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

918

uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);

919

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

920

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

921

uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);

922

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

923

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

924

uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);

925

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

926

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

927

uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);

928

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

929

// Load values from matrix B

930

VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

931

VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);

932

933

// Accumulate

934

acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;

935

acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;

936

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

937

acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;

938

acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;

939

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

940

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

941

acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;

942

acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;

943

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

944

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

945

acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;

946

acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;

947

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

948

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

949

acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;

950

acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;

951

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

952

}

953

954

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))

955

{

956

// Load values from matrix A

957

uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);

958

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

959

uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);

960

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

961

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

962

uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);

963

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

964

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

965

uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);

966

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

967

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

968

uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);

969

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

970

// Load values from matrix B

971

VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

972

973

// Accumulate

974

acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;

975

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

976

acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;

977

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

978

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

979

acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;

980

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

981

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

982

acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;

983

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

984

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

985

acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;

986

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

987

}

988

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

989

const int z = get_global_id(2);

990

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

991

// Compute destination address

992

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

993

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

994

#if defined(REINTERPRET_OUTPUT_AS_3D)

995

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

996

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1002

// |******************|

1003

// | cross_plane_pad |

1004

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1009

1010

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1011

uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;

1012

zout = min(DEPTH_GEMM3D - 1, zout);

1013

1014

// Add offset due to the cross plane paddings

1015

zout *= (dst_cross_plane_pad * dst_stride_y);

1016

1017

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1018

// multiply dst_stride_z by DEPTH_GEMM3D

1019

dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;

1020

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1021

// Store the result

1022

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1023

(CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1024

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1025

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1026

(CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1027

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1028

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1029

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1030

(CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1031

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1032

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1033

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1034

(CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1035

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1036

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1037

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1038

(CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1039

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1040

1041

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1042

// Add offset for batched GEMM

1043

dst.ptr += z * dst_stride_z;

1044

1045

// Store the result

1046

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1047

(CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

1048

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1049

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1050

(CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

1051

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1052

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1053

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1054

(CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

1055

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1056

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1057

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1058

(CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

1059

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1060

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1061

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1062

(CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));

1063

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1064

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1065

}

1066

1067

/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

1068

*

1069

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

1070

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1071

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1072

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1073

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1074

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1075

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1076

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

1077

*

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1078

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

1079

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1080

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1081

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1082

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1083

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1084

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

1085

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1086

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1087

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1088

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1089

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1090

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

1091

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1092

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1093

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1094

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1095

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1096

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

1097

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1098

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1099

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

1100

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1101

*/

1102

__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),

1103

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1104

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

1109

,

1110

uint src_cross_plane_pad

1111

#endif // REINTERPRET_INPUT_AS_3D

1112

#if defined(REINTERPRET_OUTPUT_AS_3D)

1113

,

1114

uint dst_cross_plane_pad

1115

#endif // REINTERPRET_OUTPUT_AS_3D

1116

)

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1117

{

1118

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1119

1120

// Compute starting address for matrix A and Matrix B

1121

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1122

1123

// Update address for the matrix A

1124

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1125

1126

// Update address for the matrix B

1127

src_addr.s1 += idx;

1128

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1129

#if defined(REINTERPRET_INPUT_AS_3D)

1130

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

1131

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1137

// |******************|

1138

// | cross_plane_pad |

1139

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1144

1145

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1146

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1147

zin = min(DEPTH_GEMM3D - 1, zin);

1148

1149

// Add offset due to the cross plane paddings

1150

zin *= (src_cross_plane_pad * src0_stride_y);

1151

1152

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1153

// multiply src0_stride_z by DEPTH_GEMM3D

1154

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

1155

1156

#else // defined(REINTERPRET_INPUT_AS_3D)

1157

1158

// Add offset for batched GEMM

1159

src_addr.s0 += get_global_id(2) * src0_stride_z;

1160

1161

#endif // defined(REINTERPRET_INPUT_AS_3D)

1162

1163

#if defined(MATRIX_B_DEPTH)

1164

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1165

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1166

#else // defined(MATRIX_B_DEPTH)

1167

src_addr.s1 += get_global_id(2) * src1_stride_z;

1168

#endif // defined(MATRIX_B_DEPTH)

1169

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1170

int end_row_vec_a = src_addr.s0 + COLS_A;

uint acc00 = 0;

uint acc01 = 0;

uint acc02 = 0;

uint acc03 = 0;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

uint acc10 = 0;

uint acc11 = 0;

uint acc12 = 0;

uint acc13 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1182

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

uint acc20 = 0;

uint acc21 = 0;

uint acc22 = 0;

uint acc23 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1188

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

uint acc30 = 0;

uint acc31 = 0;

uint acc32 = 0;

uint acc33 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1194

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

uint acc40 = 0;

uint acc41 = 0;

uint acc42 = 0;

uint acc43 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1200

1201

for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))

1202

{

1203

// Load values from matrix A

1204

uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);

1205

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1206

uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);

1207

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1208

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1209

uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);

1210

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1211

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1212

uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);

1213

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1214

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1215

uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);

1216

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1217

// Load values from matrix B

1218

uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);

1219

uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);

1220

uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);

1221

uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

{

// Accumulate

ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;

1226

ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;

1227

ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;

1228

ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;

1229

1230

ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;

1231

ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;

1232

ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;

1233

ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;

1234

1235

ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;

1236

ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;

1237

ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;

1238

ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;

1239

1240

ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;

1241

ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;

1242

ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;

1243

ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;

1244

1245

acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

1246

acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

1247

acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

1248

acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

1249

}

1250

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1251

{

1252

// Accumulate

1253

ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;

1254

ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;

1255

ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;

1256

ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;

1257

1258

ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;

1259

ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;

1260

ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;

1261

ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;

1262

1263

ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;

1264

ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;

1265

ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;

1266

ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;

1267

1268

ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;

1269

ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;

1270

ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;

1271

ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;

1272

1273

acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

1274

acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

1275

acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

1276

acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

1277

}

1278

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1279

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1280

{

1281

// Accumulate

1282

ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;

1283

ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;

1284

ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;

1285

ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;

1286

1287

ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;

1288

ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;

1289

ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;

1290

ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;

1291

1292

ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;

1293

ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;

1294

ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;

1295

ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;

1296

1297

ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;

1298

ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;

1299

ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;

1300

ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;

1301

1302

acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

1303

acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

1304

acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

1305

acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

1306

}

1307

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1308

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1309

{

1310

// Accumulate

1311

ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;

1312

ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;

1313

ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;

1314

ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;

1315

1316

ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;

1317

ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;

1318

ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;

1319

ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;

1320

1321

ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;

1322

ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;

1323

ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;

1324

ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;

1325

1326

ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;

1327

ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;

1328

ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;

1329

ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;

1330

1331

acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

1332

acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

1333

acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

1334

acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

1335

}

1336

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1337

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1338

{

1339

// Accumulate

1340

ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;

1341

ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;

1342

ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;

1343

ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;

1344

1345

ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;

1346

ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;

1347

ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;

1348

ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;

1349

1350

ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;

1351

ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;

1352

ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;

1353

ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;

1354

1355

ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;

1356

ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;

1357

ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;

1358

ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;

1359

1360

acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

1361

acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

1362

acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

1363

acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

1364

}

1365

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1366

}

1367

1368

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))

1369

{

1370

// Load values from matrix A

1371

uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);

1372

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1373

uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);

1374

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1375

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1376

uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);

1377

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1378

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1379

uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);

1380

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1381

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1382

uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);

1383

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1384

// Load values from matrix B

1385

uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);

// Accumulate

{

// Accumulate

ushort tmp0 = (ushort)b0.s0 * (ushort)a0;

1391

ushort tmp1 = (ushort)b0.s1 * (ushort)a0;

1392

ushort tmp2 = (ushort)b0.s2 * (ushort)a0;

1393

ushort tmp3 = (ushort)b0.s3 * (ushort)a0;

1394

1395

acc00 += ((uint)tmp0);

1396

acc01 += ((uint)tmp1);

1397

acc02 += ((uint)tmp2);

1398

acc03 += ((uint)tmp3);

1399

}

1400

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1401

{

1402

// Accumulate

1403

ushort tmp0 = (ushort)b0.s0 * (ushort)a1;

1404

ushort tmp1 = (ushort)b0.s1 * (ushort)a1;

1405

ushort tmp2 = (ushort)b0.s2 * (ushort)a1;

1406

ushort tmp3 = (ushort)b0.s3 * (ushort)a1;

1407

1408

acc10 += ((uint)tmp0);

1409

acc11 += ((uint)tmp1);

1410

acc12 += ((uint)tmp2);

1411

acc13 += ((uint)tmp3);

1412

}

1413

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1414

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1415

{

1416

// Accumulate

1417

ushort tmp0 = (ushort)b0.s0 * (ushort)a2;

1418

ushort tmp1 = (ushort)b0.s1 * (ushort)a2;

1419

ushort tmp2 = (ushort)b0.s2 * (ushort)a2;

1420

ushort tmp3 = (ushort)b0.s3 * (ushort)a2;

1421

1422

acc20 += ((uint)tmp0);

1423

acc21 += ((uint)tmp1);

1424

acc22 += ((uint)tmp2);

1425

acc23 += ((uint)tmp3);

1426

}

1427

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1428

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1429

{

1430

// Accumulate

1431

ushort tmp0 = (ushort)b0.s0 * (ushort)a3;

1432

ushort tmp1 = (ushort)b0.s1 * (ushort)a3;

1433

ushort tmp2 = (ushort)b0.s2 * (ushort)a3;

1434

ushort tmp3 = (ushort)b0.s3 * (ushort)a3;

1435

1436

acc30 += ((uint)tmp0);

1437

acc31 += ((uint)tmp1);

1438

acc32 += ((uint)tmp2);

1439

acc33 += ((uint)tmp3);

1440

}

1441

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1442

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1443

{

1444

// Accumulate

1445

ushort tmp0 = (ushort)b0.s0 * (ushort)a4;

1446

ushort tmp1 = (ushort)b0.s1 * (ushort)a4;

1447

ushort tmp2 = (ushort)b0.s2 * (ushort)a4;

1448

ushort tmp3 = (ushort)b0.s3 * (ushort)a4;

1449

1450

acc40 += ((uint)tmp0);

1451

acc41 += ((uint)tmp1);

1452

acc42 += ((uint)tmp2);

1453

acc43 += ((uint)tmp3);

1454

}

1455

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1456

}

1457

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1458

const int z = get_global_id(2);

1459

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1460

// Compute destination address

1461

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1462

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1463

#if defined(REINTERPRET_OUTPUT_AS_3D)

1464

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

1465

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1471

// |******************|

1472

// | cross_plane_pad |

1473

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1478

1479

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1480

uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;

1481

zout = min(DEPTH_GEMM3D - 1, zout);

1482

1483

// Add offset due to the cross plane paddings

1484

zout *= (dst_cross_plane_pad * dst_stride_y);

1485

1486

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1487

// multiply dst_stride_z by DEPTH_GEMM3D

1488

dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;

1489

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1490

// Store the result

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1491

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1492

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1493

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1494

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1495

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1496

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1497

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1498

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1499

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1500

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1501

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1502

vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1503

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1504

1505

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1506

// Add offset for batched GEMM

1507

dst.ptr += z * dst_stride_z;

1508

1509

// Store the result

1510

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

1511

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1512

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

1513

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1514

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1515

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

1516

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1517

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1518

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

1519

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1520

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1521

vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));

1522

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1523

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1524

}

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1525

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

1526

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1527

/** OpenCL kernel optimized to use dot product that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

1528

*

1529

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

1530

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1531

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1532

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1533

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1534

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1535

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1536

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

1537

*

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1538

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

1539

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1540

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1541

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1542

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1543

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1544

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

1545

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1546

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1547

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1548

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1549

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1550

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

1551

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1552

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1553

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1554

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1555

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1556

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

1557

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1558

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1559

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

1560

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1561

*/

1562

__kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),

1563

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1564

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

1569

,

1570

uint src_cross_plane_pad

1571

#endif // REINTERPRET_INPUT_AS_3D

1572

#if defined(REINTERPRET_OUTPUT_AS_3D)

1573

,

1574

uint dst_cross_plane_pad

1575

#endif // REINTERPRET_OUTPUT_AS_3D)

1576

)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1577

{

1578

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1579

1580

// Compute starting address for matrix A and Matrix B

1581

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1582

1583

// Update address for the matrix A

1584

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1585

1586

// Update address for the matrix B

1587

src_addr.s1 += idx;

1588

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1589

#if defined(REINTERPRET_INPUT_AS_3D)

1590

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

1591

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1597

// |******************|

1598

// | cross_plane_pad |

1599

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1604

1605

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1606

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1607

zin = min(DEPTH_GEMM3D - 1, zin);

1608

1609

// Add offset due to the cross plane paddings

1610

zin *= (src_cross_plane_pad * src0_stride_y);

1611

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1612

zin += ((uint4)(0, 1, 2, 3)) * src0_stride_y;

1613

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1614

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1615

// multiply src0_stride_z by DEPTH_GEMM3D

1616

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

1617

1618

#else // defined(REINTERPRET_INPUT_AS_3D)

1619

1620

// Add offset for batched GEMM

1621

src_addr.s0 += get_global_id(2) * src0_stride_z;

1622

1623

#endif // defined(REINTERPRET_INPUT_AS_3D)

1624

1625

#if defined(MATRIX_B_DEPTH)

1626

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1627

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1628

#else // defined(MATRIX_B_DEPTH)

1629

src_addr.s1 += get_global_id(2) * src1_stride_z;

1630

#endif // defined(MATRIX_B_DEPTH)

1631

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

uint acc00 = 0;

uint acc01 = 0;

uint acc02 = 0;

uint acc03 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc04 = 0;

uint acc05 = 0;

uint acc06 = 0;

uint acc07 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1640

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

uint acc10 = 0;

uint acc11 = 0;

uint acc12 = 0;

uint acc13 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc14 = 0;

uint acc15 = 0;

uint acc16 = 0;

uint acc17 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1649

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1650

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

uint acc20 = 0;

uint acc21 = 0;

uint acc22 = 0;

uint acc23 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc24 = 0;

uint acc25 = 0;

uint acc26 = 0;

uint acc27 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1659

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1660

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

uint acc30 = 0;

uint acc31 = 0;

uint acc32 = 0;

uint acc33 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc34 = 0;

uint acc35 = 0;

uint acc36 = 0;

uint acc37 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1669

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1670

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1671

// A and B src indices get incremented at the same time.

1672

int i = 0;

1673

for(; i <= ((int)COLS_A - 8); i += 8)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1674

{

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1675

#if defined(REINTERPRET_INPUT_AS_3D)

1676

// Load values from matrix A and matrix B

1677

uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1678

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1679

uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1680

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1681

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1682

uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1683

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1684

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1685

uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1686

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1687

#else // defined(REINTERPRET_INPUT_AS_3D)

1688

// Load values from matrix A and matrix B

1689

uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1690

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1691

uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1692

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1693

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1694

uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1695

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1696

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1697

uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1698

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1699

#endif // defined(REINTERPRET_INPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1700

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1701

uchar8 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);

1702

uchar8 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);

1703

uchar8 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);

1704

uchar8 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

1705

src_addr.s1 += 4 * src1_stride_y;

1706

1707

ARM_DOT(a0.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);

1708

ARM_DOT(a0.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);

1709

ARM_DOT(a0.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);

1710

ARM_DOT(a0.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);

1711

ARM_DOT(a0.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);

1712

ARM_DOT(a0.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);

1713

ARM_DOT(a0.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);

1714

ARM_DOT(a0.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);

1715

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1716

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1717

ARM_DOT(a1.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);

1718

ARM_DOT(a1.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);

1719

ARM_DOT(a1.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);

1720

ARM_DOT(a1.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);

1721

ARM_DOT(a1.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);

1722

ARM_DOT(a1.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);

1723

ARM_DOT(a1.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);

1724

ARM_DOT(a1.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1725

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1726

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1727

ARM_DOT(a2.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);

1728

ARM_DOT(a2.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);

1729

ARM_DOT(a2.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);

1730

ARM_DOT(a2.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);

1731

ARM_DOT(a2.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);

1732

ARM_DOT(a2.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);

1733

ARM_DOT(a2.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);

1734

ARM_DOT(a2.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1735

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1736

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1737

ARM_DOT(a3.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);

1738

ARM_DOT(a3.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);

1739

ARM_DOT(a3.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);

1740

ARM_DOT(a3.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);

1741

ARM_DOT(a3.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);

1742

ARM_DOT(a3.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);

1743

ARM_DOT(a3.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);

1744

ARM_DOT(a3.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1745

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1746

1747

b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);

1748

b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);

1749

b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);

1750

b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

1751

src_addr.s1 += 4 * src1_stride_y;

1752

1753

ARM_DOT(a0.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);

1754

ARM_DOT(a0.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);

1755

ARM_DOT(a0.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);

1756

ARM_DOT(a0.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);

1757

ARM_DOT(a0.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);

1758

ARM_DOT(a0.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);

1759

ARM_DOT(a0.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);

1760

ARM_DOT(a0.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);

1761

1762

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1763

ARM_DOT(a1.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);

1764

ARM_DOT(a1.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);

1765

ARM_DOT(a1.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);

1766

ARM_DOT(a1.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);

1767

ARM_DOT(a1.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);

1768

ARM_DOT(a1.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);

1769

ARM_DOT(a1.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);

1770

ARM_DOT(a1.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);

1771

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1772

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1773

ARM_DOT(a2.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);

1774

ARM_DOT(a2.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);

1775

ARM_DOT(a2.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);

1776

ARM_DOT(a2.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);

1777

ARM_DOT(a2.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);

1778

ARM_DOT(a2.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);

1779

ARM_DOT(a2.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);

1780

ARM_DOT(a2.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);

1781

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1782

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1783

ARM_DOT(a3.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);

1784

ARM_DOT(a3.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);

1785

ARM_DOT(a3.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);

1786

ARM_DOT(a3.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);

1787

ARM_DOT(a3.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);

1788

ARM_DOT(a3.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);

1789

ARM_DOT(a3.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);

1790

ARM_DOT(a3.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);

1791

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1792

1793

src_addr.s0 += 8;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1794

}

1795

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1796

for(; i < (int)COLS_A; ++i)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1797

{

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1798

#if defined(REINTERPRET_INPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1799

// Load values from matrix A

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1800

uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1801

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1802

uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1803

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1804

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1805

uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1806

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1807

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1808

uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1809

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1810

#else // defined(REINTERPRET_INPUT_AS_3D)

1811

// Load values from matrix A

1812

uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1813

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1814

uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1815

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1816

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1817

uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1818

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1819

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1820

uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1821

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1822

#endif // defined(REINTERPRET_INPUT_AS_3D)

1823

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1824

// Load values from matrix B

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1825

uchar8 b0 = vload8(0, src1_ptr + src_addr.s1);

1826

src_addr.s1 += src1_stride_y;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1827

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1828

acc00 += (uint)a0 * b0.s0;

1829

acc01 += (uint)a0 * b0.s1;

1830

acc02 += (uint)a0 * b0.s2;

1831

acc03 += (uint)a0 * b0.s3;

1832

acc04 += (uint)a0 * b0.s4;

1833

acc05 += (uint)a0 * b0.s5;

1834

acc06 += (uint)a0 * b0.s6;

1835

acc07 += (uint)a0 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1836

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1837

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1838

acc10 += (uint)a1 * b0.s0;

1839

acc11 += (uint)a1 * b0.s1;

1840

acc12 += (uint)a1 * b0.s2;

1841

acc13 += (uint)a1 * b0.s3;

1842

acc14 += (uint)a1 * b0.s4;

1843

acc15 += (uint)a1 * b0.s5;

1844

acc16 += (uint)a1 * b0.s6;

1845

acc17 += (uint)a1 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1846

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1847

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1848

acc20 += (uint)a2 * b0.s0;

1849

acc21 += (uint)a2 * b0.s1;

1850

acc22 += (uint)a2 * b0.s2;

1851

acc23 += (uint)a2 * b0.s3;

1852

acc24 += (uint)a2 * b0.s4;

1853

acc25 += (uint)a2 * b0.s5;

1854

acc26 += (uint)a2 * b0.s6;

1855

acc27 += (uint)a2 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1856

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1857

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1858

acc30 += (uint)a3 * b0.s0;

1859

acc31 += (uint)a3 * b0.s1;

1860

acc32 += (uint)a3 * b0.s2;

1861

acc33 += (uint)a3 * b0.s3;

1862

acc34 += (uint)a3 * b0.s4;

1863

acc35 += (uint)a3 * b0.s5;

1864

acc36 += (uint)a3 * b0.s6;

1865

acc37 += (uint)a3 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1866

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1867

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1868

src_addr.s0 += 1;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1869

}

1870

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1871

int z = get_global_id(2);

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1872

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1873

// Compute destination address

1874

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1875

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1876

// Compute dst address

1877

__global uchar *dst_addr = dst.ptr;

1878

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1879

#if defined(REINTERPRET_OUTPUT_AS_3D)

1880

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

1881

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1887

// |******************|

1888

// | cross_plane_pad |

1889

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1894

1895

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1896

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1897

zout = min(DEPTH_GEMM3D - 1, zout);

1898

1899

// Add offset due to the cross plane paddings

1900

zout *= (dst_cross_plane_pad * dst_stride_y);

1901

1902

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1903

// multiply dst_stride_z by DEPTH_GEMM3D

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1904

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1905

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1906

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1907

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));

1908

vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1909

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1910

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));

1911

vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1912

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1913

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1914

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));

1915

vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1916

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1917

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1918

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));

1919

vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1920

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1921

1922

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1923

// Add offset for batched GEMM

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1924

dst_addr += z * dst_stride_z;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1925

1926

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1927

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y));

1928

vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1929

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1930

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y));

1931

vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1932

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1933

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1934

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y));

1935

vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1936

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1937

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1938

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y));

1939

vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1940

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1941

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1942

}

Georgios Pinitas

2018-08-28 17:43:18 +0100

[diff] [blame]

1943

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1944

#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)

1945

1946

#if defined(COLS_A)

1947

/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.

1948

*

1949

* @note This stage is needed to handle the offset of matrix product

1950

* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md

1951

*

1952

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

1953

*

1954

* @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8

1955

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1956

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1957

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1958

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1959

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1960

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1961

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1962

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32

1963

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1964

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1965

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1966

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1967

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1968

*/

1969

__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),

1970

IMAGE_DECLARATION(dst))

1971

{

1972

// Compute source and destination addresses

1973

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

1974

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1975

1976

uint4 sum_row_u32 = (uint4)0;

1977

uint sum_row = 0;

1978

1979

__global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

int i = 0;

// This for loop performs 16 accumulations

1984

for(; i <= ((int)COLS_A - 16); i += 16)

1985

{

1986

const uchar16 a0_u8 = vload16(0, matrix_a + i);

1987

1988

sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);

1989

}

1990

1991

// This for loop performs the leftover accumulations

1992

for(; i < COLS_A; ++i)

1993

{

1994

sum_row += matrix_a[i];

1995

}

1996

1997

sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;

1998

1999

*((__global int *)dst.ptr) = (int)sum_row;

2000

}

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2001

2002

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

2003

/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction

2004

*

2005

* @note This stage is needed to handle the offset of matrix product

2006

* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md

2007

*

2008

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

2009

*

2010

* @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8

2011

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2012

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2013

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2014

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2015

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2016

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2017

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2018

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32

2019

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2020

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2021

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2022

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2023

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2024

*/

2025

__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),

2026

IMAGE_DECLARATION(dst))

2027

{

2028

// Compute source and destination addresses

2029

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

2030

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

uint sum_row = 0;

__global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

int i = 0;

// This for loop performs 16 accumulations

2039

for(; i <= ((int)COLS_A - 32); i += 32)

2040

{

2041

uchar16 a0_u8 = vload16(0, matrix_a + i);

2042

2043

sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));

2044

sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));

2045

sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));

2046

sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));

2047

2048

a0_u8 = vload16(1, matrix_a + i);

2049

2050

sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));

2051

sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));

2052

sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));

2053

sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));

2054

}

2055

2056

// This for loop performs the leftover accumulations

2057

for(; i < COLS_A; ++i)

2058

{

2059

sum_row += matrix_a[i];

2060

}

2061

2062

*((__global int *)dst.ptr) = (int)sum_row;

2063

}

2064

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2065

#endif // defined(COLS_A)

2066

2067

#if defined(COLS_B) && defined(ROWS_B)

2068

/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.

2069

*

2070

* @note This stage is needed to handle the offset of matrix product

2071

* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md

2072

*

2073

* @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B

2074

*

2075

* @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8

2076

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2077

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2078

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2079

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2080

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2081

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2082

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2083

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32

2084

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2085

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2086

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2087

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2088

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2089

*/

2090

__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),

2091

IMAGE_DECLARATION(dst))

2092

{

2093

// Compute source and destination addresses

2094

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

2095

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2096

2097

uint16 sum_col_u32 = (uint16)0;

2098

2099

__global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

2100

2101

int i = 0;

2102

// This for loop performs 4 accumulations

2103

for(; i <= ((int)ROWS_B - 4); i += 4)

2104

{

2105

const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);

2106

const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);

2107

const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);

2108

const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);

2109

2110

sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);

2111

2112

matrix_b += 4 * src_stride_y;

2113

}

2114

2115

// This for loop perfoms the leftover accumulations

2116

for(; i < (int)ROWS_B; ++i)

2117

{

2118

const uchar16 b0_u8 = vload16(0, matrix_b);

2119

2120

sum_col_u32 += convert_uint16(b0_u8);

2121

2122

matrix_b += src_stride_y;

2123

}

2124

2125

vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);

2126

}

2127

#endif // defined(COLS_B) && defined(ROWS_B)

2128

2129

#if defined(K_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2130

2131

/* Helper function used to calculate the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel.

2132

*

2133

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),

2134

* and calculates the offset contribution of matrix A and matrix B.

2135

*

2136

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2137

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2138

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

2139

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

2140

*

2141

* @param[in] x get_global_id(0) * 4

2142

* @param[in] y get_global_id(1)

2143

* @param[in] z get_global_id(2)

2144

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2145

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2146

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2147

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2148

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2149

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2150

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2151

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2152

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2153

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2154

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2155

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2156

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2157

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2158

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2159

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

2160

*/

2161

inline int4 offset_contribution(

int x,

int y,

int z

#if defined(A_OFFSET)

2166

,

2167

IMAGE_DECLARATION(sum_col)

2168

#endif // defined(A_OFFSET)

2169

#if defined(B_OFFSET)

2170

,

2171

IMAGE_DECLARATION(sum_row)

2172

#endif // defined(B_OFFSET)

2173

#if defined(ADD_BIAS)

2174

,

2175

VECTOR_DECLARATION(biases)

2176

#endif // defined(ADD_BIAS)

2177

)

2178

{

2179

int4 a_offset_s32 = (int4)0;

2180

int4 b_offset_s32 = (int4)0;

2181

2182

int batch_id = z;

2183

#if defined(DEPTH_INPUT3D)

2184

batch_id /= (int)DEPTH_INPUT3D;

2185

#endif // defined(DEPTH_INPUT3D)

2186

2187

#if defined(A_OFFSET)

2188

// Compute the offset contribution due to A_OFFSET

2189

__global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);

2190

2191

// Compute the offset contribution due to A_OFFSET

2192

#if defined(SUM_COL_HAS_BATCHES)

2193

a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));

2194

#else // defined(SUM_COL_HAS_BATCHES)

2195

a_offset_s32 = vload4(0, (__global int *)sum_col_addr);

2196

#endif // defined(SUM_COL_HAS_BATCHES)

2197

2198

a_offset_s32 *= (int4)A_OFFSET;

2199

#endif // defined(A_OFFSET)

2200

2201

#if defined(B_OFFSET)

2202

// Compute the offset contribution due to A_OFFSET

2203

__global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);

2204

2205

// Compute the offset contribution due to B_OFFSET

2206

#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)

2207

b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);

2208

#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)

2209

b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));

2210

#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)

2211

b_offset_s32 *= (int4)B_OFFSET;

2212

#endif // defined(B_OFFSET)

2213

2214

#if defined(ADD_BIAS)

2215

// Add bias

2216

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2217

2218

int4 biases_values = vload4(0, (__global int *)bias_addr);

2219

b_offset_s32 += (int4)biases_values;

2220

#endif // defined(ADD_BIAS)

2221

2222

return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;

2223

}

2224

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2225

/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place

2226

*

2227

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),

2228

* and adds to it the offset contribution of matrix A and matrix B in-place.

2229

*

2230

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2231

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2232

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

Chunosov

5124be5

2017-11-22 20:42:13 +0700

[diff] [blame]

2233

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2234

*

2235

* The final result is:

2236

*

2237

* mm_result[i][k] = mm_result[i][k] +

2238

* (sum_col[k] * A_OFFSET) +

2239

* (sum_row[i] * B_OFFSET) +

2240

* (K_OFFSET)

2241

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

2242

* @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32

2243

* @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)

2244

* @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)

2245

* @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)

2246

* @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)

2247

* @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)

2248

* @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)

2249

* @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2250

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2251

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2252

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2253

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2254

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2255

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2256

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2257

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2258

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2259

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2260

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2261

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2262

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2263

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2264

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2265

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2266

*/

2267

__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)

2268

#if defined(A_OFFSET)

2269

,

2270

IMAGE_DECLARATION(sum_col)

2271

#endif // defined(A_OFFSET)

2272

#if defined(B_OFFSET)

2273

,

2274

IMAGE_DECLARATION(sum_row)

2275

#endif // defined(B_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2276

#if defined(ADD_BIAS)

2277

,

2278

VECTOR_DECLARATION(biases)

2279

#endif // defined(ADD_BIAS))

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2280

)

2281

{

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2282

const int x = get_global_id(0) * 4;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

2283

const int y = get_global_id(1);

2284

const int z = get_global_id(2);

2285

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2286

// Compute offset contribution

2287

int4 offset_term_s32 = offset_contribution(

2288

x, y, z

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2289

#if defined(A_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

,

sum_col_ptr,

sum_col_stride_x,

sum_col_step_x,

sum_col_stride_y,

sum_col_step_y,

sum_col_offset_first_element_in_bytes

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2297

#endif // defined(A_OFFSET)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2298

#if defined(B_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

,

sum_row_ptr,

sum_row_stride_x,

sum_row_step_x,

sum_row_stride_y,

sum_row_step_y,

sum_row_offset_first_element_in_bytes

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2306

#endif // defined(B_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2307

#if defined(ADD_BIAS)

,

biases_ptr,

biases_stride_x,

biases_step_x,

biases_offset_first_element_in_bytes

2313

#endif // defined(ADD_BIAS)

2314

);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2315

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2316

__global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2317

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2318

int4 in_s32 = vload4(0, (__global int *)mm_result_addr);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2319

2320

// Add the offset terms to GEMM's result

2321

in_s32 += offset_term_s32;

2322

2323

// Store the result with the offset contribution

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2324

vstore4(in_s32, 0, (__global int *)mm_result_addr);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2325

}

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2326

2327

#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)

2328

/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.

2329

*

2330

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.

2331

*

2332

*

2333

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2334

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2335

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

2336

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

2337

*

2338

* The result before the output stage is:

2339

*

2340

* mm_result[i][k] = mm_result[i][k] +

2341

* (sum_col[k] * A_OFFSET) +

2342

* (sum_row[i] * B_OFFSET) +

2343

* (K_OFFSET)

2344

*

2345

* This result is quantized down to uint8 using the output stage. The output stage computes the following operations:

2346

*

2347

* -# Add offset terms to final result

2348

* -# Multiply each entry of result by result_mult_int

2349

* -# Add bias to final result (if -DADD_BIAS is passed at compile time)

2350

* -# Shift the int32 accumulator by result_shift

2351

* -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)

2352

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2353

*

2354

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT

2355

*

2356

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2357

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2358

* These values can be used to implement "rectified linear unit" activation functions

2359

*

2360

* @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32

2361

* @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)

2362

* @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)

2363

* @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)

2364

* @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)

2365

* @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)

2366

* @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)

2367

* @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor

2368

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2369

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2370

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2371

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2372

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2373

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2374

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2375

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2376

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2377

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2378

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2379

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2380

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2381

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2382

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2383

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

2384

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2385

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2386

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2387

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2388

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2389

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2390

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2391

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2392

*/

2393

__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)

2394

#if defined(A_OFFSET)

2395

,

2396

IMAGE_DECLARATION(sum_col)

2397

#endif // defined(A_OFFSET)

2398

#if defined(B_OFFSET)

2399

,

2400

IMAGE_DECLARATION(sum_row)

2401

#endif // defined(B_OFFSET)

2402

,

2403

#if defined(ADD_BIAS)

2404

VECTOR_DECLARATION(biases),

2405

#endif // defined(ADD_BIAS)

2406

TENSOR3D_DECLARATION(dst))

2407

{

2408

const int x = get_global_id(0) * 4;

2409

const int y = get_global_id(1);

2410

const int z = get_global_id(2);

2411

2412

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2413

2414

// Compute offset contribution

2415

int4 offset_term_s32 = offset_contribution(

2416

x, y, z

2417

#if defined(A_OFFSET)

,

sum_col_ptr,

sum_col_stride_x,

sum_col_step_x,

sum_col_stride_y,

sum_col_step_y,

sum_col_offset_first_element_in_bytes

2425

#endif // defined(A_OFFSET)

2426

#if defined(B_OFFSET)

,

sum_row_ptr,

sum_row_stride_x,

sum_row_step_x,

sum_row_stride_y,

sum_row_step_y,

sum_row_offset_first_element_in_bytes

2434

#endif // defined(B_OFFSET)

2435

#if defined(ADD_BIAS)

,

biases_ptr,

biases_stride_x,

biases_step_x,

biases_offset_first_element_in_bytes

2441

#endif // defined(ADD_BIAS)

2442

);

2443

2444

__global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

2445

2446

int4 in_s32 = vload4(0, (__global int *)mm_result_addr);

2447

2448

// Add the offset terms to GEMM's result

2449

in_s32 += offset_term_s32;

2450

2451

// -------------- OUTPUT STAGE

2452

2453

// Add the offset terms to GEMM's result

2454

in_s32 += (int4)RESULT_OFFSET;

2455

2456

// Multiply by result_mult_int and shift

2457

in_s32 *= RESULT_MULTIPLIER;

2458

2459

in_s32 >>= RESULT_SHIFT;

2460

2461

uchar4 res = convert_uchar4_sat(in_s32);

2462

2463

#if defined(MIN_BOUND)

2464

res = max(res, (uchar4)MIN_BOUND);

2465

#endif // defined(MIN_BOUND)

2466

#if defined(MAX_BOUND)

2467

res = min(res, (uchar4)MAX_BOUND);

2468

#endif // defined(MAX_BOUND)

2469

2470

// Store the result

2471

vstore4(res, 0, dst_addr);

2472

}

2473

2474

/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.

2475

*

2476

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.

2477

*

2478

*

2479

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2480

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2481

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

2482

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

2483

*

2484

* The result before the output stage is:

2485

*

2486

* mm_result[i][k] = mm_result[i][k] +

2487

* (sum_col[k] * A_OFFSET) +

2488

* (sum_row[i] * B_OFFSET) +

2489

* (K_OFFSET)

2490

*

2491

* This result is quantized down to uint8 using the output stage. The output stage computes the following operations:

2492

*

2493

* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier

2494

* -# Add bias to final result if bias tensor is not a nullptr

2495

* -# Round to nearest division by a power-of-two using result_shift

2496

* -# Add offset to each result

2497

* -# Clamp the value between the specified min and max bounds

2498

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2499

*

2500

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT

2501

*

2502

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2503

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2504

* These values can be used to implement "rectified linear unit" activation functions

2505

*

2506

* @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32

2507

* @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)

2508

* @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)

2509

* @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)

2510

* @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)

2511

* @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)

2512

* @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)

2513

* @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor

2514

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2515

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2516

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2517

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2518

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2519

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2520

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2521

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2522

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2523

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2524

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2525

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2526

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2527

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2528

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2529

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

2530

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2531

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2532

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2533

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2534

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2535

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2536

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2537

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2538

*/

2539

__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)

2540

#if defined(A_OFFSET)

2541

,

2542

IMAGE_DECLARATION(sum_col)

2543

#endif // defined(A_OFFSET)

2544

#if defined(B_OFFSET)

2545

,

2546

IMAGE_DECLARATION(sum_row)

2547

#endif // defined(B_OFFSET)

2548

,

2549

#if defined(ADD_BIAS)

2550

VECTOR_DECLARATION(biases),

2551

#endif // defined(ADD_BIAS)

2552

TENSOR3D_DECLARATION(dst))

2553

{

2554

const int x = get_global_id(0) * 4;

2555

const int y = get_global_id(1);

2556

const int z = get_global_id(2);

2557

2558

// Compute offset contribution

2559

int4 offset_term_s32 = offset_contribution(

2560

x, y, z

2561

#if defined(A_OFFSET)

,

sum_col_ptr,

sum_col_stride_x,

sum_col_step_x,

sum_col_stride_y,

sum_col_step_y,

sum_col_offset_first_element_in_bytes

2569

#endif // defined(A_OFFSET)

2570

#if defined(B_OFFSET)

,

sum_row_ptr,

sum_row_stride_x,

sum_row_step_x,

sum_row_stride_y,

sum_row_step_y,

sum_row_offset_first_element_in_bytes

2578

#endif // defined(B_OFFSET)

2579

#if defined(ADD_BIAS)

,

biases_ptr,

biases_stride_x,

biases_step_x,

biases_offset_first_element_in_bytes

2585

#endif // defined(ADD_BIAS)

2586

);

2587

2588

__global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

2589

2590

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2591

2592

int4 in_s32 = vload4(0, (__global int *)mm_result_addr);

2593

2594

// Add the offset terms to GEMM's result

2595

in_s32 += offset_term_s32;

2596

2597

// -------------- OUTPUT STAGE

2598

2599

// Multiply by result_mult_int and shift

2600

in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);

2601

2602

// Add the offset terms to GEMM's result

2603

in_s32 += (int4)RESULT_OFFSET;

2604

2605

uchar4 res = convert_uchar4_sat(in_s32);

2606

2607

#if defined(MIN_BOUND)

2608

res = max(res, (uchar4)MIN_BOUND);

2609

#endif // defined(MIN_BOUND)

2610

#if defined(MAX_BOUND)

2611

res = min(res, (uchar4)MAX_BOUND);

2612

#endif // defined(MAX_BOUND)

2613

2614

// Store the result

2615

vstore4(res, 0, dst_addr);

2616

}

2617

#endif // defined(K_OFFSET) && defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2618

#endif // defined(K_OFFSET)

2619

2620

#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)

2621

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

2622

*

2623

* This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.

2624

* The following computations will be performed by the kernel:

2625

*

2626

* -# Add offset terms to final result

2627

* -# Multiply each entry of result by result_mult_int

2628

* -# Add bias to final result (if -DADD_BIAS is passed at compile time)

2629

* -# Shift the int32 accumulator by result_shift

2630

* -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)

2631

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2632

*

2633

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT

2634

*

2635

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2636

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2637

* These values can be used to implement "rectified linear unit" activation functions

2638

*

2639

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

2640

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2641

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2642

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2643

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2644

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2645

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2646

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2647

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2648

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2649

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2650

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2651

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2652

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2653

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2654

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2655

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2656

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2657

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2658

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2659

*/

2660

__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),

2661

#if defined(ADD_BIAS)

2662

VECTOR_DECLARATION(biases),

2663

#endif // defined(ADD_BIAS)

2664

TENSOR3D_DECLARATION(dst))

2665

{

2666

// Compute source and destination addresses

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2667

int x = get_global_id(0) * 4;

2668

int y = get_global_id(1);

2669

int z = get_global_id(2);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2670

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2671

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2672

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2673

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2674

2675

int4 input_values = vload4(0, (__global int *)src_addr);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2676

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2677

#if defined(ADD_BIAS)

2678

// Add bias

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2679

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2680

2681

int4 biases_values = vload4(0, (__global int *)bias_addr);

2682

input_values += (int4)biases_values;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2683

#endif // defined(ADD_BIAS)

2684

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2685

// Add the offset terms to GEMM's result

2686

input_values += (int4)RESULT_OFFSET;

2687

Georgios Pinitas

45bcc3a

2017-11-29 11:06:49 +0000

[diff] [blame]

2688

// Multiply by result_mult_int and shift

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2689

input_values *= RESULT_MULT_INT;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2690

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2691

input_values >>= RESULT_SHIFT;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2692

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2693

uchar4 res = convert_uchar4_sat(input_values);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2694

2695

#if defined(MIN_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2696

res = max(res, (uchar4)MIN_BOUND);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2697

#endif // defined(MIN_BOUND)

2698

#if defined(MAX_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2699

res = min(res, (uchar4)MAX_BOUND);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2700

#endif // defined(MAX_BOUND)

2701

2702

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2703

vstore4(res, 0, dst_addr);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2704

}

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2705

#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)

2706

2707

#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)

2708

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

2709

*

2710

* This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.

2711

* The following computations will be performed by the kernel:

2712

*

2713

* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier

2714

* -# Add bias to final result if bias tensor is not a nullptr

2715

* -# Round to nearest division by a power-of-two using result_shift

2716

* -# Add offset to each result

2717

* -# Clamp the value between the specified min and max bounds

2718

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2719

*

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2720

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2721

*

2722

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2723

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2724

* These values can be used to implement "rectified linear unit" activation functions

2725

*

2726

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

2727

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2728

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2729

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2730

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2731

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2732

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2733

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2734

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2735

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2736

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2737

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2738

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2739

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2740

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2741

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2742

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2743

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2744

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2745

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2746

*/

2747

__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),

2748

#if defined(ADD_BIAS)

2749

VECTOR_DECLARATION(biases),

2750

#endif // defined(ADD_BIAS)

2751

TENSOR3D_DECLARATION(dst))

2752

{

2753

// Compute source and destination addresses

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2754

int x = get_global_id(0) * 4;

2755

int y = get_global_id(1);

2756

int z = get_global_id(2);

Georgios Pinitas

932491f

2018-09-21 16:33:15 +0100

[diff] [blame]

2757

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2758

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2759

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2760

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2761

2762

int4 input_values = vload4(0, (__global int *)src_addr);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2763

2764

#if defined(ADD_BIAS)

2765

// Add bias

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2766

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2767

2768

int4 biases_values = vload4(0, (__global int *)bias_addr);

2769

input_values += (int4)biases_values;

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2770

#endif // defined(ADD_BIAS)

2771

2772

// Multiply by result_mult_int and shift

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2773

input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2774

2775

// Add the offset terms to GEMM's result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2776

input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2777

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2778

uchar4 res = convert_uchar4_sat(input_values);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2779

2780

#if defined(MIN_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2781

res = max(res, (uchar4)MIN_BOUND);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2782

#endif // defined(MIN_BOUND)

2783

#if defined(MAX_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2784

res = min(res, (uchar4)MAX_BOUND);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2785

#endif // defined(MAX_BOUND)

2786

2787

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2788

vstore4(res, 0, dst_addr);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2789

}

Chunosov

5124be5

2017-11-22 20:42:13 +0700

[diff] [blame]

2790

#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2791

2792

#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)

2793

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

2794

*

2795

* This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.

2796

* The following computations will be performed by the kernel:

2797

*

2798

* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier

2799

* -# Add bias to final result if bias tensor is not a nullptr

2800

* -# Requantize

2801

* -# Add offset to each result

2802

* -# Clamp the value between the specified min and max bounds

2803

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2804

*

2805

* @attention The offset and scalar scale factor must be passed at compile time using -DRESULT_OFFSET, -DREAL_MULTIPLIER

2806

*

2807

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2808

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2809

* These values can be used to implement "rectified linear unit" activation functions

2810

*

2811

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

2812

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2813

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2814

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2815

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2816

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2817

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2818

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2819

* @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr

2820

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

2821

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

2822

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

2823

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2824

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2825

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2826

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2827

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2828

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2829

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2830

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2831

* @param[in] dst_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

2832

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2833

*/

2834

__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),

2835

#if defined(ADD_BIAS)

2836

VECTOR_DECLARATION(biases),

2837

#endif // defined(ADD_BIAS)

2838

#if defined(DST_HEIGHT)

2839

TENSOR4D_DECLARATION(dst))

2840

#else // defined(DST_HEIGHT)

2841

TENSOR3D_DECLARATION(dst))

2842

#endif // defined(DST_HEIGHT)

2843

{

2844

// Compute source and destination addresses

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2845

int x = get_global_id(0) * 4;

2846

int y = get_global_id(1);

2847

int z = get_global_id(2);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2848

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2849

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2850

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2851

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2852

2853

int4 input_values = vload4(0, (__global int *)src_addr);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2854

2855

#if defined(ADD_BIAS)

2856

// Add bias

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2857

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2858

2859

int4 biases_values = vload4(0, (__global int *)bias_addr);

2860

input_values += (int4)biases_values;

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2861

#endif // defined(ADD_BIAS)

2862

2863

// Convert to float

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2864

float16 input_values_f = convert_float4(input_values);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2865

input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);

2866

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2867

uchar4 res = convert_uchar4_sat(input_values_f);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2868

2869

#if defined(MIN_BOUND)

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2870

res = max(res, (uchar4)MIN_BOUND);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2871

#endif // defined(MIN_BOUND)

2872

#if defined(MAX_BOUND)

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2873

res = min(res, (uchar4)MAX_BOUND);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2874

#endif // defined(MAX_BOUND)

2875

2876

// Store the result

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2877

vstore4(res, 0, dst_addr);

Georgios Pinitas