/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "helpers_asymm.h"

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
#if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val = arm_dot_acc((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3), val);
#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
#define ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) val += arm_dot((uchar4)(x0, x1, x2, x3), (uchar4)(y0, y1, y2, y3));
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED)
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)

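// ARM_DOT(x0, x1, x2, x3, y0, y1, y2, y3, val) accumulates the 4-element dot
// product x0*y0 + x1*y1 + x2*y2 + x3*y3 into val, mapping to the arm_dot /
// arm_dot_acc built-ins of the cl_arm_integer_dot_product extensions when the
// flags above are set at compile time. A minimal portable equivalent (a sketch
// for illustration only, not part of this kernel set) would be:
//
// #define ARM_DOT_PORTABLE(x0, x1, x2, x3, y0, y1, y2, y3, val) \
//     val += (uint)(x0) * (y0) + (uint)(x1) * (y1) + (uint)(x2) * (y2) + (uint)(x3) * (y3);
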
#if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication.
 *
 * @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024
 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (e.g. -DTRANSPOSE1XW_WIDTH_STEP=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B
    __global uchar *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    int4 c00 = 0;
    int4 c10 = 0;
    int4 c20 = 0;
    int4 c30 = 0;

    for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int4 a0 = convert_int4(vload4(0, src_addr_a));
        int4 b0 = convert_int4(vload4(0, src_addr_b));

        c00 += (int4)a0.s0 * b0;
        c10 += (int4)a0.s1 * b0;
        c20 += (int4)a0.s2 * b0;
        c30 += (int4)a0.s3 * b0;

        a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
        b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));

        c00 += (int4)a0.s0 * b0;
        c10 += (int4)a0.s1 * b0;
        c20 += (int4)a0.s2 * b0;
        c30 += (int4)a0.s3 * b0;
    }

    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int4 a0 = convert_int4(vload4(0, src_addr_a));
        int4 b0 = convert_int4(vload4(0, src_addr_b));

        c00 += (int4)a0.s0 * b0;
        c10 += (int4)a0.s1 * b0;
        c20 += (int4)a0.s2 * b0;
        c30 += (int4)a0.s3 * b0;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x4 block
    vstore4(c00, 0, (__global int *)(offset(&dst, 0, 0)));
    vstore4(c10, 0, (__global int *)(offset(&dst, 0, 1)));
    vstore4(c20, 0, (__global int *)(offset(&dst, 0, 2)));
    vstore4(c30, 0, (__global int *)(offset(&dst, 0, 3)));
}
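
// Work-item mapping for the interleaved/transposed kernels above and below (a
// hedged note, derived from the index arithmetic rather than from host code):
// work-item (gx, gy) reads interleaved-A row gy / MULT_INTERLEAVE4X4_HEIGHT and
// transposed-B row gx / TRANSPOSE1XW_WIDTH_STEP, and writes the 4x4 destination
// block whose top-left element is at row 4 * gy, column 4 * gx; an M x N output
// therefore implies a global size of about (N / 4, M / 4), padded by the host.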

/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication.
 *
 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (e.g. -DTRANSPOSE1XW_WIDTH_STEP=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B
    __global uchar *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    uint c00 = 0;
    uint c01 = 0;
    uint c02 = 0;
    uint c03 = 0;
    uint c10 = 0;
    uint c11 = 0;
    uint c12 = 0;
    uint c13 = 0;
    uint c20 = 0;
    uint c21 = 0;
    uint c22 = 0;
    uint c23 = 0;
    uint c30 = 0;
    uint c31 = 0;
    uint c32 = 0;
    uint c33 = 0;

#if MULT_INTERLEAVE4X4_HEIGHT == 1
    for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar16 a0 = vload16(0, src_addr_a);
        uchar4  b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s4 * b0.s0;
        c01 += (ushort)a0.s4 * b0.s1;
        c02 += (ushort)a0.s4 * b0.s2;
        c03 += (ushort)a0.s4 * b0.s3;

        c10 += (ushort)a0.s5 * b0.s0;
        c11 += (ushort)a0.s5 * b0.s1;
        c12 += (ushort)a0.s5 * b0.s2;
        c13 += (ushort)a0.s5 * b0.s3;

        c20 += (ushort)a0.s6 * b0.s0;
        c21 += (ushort)a0.s6 * b0.s1;
        c22 += (ushort)a0.s6 * b0.s2;
        c23 += (ushort)a0.s6 * b0.s3;

        c30 += (ushort)a0.s7 * b0.s0;
        c31 += (ushort)a0.s7 * b0.s1;
        c32 += (ushort)a0.s7 * b0.s2;
        c33 += (ushort)a0.s7 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s8 * b0.s0;
        c01 += (ushort)a0.s8 * b0.s1;
        c02 += (ushort)a0.s8 * b0.s2;
        c03 += (ushort)a0.s8 * b0.s3;

        c10 += (ushort)a0.s9 * b0.s0;
        c11 += (ushort)a0.s9 * b0.s1;
        c12 += (ushort)a0.s9 * b0.s2;
        c13 += (ushort)a0.s9 * b0.s3;

        c20 += (ushort)a0.sA * b0.s0;
        c21 += (ushort)a0.sA * b0.s1;
        c22 += (ushort)a0.sA * b0.s2;
        c23 += (ushort)a0.sA * b0.s3;

        c30 += (ushort)a0.sB * b0.s0;
        c31 += (ushort)a0.sB * b0.s1;
        c32 += (ushort)a0.sB * b0.s2;
        c33 += (ushort)a0.sB * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.sC * b0.s0;
        c01 += (ushort)a0.sC * b0.s1;
        c02 += (ushort)a0.sC * b0.s2;
        c03 += (ushort)a0.sC * b0.s3;

        c10 += (ushort)a0.sD * b0.s0;
        c11 += (ushort)a0.sD * b0.s1;
        c12 += (ushort)a0.sD * b0.s2;
        c13 += (ushort)a0.sD * b0.s3;

        c20 += (ushort)a0.sE * b0.s0;
        c21 += (ushort)a0.sE * b0.s1;
        c22 += (ushort)a0.sE * b0.s2;
        c23 += (ushort)a0.sE * b0.s3;

        c30 += (ushort)a0.sF * b0.s0;
        c31 += (ushort)a0.sF * b0.s1;
        c32 += (ushort)a0.sF * b0.s2;
        c33 += (ushort)a0.sF * b0.s3;

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload16(0, src_addr_a + 16);
        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s4 * b0.s0;
        c01 += (ushort)a0.s4 * b0.s1;
        c02 += (ushort)a0.s4 * b0.s2;
        c03 += (ushort)a0.s4 * b0.s3;

        c10 += (ushort)a0.s5 * b0.s0;
        c11 += (ushort)a0.s5 * b0.s1;
        c12 += (ushort)a0.s5 * b0.s2;
        c13 += (ushort)a0.s5 * b0.s3;

        c20 += (ushort)a0.s6 * b0.s0;
        c21 += (ushort)a0.s6 * b0.s1;
        c22 += (ushort)a0.s6 * b0.s2;
        c23 += (ushort)a0.s6 * b0.s3;

        c30 += (ushort)a0.s7 * b0.s0;
        c31 += (ushort)a0.s7 * b0.s1;
        c32 += (ushort)a0.s7 * b0.s2;
        c33 += (ushort)a0.s7 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s8 * b0.s0;
        c01 += (ushort)a0.s8 * b0.s1;
        c02 += (ushort)a0.s8 * b0.s2;
        c03 += (ushort)a0.s8 * b0.s3;

        c10 += (ushort)a0.s9 * b0.s0;
        c11 += (ushort)a0.s9 * b0.s1;
        c12 += (ushort)a0.s9 * b0.s2;
        c13 += (ushort)a0.s9 * b0.s3;

        c20 += (ushort)a0.sA * b0.s0;
        c21 += (ushort)a0.sA * b0.s1;
        c22 += (ushort)a0.sA * b0.s2;
        c23 += (ushort)a0.sA * b0.s3;

        c30 += (ushort)a0.sB * b0.s0;
        c31 += (ushort)a0.sB * b0.s1;
        c32 += (ushort)a0.sB * b0.s2;
        c33 += (ushort)a0.sB * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.sC * b0.s0;
        c01 += (ushort)a0.sC * b0.s1;
        c02 += (ushort)a0.sC * b0.s2;
        c03 += (ushort)a0.sC * b0.s3;

        c10 += (ushort)a0.sD * b0.s0;
        c11 += (ushort)a0.sD * b0.s1;
        c12 += (ushort)a0.sD * b0.s2;
        c13 += (ushort)a0.sD * b0.s3;

        c20 += (ushort)a0.sE * b0.s0;
        c21 += (ushort)a0.sE * b0.s1;
        c22 += (ushort)a0.sE * b0.s2;
        c23 += (ushort)a0.sE * b0.s3;

        c30 += (ushort)a0.sF * b0.s0;
        c31 += (ushort)a0.sF * b0.s1;
        c32 += (ushort)a0.sF * b0.s2;
        c33 += (ushort)a0.sF * b0.s3;
    }
#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar4 a0 = vload4(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x4 block
    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
}
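
// Precision note on the kernel above: each (ushort)a * b product of two 8-bit
// values is at most 255 * 255 = 65025, so it fits in 16 bits, and the 32-bit
// uint accumulators can absorb up to UINT_MAX / 65025 (about 66000) such
// products, comfortably more than any COLS_B this kernel is compiled for.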

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication.
 *
 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (e.g. -DTRANSPOSE1XW_WIDTH_STEP=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (e.g. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed_bifrost_dot8(IMAGE_DECLARATION(src0),
                                                              IMAGE_DECLARATION(src1),
                                                              IMAGE_DECLARATION(dst))
{
    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B
    __global uchar *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    uint c00 = 0;
    uint c01 = 0;
    uint c02 = 0;
    uint c03 = 0;
    uint c10 = 0;
    uint c11 = 0;
    uint c12 = 0;
    uint c13 = 0;
    uint c20 = 0;
    uint c21 = 0;
    uint c22 = 0;
    uint c23 = 0;
    uint c30 = 0;
    uint c31 = 0;
    uint c32 = 0;
    uint c33 = 0;

#if MULT_INTERLEAVE4X4_HEIGHT == 1
    for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar16 a0 = vload16(0, src_addr_a);
        uchar4  b0 = vload4(0, src_addr_b);
        uchar4  b1 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);
        uchar4  b2 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);
        uchar4  b3 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

        // Accumulate
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s0, b1.s0, b2.s0, b3.s0, c00);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s1, b1.s1, b2.s1, b3.s1, c01);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s2, b1.s2, b2.s2, b3.s2, c02);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s3, b1.s3, b2.s3, b3.s3, c03);

        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s0, b1.s0, b2.s0, b3.s0, c10);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s1, b1.s1, b2.s1, b3.s1, c11);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s2, b1.s2, b2.s2, b3.s2, c12);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s3, b1.s3, b2.s3, b3.s3, c13);

        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s0, b1.s0, b2.s0, b3.s0, c20);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s1, b1.s1, b2.s1, b3.s1, c21);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s2, b1.s2, b2.s2, b3.s2, c22);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s3, b1.s3, b2.s3, b3.s3, c23);

        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s0, b1.s0, b2.s0, b3.s0, c30);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s1, b1.s1, b2.s1, b3.s1, c31);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s2, b1.s2, b2.s2, b3.s2, c32);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s3, b1.s3, b2.s3, b3.s3, c33);

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload16(0, src_addr_a + 16);
        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);
        b1 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);
        b2 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);
        b3 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

        // Accumulate
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s0, b1.s0, b2.s0, b3.s0, c00);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s1, b1.s1, b2.s1, b3.s1, c01);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s2, b1.s2, b2.s2, b3.s2, c02);
        ARM_DOT(a0.s0, a0.s4, a0.s8, a0.sC, b0.s3, b1.s3, b2.s3, b3.s3, c03);

        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s0, b1.s0, b2.s0, b3.s0, c10);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s1, b1.s1, b2.s1, b3.s1, c11);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s2, b1.s2, b2.s2, b3.s2, c12);
        ARM_DOT(a0.s1, a0.s5, a0.s9, a0.sD, b0.s3, b1.s3, b2.s3, b3.s3, c13);

        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s0, b1.s0, b2.s0, b3.s0, c20);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s1, b1.s1, b2.s1, b3.s1, c21);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s2, b1.s2, b2.s2, b3.s2, c22);
        ARM_DOT(a0.s2, a0.s6, a0.sA, a0.sE, b0.s3, b1.s3, b2.s3, b3.s3, c23);

        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s0, b1.s0, b2.s0, b3.s0, c30);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s1, b1.s1, b2.s1, b3.s1, c31);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s2, b1.s2, b2.s2, b3.s2, c32);
        ARM_DOT(a0.s3, a0.s7, a0.sB, a0.sF, b0.s3, b1.s3, b2.s3, b3.s3, c33);
    }
#endif // MULT_INTERLEAVE4X4_HEIGHT == 1

    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar4 a0 = vload4(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x4 block
    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
}
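
// How the ARM_DOT calls above are laid out: a0 packs four consecutive k-steps of
// the four interleaved A rows (row 0 in .s0/.s4/.s8/.sC, row 1 in .s1/.s5/.s9/.sD,
// and so on), while b0..b3 hold the four matching transposed rows of B. Each
// ARM_DOT therefore folds four k iterations of one accumulator into a single dot
// product, e.g. c00 += a0.s0 * b0.s0 + a0.s4 * b1.s0 + a0.s8 * b2.s0 + a0.sC * b3.s0.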
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)

#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)

#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
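// For example, compiling with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 expands the
// three macros above to uchar4, uint4 and int4, so each work-item of the kernels
// below computes four consecutive output columns per processed row.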
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

    int end_row_vec_a = src_addr.s0 + COLS_A;

    VECTOR_UINT acc0 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VECTOR_UINT acc1 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VECTOR_UINT acc2 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VECTOR_UINT acc3 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    VECTOR_UINT acc4 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

    for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
    {
        // Load values from matrix A
        uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
        VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
        acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
        acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
        acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
        acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
        acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
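
// In matrix terms, work-item (gx, gy) above computes, for each of its rows
// i < NUM_ELEMS_PROCESSED_PER_THREAD_Y and columns j < NUM_ELEMS_PROCESSED_PER_THREAD_X,
//   acc_i[j] = sum_k A(gy * NUM_ELEMS_PROCESSED_PER_THREAD_Y + i, k) * B(k, gx * NUM_ELEMS_PROCESSED_PER_THREAD_X + j)
// with k running over COLS_A, i.e. the plain un-reshaped GEMM inner product widened to 32 bits.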

/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and Matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

    int end_row_vec_a = src_addr.s0 + COLS_A;

    uint acc00 = 0;
    uint acc01 = 0;
    uint acc02 = 0;
    uint acc03 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    uint acc10 = 0;
    uint acc11 = 0;
    uint acc12 = 0;
    uint acc13 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    uint acc20 = 0;
    uint acc21 = 0;
    uint acc22 = 0;
    uint acc23 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    uint acc30 = 0;
    uint acc31 = 0;
    uint acc32 = 0;
    uint acc33 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    uint acc40 = 0;
    uint acc41 = 0;
    uint acc42 = 0;
    uint acc43 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

    for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
    {
        // Load values from matrix A
        uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
        uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
        uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
        uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;

            acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;

            acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;

            acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;

            acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;

            acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0;

            acc00 += ((uint)tmp0);
            acc01 += ((uint)tmp1);
            acc02 += ((uint)tmp2);
            acc03 += ((uint)tmp3);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1;

            acc10 += ((uint)tmp0);
            acc11 += ((uint)tmp1);
            acc12 += ((uint)tmp2);
            acc13 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2;

            acc20 += ((uint)tmp0);
            acc21 += ((uint)tmp1);
            acc22 += ((uint)tmp2);
            acc23 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3;

            acc30 += ((uint)tmp0);
            acc31 += ((uint)tmp1);
            acc32 += ((uint)tmp2);
            acc33 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4;

            acc40 += ((uint)tmp0);
            acc41 += ((uint)tmp1);
            acc42 += ((uint)tmp2);
            acc43 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
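
// Design note (an inference, not stated in the original sources): the explicit
// ushort temporaries above keep every 8-bit x 8-bit product in 16 bits before a
// single widening add, which gives the Bifrost compiler the chance to issue
// narrow multiplies; arithmetically each accumulator still receives the same
// 4-term dot product computed by ARM_DOT in the dot8 variant below, e.g.
// acc00 += a0.s0 * b0.s0 + a0.s1 * b1.s0 + a0.s2 * b2.s0 + a0.s3 * b3.s0.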

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
/** OpenCL kernel optimized to use the dot product instruction that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped.
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
1121__kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),
1122 IMAGE_DECLARATION(src1),
1123 IMAGE_DECLARATION(dst))
1124{
1125 int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;
1126
1127 // Compute starting address for matrix A and Matrix B
1128 int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));
1129
1130 // Update address for the matrix A
1131 src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;
1132
1133 // Update address for the matrix B
1134 src_addr.s1 += idx;
1135
1136 int end_row_vec_a = src_addr.s0 + COLS_A;
1137
1138 uint acc00 = 0;
1139 uint acc01 = 0;
1140 uint acc02 = 0;
1141 uint acc03 = 0;
1142#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1143 uint acc10 = 0;
1144 uint acc11 = 0;
1145 uint acc12 = 0;
1146 uint acc13 = 0;
1147#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
1148#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1149 uint acc20 = 0;
1150 uint acc21 = 0;
1151 uint acc22 = 0;
1152 uint acc23 = 0;
1153#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
1154#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1155 uint acc30 = 0;
1156 uint acc31 = 0;
1157 uint acc32 = 0;
1158 uint acc33 = 0;
1159#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
1160#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1161 uint acc40 = 0;
1162 uint acc41 = 0;
1163 uint acc42 = 0;
1164 uint acc43 = 0;
1165#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
1166
    for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
    {
        // Load values from matrix A
        uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
        uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
        uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
        uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

        {
            // Accumulate
            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a0.s0, a0.s1, a0.s2, a0.s3, acc00);
            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a0.s0, a0.s1, a0.s2, a0.s3, acc01);
            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a0.s0, a0.s1, a0.s2, a0.s3, acc02);
            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a0.s0, a0.s1, a0.s2, a0.s3, acc03);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a1.s0, a1.s1, a1.s2, a1.s3, acc10);
            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a1.s0, a1.s1, a1.s2, a1.s3, acc11);
            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a1.s0, a1.s1, a1.s2, a1.s3, acc12);
            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a1.s0, a1.s1, a1.s2, a1.s3, acc13);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a2.s0, a2.s1, a2.s2, a2.s3, acc20);
            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a2.s0, a2.s1, a2.s2, a2.s3, acc21);
            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a2.s0, a2.s1, a2.s2, a2.s3, acc22);
            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a2.s0, a2.s1, a2.s2, a2.s3, acc23);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a3.s0, a3.s1, a3.s2, a3.s3, acc30);
            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a3.s0, a3.s1, a3.s2, a3.s3, acc31);
            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a3.s0, a3.s1, a3.s2, a3.s3, acc32);
            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a3.s0, a3.s1, a3.s2, a3.s3, acc33);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ARM_DOT(b0.s0, b1.s0, b2.s0, b3.s0, a4.s0, a4.s1, a4.s2, a4.s3, acc40);
            ARM_DOT(b0.s1, b1.s1, b2.s1, b3.s1, a4.s0, a4.s1, a4.s2, a4.s3, acc41);
            ARM_DOT(b0.s2, b1.s2, b2.s2, b3.s2, a4.s0, a4.s1, a4.s2, a4.s3, acc42);
            ARM_DOT(b0.s3, b1.s3, b2.s3, b3.s3, a4.s0, a4.s1, a4.s2, a4.s3, acc43);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

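    // Leftover loop: process the remaining COLS_A % 4 columns one at a time,
    // widening the 8-bit operands before accumulating into the 32-bit accumulators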
    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0;

            acc00 += ((uint)tmp0);
            acc01 += ((uint)tmp1);
            acc02 += ((uint)tmp2);
            acc03 += ((uint)tmp3);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1;

            acc10 += ((uint)tmp0);
            acc11 += ((uint)tmp1);
            acc12 += ((uint)tmp2);
            acc13 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2;

            acc20 += ((uint)tmp0);
            acc21 += ((uint)tmp1);
            acc22 += ((uint)tmp2);
            acc23 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3;

            acc30 += ((uint)tmp0);
            acc31 += ((uint)tmp1);
            acc32 += ((uint)tmp2);
            acc33 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4;

            acc40 += ((uint)tmp0);
            acc41 += ((uint)tmp1);
            acc42 += ((uint)tmp2);
            acc43 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED)
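
/* Illustration only: a scalar sketch of the update that a single ARM_DOT call in
 * gemmlowp_mm_bifrost_dot8 performs on one accumulator. The b_col vector gathers one
 * column of matrix B across the four loaded rows, and a_row holds four consecutive
 * values from one row of matrix A. GEMMLOWP_REFERENCE_SKETCHES is a hypothetical
 * define used here so the sketch is not compiled by default.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES)
inline uint dot8_acc_sketch(uchar4 b_col, uchar4 a_row, uint acc)
{
    // acc += dot(b_col, a_row), with each 8-bit product widened to 32 bits
    acc += (uint)b_col.s0 * (uint)a_row.s0;
    acc += (uint)b_col.s1 * (uint)a_row.s1;
    acc += (uint)b_col.s2 * (uint)a_row.s2;
    acc += (uint)b_col.s3 * (uint)a_row.s3;
    return acc;
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES)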

#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)

#if defined(COLS_A)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
 *
 * @note This stage is needed to handle the offset of matrix product
 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: S32
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    uint4 sum_row_u32 = (uint4)0;
    uint sum_row = 0;

    __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int i = 0;

    // This for loop performs 16 accumulations per iteration
    for(; i <= ((int)COLS_A - 16); i += 16)
    {
        const uchar16 a0_u8 = vload16(0, matrix_a + i);

        sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
    }

    // This for loop performs the leftover accumulations
    for(; i < COLS_A; ++i)
    {
        sum_row += matrix_a[i];
    }

    sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;

    *((__global int *)dst.ptr) = (int)sum_row;
}
#endif // defined(COLS_A)
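
/* Illustration only: the scalar row-sum each work-item of gemmlowp_matrix_a_reduction
 * computes, without the 16-wide vectorization. GEMMLOWP_REFERENCE_SKETCHES is a
 * hypothetical define; the sketch is not compiled by default.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES)
inline int row_sum_sketch(__global const uchar *row, int cols)
{
    uint sum_row = 0;
    for(int x = 0; x < cols; ++x)
    {
        sum_row += row[x]; // each uchar is widened to uint before accumulation
    }
    return (int)sum_row;
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES)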

#if defined(COLS_B) && defined(ROWS_B)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
 *
 * @note This stage is needed to handle the offset of matrix product
 * https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: S32
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    uint16 sum_col_u32 = (uint16)0;

    __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

    int i = 0;
    // This for loop performs 4 accumulations per iteration
    for(; i <= ((int)ROWS_B - 4); i += 4)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
        const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
        const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
        const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);

        sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);

        matrix_b += 4 * src_stride_y;
    }

    // This for loop performs the leftover accumulations
    for(; i < (int)ROWS_B; ++i)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b);

        sum_col_u32 += convert_uint16(b0_u8);

        matrix_b += src_stride_y;
    }

    vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
}
#endif // defined(COLS_B) && defined(ROWS_B)
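
/* Illustration only: the scalar sum computed for a single column of matrix B, i.e.
 * one of the 16 lanes a work-item of gemmlowp_matrix_b_reduction covers; the kernel
 * walks rows with src_stride_y exactly as sketched here. GEMMLOWP_REFERENCE_SKETCHES
 * is a hypothetical define; the sketch is not compiled by default.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES)
inline int col_sum_sketch(__global const uchar *col_base, int rows, int stride_y)
{
    uint sum_col = 0;
    for(int y = 0; y < rows; ++y)
    {
        sum_col += col_base[y * stride_y]; // one uchar per row of matrix B
    }
    return (int)sum_col;
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES)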

#if defined(K_OFFSET)
/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
 * and adds to it the offset contribution of matrix A and matrix B in-place.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually, if gemmlowp is used to accelerate the convolution layer, sum_col will not have batches
 *
 * The final result is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
 *
 * @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_col_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_step_x sum_col_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_col_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_step_y sum_col_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_col_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_row_ptr Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_step_x sum_row_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] sum_row_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_step_y sum_row_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] sum_row_offset_first_element_in_bytes The offset of the first element in the source tensor
 */
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                          )
{
    Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);

    int4 a_offset_s32 = (int4)0;
    int4 b_offset_s32 = (int4)0;

#if defined(A_OFFSET)
    Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);

    // Compute the offset contribution due to A_OFFSET
#if defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
#else  // defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr));
#endif // defined(SUM_COL_HAS_BATCHES)

    a_offset_s32 *= (int4)A_OFFSET;
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);

    // Compute the offset contribution due to B_OFFSET
    b_offset_s32 = (int4) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
    b_offset_s32 *= (int4)B_OFFSET;
#endif // defined(B_OFFSET)

    const int4 offset_term_s32 = (int4)K_OFFSET + a_offset_s32 + b_offset_s32;

    int4 in_s32 = vload4(0, (__global int *)mm_result.ptr);

    // Add the offset terms to GEMM's result
    in_s32 += offset_term_s32;

    // Store the result with the offset contribution
    vstore4(in_s32, 0, (__global int *)mm_result.ptr);
}
#endif // defined(K_OFFSET)
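
/* Illustration only: scalar form of the documented in-place update
 * mm_result[i][k] += sum_col[k] * A_OFFSET + sum_row[i] * B_OFFSET + K_OFFSET.
 * Guarded by the hypothetical GEMMLOWP_REFERENCE_SKETCHES define and compiled only
 * when all three offset defines are present.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES) && defined(K_OFFSET) && defined(A_OFFSET) && defined(B_OFFSET)
inline int offset_contribution_sketch(int mm_result, int sum_col_k, int sum_row_i)
{
    mm_result += sum_col_k * A_OFFSET; // contribution due to a_offset
    mm_result += sum_row_i * B_OFFSET; // contribution due to b_offset
    mm_result += K_OFFSET;             // constant term a_offset * b_offset * k
    return mm_result;
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES) && defined(K_OFFSET) && defined(A_OFFSET) && defined(B_OFFSET)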

#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 * -# Add offset terms to final result
 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
 * -# Multiply each entry of result by result_mult_int
 * -# Shift the int32 accumulator by result_shift
 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 * These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                  VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                  TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

    // Add the offset terms to GEMM's result
    input_values += (int16)RESULT_OFFSET;

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_mult_int and shift
    input_values *= RESULT_MULT_INT;

    input_values >>= RESULT_SHIFT;

    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
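
/* Illustration only: scalar form of the quantize-down stage above (offset, optional
 * bias, integer scale, arithmetic shift, saturating cast to 8 bits); pass bias = 0
 * when -DADD_BIAS is not used. Guarded by the hypothetical GEMMLOWP_REFERENCE_SKETCHES
 * define and compiled only when the stage's defines are present.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES) && defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
inline uchar quantize_down_sketch(int acc, int bias)
{
    int val = acc + RESULT_OFFSET + bias; // add offset (and bias, when used)
    val *= RESULT_MULT_INT;               // scale by the integer multiplier
    val >>= RESULT_SHIFT;                 // arithmetic shift right
    return convert_uchar_sat(val);        // saturate to the [0..255] QASYMM8 range
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES) && defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)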

#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 * -# Add bias to final result if bias tensor is not a nullptr
 * -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier
 * -# Round to nearest division by a power-of-two using result_shift
 * -# Add offset to each result
 * -# Clamp the value between the specified min and max bounds
 * -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 * These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                             VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                             TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_fixedpoint_multiplier and shift
    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);

    // Add the offset terms to GEMM's result
    input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;

    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
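
/* Illustration only: a scalar sketch of the fixed-point requantization used above,
 * following the gemmlowp definition of a saturating doubling high multiply followed
 * by a rounding arithmetic shift right; the INT_MIN * INT_MIN saturation corner case
 * is omitted for brevity. Guarded by the hypothetical GEMMLOWP_REFERENCE_SKETCHES define.
 */
#if defined(GEMMLOWP_REFERENCE_SKETCHES)
inline int requantize_fixedpoint_sketch(int x, int multiplier, int shift)
{
    // Doubling high multiply, rounding the discarded low 31 bits to nearest
    long ab    = (long)x * (long)multiplier;
    long nudge = (ab >= 0) ? (1L << 30) : (1L - (1L << 30));
    int high   = (int)((ab + nudge) / (1L << 31));

    // Rounding arithmetic shift right by 'shift'
    int mask      = (1 << shift) - 1;
    int remainder = high & mask;
    int threshold = (mask >> 1) + ((high < 0) ? 1 : 0);
    return (high >> shift) + ((remainder > threshold) ? 1 : 0);
}
#endif // defined(GEMMLOWP_REFERENCE_SKETCHES)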