/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "helpers_asymm.h"

#if defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
 *
 * @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024
 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;
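    // Note: offset_row_a selects which 4-row group this work item reads inside interleaved block y
    // (for example, with MULT_INTERLEAVE4X4_HEIGHT=2, two consecutive global ids in Y share the same
    // block); offset_row_b does the same for the 1xW-transposed blocks of matrix B along X.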

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B
    __global uchar *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    int4 c00 = 0;
    int4 c10 = 0;
    int4 c20 = 0;
    int4 c30 = 0;

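    // Main loop: each iteration consumes 8 values along the K dimension in two 4-element steps;
    // the uchar4 loads are widened to int4 before the multiply-accumulate so the products cannot
    // overflow. The second loop afterwards handles the leftover K values in steps of 4.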
    for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int4 a0 = convert_int4(vload4(0, src_addr_a));
        int4 b0 = convert_int4(vload4(0, src_addr_b));

        c00 += (int4)a0.s0 * b0;
        c10 += (int4)a0.s1 * b0;
        c20 += (int4)a0.s2 * b0;
        c30 += (int4)a0.s3 * b0;

        a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));
        b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));

        c00 += (int4)a0.s0 * b0;
        c10 += (int4)a0.s1 * b0;
        c20 += (int4)a0.s2 * b0;
        c30 += (int4)a0.s3 * b0;
    }

    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int4 a0 = convert_int4(vload4(0, src_addr_a));
        int4 b0 = convert_int4(vload4(0, src_addr_b));

        c00 += (int4)a0.s0 * b0;
        c10 += (int4)a0.s1 * b0;
        c20 += (int4)a0.s2 * b0;
        c30 += (int4)a0.s3 * b0;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x4 block
    vstore4(c00, 0, (__global int *)(offset(&dst, 0, 0)));
    vstore4(c10, 0, (__global int *)(offset(&dst, 0, 1)));
    vstore4(c20, 0, (__global int *)(offset(&dst, 0, 2)));
    vstore4(c30, 0, (__global int *)(offset(&dst, 0, 3)));
}

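// Illustrative note (example values only, chosen by the host at configure time): this kernel is built
// with options such as "-DCOLS_B=1024 -DMULT_INTERLEAVE4X4_HEIGHT=4 -DTRANSPOSE1XW_WIDTH_STEP=4",
// matching the factors used when reshaping with CLGEMMInterleave4x4Kernel and CLGEMMTranspose1xWKernel.
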
/** This OpenCL kernel is optimized for Bifrost and computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel before running the matrix multiplication
 *
 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
 * @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)
 * @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed_bifrost(IMAGE_DECLARATION(src0),
                                                         IMAGE_DECLARATION(src1),
                                                         IMAGE_DECLARATION(dst))
{
    int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;
    int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

    // Offset
    const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;
    const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

    // src_addr_a = address of matrix A
    // src_addr_b = address of matrix B
    __global uchar *src_addr_a = (__global uchar *)(src0_ptr + y * src0_stride_y + src0_offset_first_element_in_bytes);
    __global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

    // Compute end row address for matrix B
    __global uchar *src_end_addr_b = src_addr_b + COLS_B;

    src_addr_a += offset_row_a;
    src_addr_b += offset_row_b;

    // Reset accumulators
    uint c00 = 0;
    uint c01 = 0;
    uint c02 = 0;
    uint c03 = 0;
    uint c10 = 0;
    uint c11 = 0;
    uint c12 = 0;
    uint c13 = 0;
    uint c20 = 0;
    uint c21 = 0;
    uint c22 = 0;
    uint c23 = 0;
    uint c30 = 0;
    uint c31 = 0;
    uint c32 = 0;
    uint c33 = 0;

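    // The sixteen scalar uint accumulators hold one 4x4 block of the output; scalar (rather than
    // vector) accumulation tends to map well onto Bifrost's scalar arithmetic pipelines.
    // The block below is an unrolled fast path consuming 32 values along K per iteration; it is only
    // compiled when the interleave height multiplier is 1, i.e. when the A values are contiguous.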
#if MULT_INTERLEAVE4X4_HEIGHT == 1
    for(; src_addr_b <= (src_end_addr_b - (int)(32 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += (32 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (32 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar16 a0 = vload16(0, src_addr_a);
        uchar4  b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s4 * b0.s0;
        c01 += (ushort)a0.s4 * b0.s1;
        c02 += (ushort)a0.s4 * b0.s2;
        c03 += (ushort)a0.s4 * b0.s3;

        c10 += (ushort)a0.s5 * b0.s0;
        c11 += (ushort)a0.s5 * b0.s1;
        c12 += (ushort)a0.s5 * b0.s2;
        c13 += (ushort)a0.s5 * b0.s3;

        c20 += (ushort)a0.s6 * b0.s0;
        c21 += (ushort)a0.s6 * b0.s1;
        c22 += (ushort)a0.s6 * b0.s2;
        c23 += (ushort)a0.s6 * b0.s3;

        c30 += (ushort)a0.s7 * b0.s0;
        c31 += (ushort)a0.s7 * b0.s1;
        c32 += (ushort)a0.s7 * b0.s2;
        c33 += (ushort)a0.s7 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 8 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s8 * b0.s0;
        c01 += (ushort)a0.s8 * b0.s1;
        c02 += (ushort)a0.s8 * b0.s2;
        c03 += (ushort)a0.s8 * b0.s3;

        c10 += (ushort)a0.s9 * b0.s0;
        c11 += (ushort)a0.s9 * b0.s1;
        c12 += (ushort)a0.s9 * b0.s2;
        c13 += (ushort)a0.s9 * b0.s3;

        c20 += (ushort)a0.sA * b0.s0;
        c21 += (ushort)a0.sA * b0.s1;
        c22 += (ushort)a0.sA * b0.s2;
        c23 += (ushort)a0.sA * b0.s3;

        c30 += (ushort)a0.sB * b0.s0;
        c31 += (ushort)a0.sB * b0.s1;
        c32 += (ushort)a0.sB * b0.s2;
        c33 += (ushort)a0.sB * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 12 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.sC * b0.s0;
        c01 += (ushort)a0.sC * b0.s1;
        c02 += (ushort)a0.sC * b0.s2;
        c03 += (ushort)a0.sC * b0.s3;

        c10 += (ushort)a0.sD * b0.s0;
        c11 += (ushort)a0.sD * b0.s1;
        c12 += (ushort)a0.sD * b0.s2;
        c13 += (ushort)a0.sD * b0.s3;

        c20 += (ushort)a0.sE * b0.s0;
        c21 += (ushort)a0.sE * b0.s1;
        c22 += (ushort)a0.sE * b0.s2;
        c23 += (ushort)a0.sE * b0.s3;

        c30 += (ushort)a0.sF * b0.s0;
        c31 += (ushort)a0.sF * b0.s1;
        c32 += (ushort)a0.sF * b0.s2;
        c33 += (ushort)a0.sF * b0.s3;

        // Load values from matrix A (interleaved) and matrix B (transposed)
        a0 = vload16(0, src_addr_a + 16);
        b0 = vload4(0, src_addr_b + 16 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 20 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s4 * b0.s0;
        c01 += (ushort)a0.s4 * b0.s1;
        c02 += (ushort)a0.s4 * b0.s2;
        c03 += (ushort)a0.s4 * b0.s3;

        c10 += (ushort)a0.s5 * b0.s0;
        c11 += (ushort)a0.s5 * b0.s1;
        c12 += (ushort)a0.s5 * b0.s2;
        c13 += (ushort)a0.s5 * b0.s3;

        c20 += (ushort)a0.s6 * b0.s0;
        c21 += (ushort)a0.s6 * b0.s1;
        c22 += (ushort)a0.s6 * b0.s2;
        c23 += (ushort)a0.s6 * b0.s3;

        c30 += (ushort)a0.s7 * b0.s0;
        c31 += (ushort)a0.s7 * b0.s1;
        c32 += (ushort)a0.s7 * b0.s2;
        c33 += (ushort)a0.s7 * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 24 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.s8 * b0.s0;
        c01 += (ushort)a0.s8 * b0.s1;
        c02 += (ushort)a0.s8 * b0.s2;
        c03 += (ushort)a0.s8 * b0.s3;

        c10 += (ushort)a0.s9 * b0.s0;
        c11 += (ushort)a0.s9 * b0.s1;
        c12 += (ushort)a0.s9 * b0.s2;
        c13 += (ushort)a0.s9 * b0.s3;

        c20 += (ushort)a0.sA * b0.s0;
        c21 += (ushort)a0.sA * b0.s1;
        c22 += (ushort)a0.sA * b0.s2;
        c23 += (ushort)a0.sA * b0.s3;

        c30 += (ushort)a0.sB * b0.s0;
        c31 += (ushort)a0.sB * b0.s1;
        c32 += (ushort)a0.sB * b0.s2;
        c33 += (ushort)a0.sB * b0.s3;

        // Load values from matrix B (transposed)
        b0 = vload4(0, src_addr_b + 28 * TRANSPOSE1XW_WIDTH_STEP);

        c00 += (ushort)a0.sC * b0.s0;
        c01 += (ushort)a0.sC * b0.s1;
        c02 += (ushort)a0.sC * b0.s2;
        c03 += (ushort)a0.sC * b0.s3;

        c10 += (ushort)a0.sD * b0.s0;
        c11 += (ushort)a0.sD * b0.s1;
        c12 += (ushort)a0.sD * b0.s2;
        c13 += (ushort)a0.sD * b0.s3;

        c20 += (ushort)a0.sE * b0.s0;
        c21 += (ushort)a0.sE * b0.s1;
        c22 += (ushort)a0.sE * b0.s2;
        c23 += (ushort)a0.sE * b0.s3;

        c30 += (ushort)a0.sF * b0.s0;
        c31 += (ushort)a0.sF * b0.s1;
        c32 += (ushort)a0.sF * b0.s2;
        c33 += (ushort)a0.sF * b0.s3;
    }
#endif // MULT_INTERLEAVE4X4_HEIGHT == 1
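    // The loop below consumes 4 values along K per iteration; it handles whatever the unrolled path
    // above left over, and it is the only path compiled when MULT_INTERLEAVE4X4_HEIGHT != 1.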

    for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        uchar4 a0 = vload4(0, src_addr_a);
        uchar4 b0 = vload4(0, src_addr_b);

        c00 += (ushort)a0.s0 * b0.s0;
        c01 += (ushort)a0.s0 * b0.s1;
        c02 += (ushort)a0.s0 * b0.s2;
        c03 += (ushort)a0.s0 * b0.s3;

        c10 += (ushort)a0.s1 * b0.s0;
        c11 += (ushort)a0.s1 * b0.s1;
        c12 += (ushort)a0.s1 * b0.s2;
        c13 += (ushort)a0.s1 * b0.s3;

        c20 += (ushort)a0.s2 * b0.s0;
        c21 += (ushort)a0.s2 * b0.s1;
        c22 += (ushort)a0.s2 * b0.s2;
        c23 += (ushort)a0.s2 * b0.s3;

        c30 += (ushort)a0.s3 * b0.s0;
        c31 += (ushort)a0.s3 * b0.s1;
        c32 += (ushort)a0.s3 * b0.s2;
        c33 += (ushort)a0.s3 * b0.s3;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x4 block
    vstore4((int4)(c00, c01, c02, c03), 0, (__global int *)(offset(&dst, 0, 0)));
    vstore4((int4)(c10, c11, c12, c13), 0, (__global int *)(offset(&dst, 0, 1)));
    vstore4((int4)(c20, c21, c22, c23), 0, (__global int *)(offset(&dst, 0, 2)));
    vstore4((int4)(c30, c31, c32, c33), 0, (__global int *)(offset(&dst, 0, 3)));
}
#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)

#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
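// For example, with NUM_ELEMS_PROCESSED_PER_THREAD_X=4 the helpers above expand to uchar4, uint4 and
// int4 respectively, so the same kernel body works for any supported vector width.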
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

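    // src_addr packs two byte offsets: s0 walks along a row of matrix A (1 byte per QASYMM8 value)
    // and s1 walks down the rows of matrix B, so end_row_vec_a below marks one past the last byte of
    // the matrix A row processed by this work item.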
    int end_row_vec_a = src_addr.s0 + COLS_A;

    VECTOR_UINT acc0 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VECTOR_UINT acc1 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VECTOR_UINT acc2 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VECTOR_UINT acc3 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    VECTOR_UINT acc4 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

    for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
    {
        // Load values from matrix A
        uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
        VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
        acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
        acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
        acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
        acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
        acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}

/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

    int end_row_vec_a = src_addr.s0 + COLS_A;

    uint acc00 = 0;
    uint acc01 = 0;
    uint acc02 = 0;
    uint acc03 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    uint acc10 = 0;
    uint acc11 = 0;
    uint acc12 = 0;
    uint acc13 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    uint acc20 = 0;
    uint acc21 = 0;
    uint acc22 = 0;
    uint acc23 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    uint acc30 = 0;
    uint acc31 = 0;
    uint acc32 = 0;
    uint acc33 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    uint acc40 = 0;
    uint acc41 = 0;
    uint acc42 = 0;
    uint acc43 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
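    // Main loop below: 4 values along K per iteration. Each product is computed in ushort
    // (255 * 255 = 65025 still fits in 16 bits) and the four partial products are then widened and
    // summed into the 32-bit accumulators.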

    for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
    {
        // Load values from matrix A
        uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
        uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
        uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
        uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;

            acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;

            acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;

            acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;

            acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;

            acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);

        // Accumulate
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0;

            acc00 += ((uint)tmp0);
            acc01 += ((uint)tmp1);
            acc02 += ((uint)tmp2);
            acc03 += ((uint)tmp3);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1;

            acc10 += ((uint)tmp0);
            acc11 += ((uint)tmp1);
            acc12 += ((uint)tmp2);
            acc13 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2;

            acc20 += ((uint)tmp0);
            acc21 += ((uint)tmp1);
            acc22 += ((uint)tmp2);
            acc23 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3;

            acc30 += ((uint)tmp0);
            acc31 += ((uint)tmp1);
            acc32 += ((uint)tmp2);
            acc33 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4;

            acc40 += ((uint)tmp0);
            acc41 += ((uint)tmp1);
            acc42 += ((uint)tmp2);
            acc43 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)

#if defined(COLS_A)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
 *
 * @note This stage is needed to handle the offset of matrix product
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    uint4 sum_row_u32 = (uint4)0;
    uint  sum_row     = 0;
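    // The vectorized loop accumulates into the four lanes of sum_row_u32 and the scalar tail loop
    // into sum_row; the two are combined into a single scalar just before the store.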

    __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int i = 0;

    // This for loop performs 16 accumulations
    for(; i <= ((int)COLS_A - 16); i += 16)
    {
        const uchar16 a0_u8 = vload16(0, matrix_a + i);

        sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
    }

    // This for loop performs the leftover accumulations
    for(; i < COLS_A; ++i)
    {
        sum_row += matrix_a[i];
    }

    sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;

    *((__global int *)dst.ptr) = (int)sum_row;
}
#endif // defined(COLS_A)

#if defined(COLS_B) && defined(ROWS_B)
/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
 *
 * @note This stage is needed to handle the offset of matrix product
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_gx_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_gx_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    uint16 sum_col_u32 = (uint16)0;
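    // Each work item reduces 16 adjacent columns of matrix B at once, hence the uint16 accumulator
    // and the single vstore16 at the end of the kernel.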

    __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

    int i = 0;
    // This for loop performs 4 accumulations
    for(; i <= ((int)ROWS_B - 4); i += 4)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
        const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
        const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
        const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);

        sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);

        matrix_b += 4 * src_stride_y;
    }

    // This for loop performs the leftover accumulations
    for(; i < (int)ROWS_B; ++i)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b);

        sum_col_u32 += convert_uint16(b0_u8);

        matrix_b += src_stride_y;
    }

    vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
}
#endif // defined(COLS_B) && defined(ROWS_B)

#if defined(K_OFFSET)
/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
 * and adds to it the offset contribution of matrix A and matrix B in-place.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches
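 *
 * For example (illustrative values consistent with the notes above): with a_offset = 1, b_offset = 6
 * and k = 200 matrix A columns, the host would pass -DK_OFFSET=1200, since 1 * 6 * 200 = 1200.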
 *
 * The final result is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
 *
 * @param[in] mm_result_ptr                                Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x                           Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x                             mm_result_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] mm_result_stride_y                           Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y                             mm_result_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] mm_result_stride_z                           Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z                             mm_result_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes      The offset of the first element in the source tensor
 * @param[in] sum_col_result_ptr                           Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_result_step_x                        sum_col_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_col_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_result_step_y                        sum_col_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_col_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_row_result_ptr                           Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_result_step_x                        sum_row_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_row_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_result_step_y                        sum_row_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_row_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 */
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                          )
{
    Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);

    int4 a_offset_s32 = (int4)0;
    int4 b_offset_s32 = (int4)0;

#if defined(A_OFFSET)
    Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);

    // Compute the offset contribution due to A_OFFSET
#if defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
#else  // defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload4(0, (__global int *)(sum_col.ptr));
#endif // defined(SUM_COL_HAS_BATCHES)

    a_offset_s32 *= (int4)A_OFFSET;
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);

    // Compute the offset contribution due to B_OFFSET
    b_offset_s32 = (int4) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
    b_offset_s32 *= (int4)B_OFFSET;
#endif // defined(B_OFFSET)

    const int4 offset_term_s32 = (int4)K_OFFSET + a_offset_s32 + b_offset_s32;

    int4 in_s32 = vload4(0, (__global int *)mm_result.ptr);

    // Add the offset terms to GEMM's result
    in_s32 += offset_term_s32;

    // Store the result with the offset contribution
    vstore4(in_s32, 0, (__global int *)mm_result.ptr);
}
#endif // defined(K_OFFSET)

#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 *  -# Add offset terms to final result
 *  -# Multiply each entry of result by result_mult_int
 *  -# Add bias to final result (if -DADD_BIAS is passed at compile time)
 *  -# Shift the int32 accumulator by result_shift
 *  -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
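 *
 *       For example (illustrative values only): with RESULT_OFFSET=3, RESULT_MULT_INT=5 and
 *       RESULT_SHIFT=4, an accumulator value of 61 becomes ((61 + 3) * 5) >> 4 = 20 before the
 *       final saturating cast to the [0..255] range.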
1148 *
1149 * @param[in] src_ptr Pointer to the source tensor. Supported data type: S32
1150 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
1151 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
1152 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
1153 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
1154 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
1155 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1156 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
1157 * @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr
1158 * @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)
1159 * @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)
1160 * @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
1161 * @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8
1162 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
1163 * @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
1164 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
1165 * @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
1166 * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
1167 * @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
1168 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
1169 */
__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                  VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                  TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

    // Add the result offset to GEMM's result
    input_values += (int16)RESULT_OFFSET;

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_mult_int
    input_values *= RESULT_MULT_INT;

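    // Arithmetic shift right: rounds toward negative infinity, unlike the round-to-nearest shift used by the fixed-point variant below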
    input_values >>= RESULT_SHIFT;

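    // Saturating cast clamps the int32 values to [0..255] and converts to QASYMM8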
    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)

#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 * -# Add bias to final result (if -DADD_BIAS is passed at compile time)
 * -# Multiply each entry of the input by result_fixedpoint_multiplier (fixed-point multiplication)
 * -# Round to nearest division by a power-of-two using result_shift
 * -# Add result_offset_after_shift to each result
 * -# Saturate the int32 values to the [0..255] range and cast to QASYMM8
 * -# Clamp the resulting value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
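 *
 * For example (illustrative values only): with -DRESULT_FIXEDPOINT_MULTIPLIER=1073741824 (0.5 in Q0.31 fixed point),
 * -DRESULT_SHIFT=1 and -DRESULT_OFFSET_AFTER_SHIFT=10, an accumulator value of 100 becomes
 * round((100 * 0.5) / 2^1) + 10 = 35 before the saturating cast to [0..255].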
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 * These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                             VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                             TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_fixedpoint_multiplier and divide by 2^result_shift, rounding to nearest
    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);

    // Add the result offset (applied after the shift) to GEMM's result
    input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;

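    // Saturating cast clamps the int32 values to [0..255] and converts to QASYMM8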
    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)