/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"
#include "helpers_asymm.h"

#if defined(COLS_B)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1).
 * Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication.
 *
 * @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_interleaved_transposed(IMAGE_DECLARATION(src0),
                                                 IMAGE_DECLARATION(src1),
                                                 IMAGE_DECLARATION(dst))
{
    // src_addr.s0 = address of matrix A
    // src_addr.s1 = address of matrix B
    // Compute address for matrix A and B
    int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),
                                                                        (src1_stride_y));

    // Add offset_first_element_in_bytes
    src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Compute end row address for matrix B
    int end_row_mtx_b = src_addr.s1 + COLS_B;

    // Reset accumulators
    int16 c00 = 0;
    int16 c10 = 0;
    int16 c20 = 0;
    int16 c30 = 0;

    for(; src_addr.s1 <= (end_row_mtx_b - 32); src_addr += (int2)(8, 32))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int8  a0 = convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));
        int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

        c00 += (int16)a0.s0 * b0;
        c10 += (int16)a0.s1 * b0;
        c20 += (int16)a0.s2 * b0;
        c30 += (int16)a0.s3 * b0;

        int16 b1 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));

        c00 += (int16)a0.s4 * b1;
        c10 += (int16)a0.s5 * b1;
        c20 += (int16)a0.s6 * b1;
        c30 += (int16)a0.s7 * b1;
    }

    for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))
    {
        // Load values from matrix A (interleaved) and matrix B (transposed)
        int4  a0 = convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));
        int16 b0 = convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

        c00 += (int16)a0.s0 * b0;
        c10 += (int16)a0.s1 * b0;
        c20 += (int16)a0.s2 * b0;
        c30 += (int16)a0.s3 * b0;
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store 4x16 block
    vstore16(c00, 0, (__global int *)(offset(&dst, 0, 0)));
    vstore16(c10, 0, (__global int *)(offset(&dst, 0, 1)));
    vstore16(c20, 0, (__global int *)(offset(&dst, 0, 2)));
    vstore16(c30, 0, (__global int *)(offset(&dst, 0, 3)));
}
#endif // defined(COLS_B)
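
/* Usage sketch (hypothetical host-side values): the kernel above expects A reshaped by
 * gemm_interleave4x4_8bit and B reshaped by gemm_transpose1x16. In effect, the interleaved A
 * buffer stores each group of four rows column by column (a[r][k], a[r+1][k], a[r+2][k],
 * a[r+3][k], a[r][k+1], ...) and the transposed B buffer stores 16 consecutive columns of a row
 * contiguously, which is why each main-loop iteration consumes 8 bytes of A and 32 bytes of B to
 * update a 4x16 int32 block. A possible build call, assuming 256 columns in B:
 *
 *   clBuildProgram(program, 1, &device, "-DCOLS_B=256", NULL, NULL);
 */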

#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)
#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)
/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) when neither matrix has been reshaped.
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

    int end_row_vec_a = src_addr.s0 + COLS_A;

    VECTOR_UINT acc0 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VECTOR_UINT acc1 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VECTOR_UINT acc2 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VECTOR_UINT acc3 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    VECTOR_UINT acc4 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

    for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))
    {
        // Load values from matrix A
        uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);
        VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;
        acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;
        acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;
        acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;
        acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;
        acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

        // Accumulate
        acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc0, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc1, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc2, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc3, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)
    (CONVERT(acc4, VECTOR_INT), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
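
/* Worked sketch (hypothetical values): with -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 and
 * -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4, each work item of the kernel above accumulates a
 * 4x4 int32 tile: the main loop consumes two columns of A and two rows of B per iteration,
 * and the second loop handles the single leftover column when COLS_A is odd. A possible
 * build-options string for this configuration:
 *
 *   "-DCOLS_A=64 -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4 -DNUM_ELEMS_PROCESSED_PER_THREAD_Y=4"
 */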

/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) when neither matrix has been reshaped.
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src0_ptr                           Pointer to the source matrix. Supported data type: QASYMM8
 * @param[in]  src0_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src0_step_x                        src0_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src0_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src0_step_y                        src0_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src0_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[in]  src1_ptr                           Pointer to the source matrix. Supported data type: same as @p src0_ptr
 * @param[in]  src1_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src1_step_x                        src1_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src1_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src1_step_y                        src1_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src1_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: S32
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 */
__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),
                                  IMAGE_DECLARATION(src1),
                                  IMAGE_DECLARATION(dst))
{
    int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

    // Compute starting address for matrix A and matrix B
    int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

    // Update address for the matrix A
    src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

    // Update address for the matrix B
    src_addr.s1 += idx;

    int end_row_vec_a = src_addr.s0 + COLS_A;

    uint acc00 = 0;
    uint acc01 = 0;
    uint acc02 = 0;
    uint acc03 = 0;
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    uint acc10 = 0;
    uint acc11 = 0;
    uint acc12 = 0;
    uint acc13 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    uint acc20 = 0;
    uint acc21 = 0;
    uint acc22 = 0;
    uint acc23 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    uint acc30 = 0;
    uint acc31 = 0;
    uint acc32 = 0;
    uint acc33 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    uint acc40 = 0;
    uint acc41 = 0;
    uint acc42 = 0;
    uint acc43 = 0;
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

    for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))
    {
        // Load values from matrix A
        uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);
        uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);
        uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);
        uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;

            acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;

            acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;

            acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;

            acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;

            ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;
            ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;
            ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;
            ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;

            ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;
            ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;
            ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;
            ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;

            ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;
            ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;
            ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;
            ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;

            acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);
            acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);
            acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);
            acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))
    {
        // Load values from matrix A
        uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        // Load values from matrix B
        uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);

        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a0;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a0;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a0;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a0;

            acc00 += ((uint)tmp0);
            acc01 += ((uint)tmp1);
            acc02 += ((uint)tmp2);
            acc03 += ((uint)tmp3);
        }
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a1;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a1;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a1;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a1;

            acc10 += ((uint)tmp0);
            acc11 += ((uint)tmp1);
            acc12 += ((uint)tmp2);
            acc13 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a2;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a2;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a2;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a2;

            acc20 += ((uint)tmp0);
            acc21 += ((uint)tmp1);
            acc22 += ((uint)tmp2);
            acc23 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a3;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a3;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a3;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a3;

            acc30 += ((uint)tmp0);
            acc31 += ((uint)tmp1);
            acc32 += ((uint)tmp2);
            acc33 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
        {
            // Accumulate
            ushort tmp0 = (ushort)b0.s0 * (ushort)a4;
            ushort tmp1 = (ushort)b0.s1 * (ushort)a4;
            ushort tmp2 = (ushort)b0.s2 * (ushort)a4;
            ushort tmp3 = (ushort)b0.s3 * (ushort)a4;

            acc40 += ((uint)tmp0);
            acc41 += ((uint)tmp1);
            acc42 += ((uint)tmp2);
            acc43 += ((uint)tmp3);
        }
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    }

    // Compute destination address
    Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

    // Store the result
    vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(offset(&dst, 0, 0)));
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
    vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(offset(&dst, 0, 1)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
    vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(offset(&dst, 0, 2)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
    vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(offset(&dst, 0, 3)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3
#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
    vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(offset(&dst, 0, 4)));
#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4
}
#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)
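
/* Why the 16-bit intermediate products in the kernel above are safe: the product of two 8-bit
 * values is at most 255 * 255 = 65025, which fits in a ushort (maximum 65535), so every tmp*
 * multiplication is exact and the widening to 32 bits can be deferred to the accumulation.
 * For example, a = 200 and b = 250 give tmp = 50000, still a valid ushort, and the four
 * products added into each accumulator sum to at most 4 * 65025 = 260100, hence the uint adds.
 */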

#if defined(COLS_A)
/** OpenCL kernel used to compute the sum of all the entries in each row of matrix A.
 *
 * @note This stage is needed to handle the offset of the matrix product; see
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    uint4 sum_row_u32 = (uint4)0;
    uint  sum_row     = 0;

    __global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

    int i = 0;

    // This for loop performs 16 accumulations per iteration
    for(; i <= ((int)COLS_A - 16); i += 16)
    {
        const uchar16 a0_u8 = vload16(0, matrix_a + i);

        sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);
    }

    // This for loop performs the leftover accumulations
    for(; i < COLS_A; ++i)
    {
        sum_row += matrix_a[i];
    }

    sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;

    *((__global int *)dst.ptr) = (int)sum_row;
}
#endif // defined(COLS_A)
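
/* Worked sketch (hypothetical value): with -DCOLS_A=20, the vectorized loop above runs once,
 * accumulating columns 0-15 into the four uint4 lanes of sum_row_u32, and the scalar loop adds
 * the remaining 4 columns; the four lanes are then folded into the final per-row sum consumed
 * by the offset-contribution stage.
 */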

#if defined(COLS_B) && defined(ROWS_B)
/** OpenCL kernel used to compute the sum of all the entries in each column of matrix B.
 *
 * @note This stage is needed to handle the offset of the matrix product; see
 *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
 *
 * @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data type: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data type: S32
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),
                                          IMAGE_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Image    dst = CONVERT_TO_IMAGE_STRUCT(dst);

    uint16 sum_col_u32 = (uint16)0;

    __global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

    int i = 0;
    // This for loop performs 4 accumulations per iteration
    for(; i <= ((int)ROWS_B - 4); i += 4)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);
        const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);
        const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);
        const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);

        sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);

        matrix_b += 4 * src_stride_y;
    }

    // This for loop performs the leftover accumulations
    for(; i < (int)ROWS_B; ++i)
    {
        const uchar16 b0_u8 = vload16(0, matrix_b);

        sum_col_u32 += convert_uint16(b0_u8);

        matrix_b += src_stride_y;
    }

    vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);
}
#endif // defined(COLS_B) && defined(ROWS_B)
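
/* Worked sketch (hypothetical value): each work item reduces a strip of 16 consecutive columns
 * of B. With -DROWS_B=10, the main loop above runs twice (rows 0-7, four rows per iteration)
 * and the leftover loop adds rows 8 and 9 before the 16 column sums are stored as int32.
 */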

#if defined(K_OFFSET)
/** OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place.
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel),
 * and adds to it the offset contribution of matrix A and matrix B in-place.
 *
 * @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (e.g. -DK_OFFSET=1200)
 * @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (e.g. -DA_OFFSET=1)
 * @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (e.g. -DB_OFFSET=6)
 * @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually, if gemmlowp is used to accelerate a convolution layer, sum_col will not have batches
 *
 * The final result is:
 *
 * mm_result[i][k] = mm_result[i][k] +
 *                   (sum_col[k] * A_OFFSET) +
 *                   (sum_row[i] * B_OFFSET) +
 *                   (K_OFFSET)
 *
 * @param[in] mm_result_ptr                           Pointer to the source tensor. Supported data type: S32
 * @param[in] mm_result_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in] mm_result_step_x                        mm_result_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] mm_result_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in] mm_result_step_y                        mm_result_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] mm_result_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in] mm_result_step_z                        mm_result_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[in] sum_col_ptr                             Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_col_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_col_step_x                          sum_col_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_col_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_col_step_y                          sum_col_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_col_offset_first_element_in_bytes   The offset of the first element in the source tensor
 * @param[in] sum_row_ptr                             Pointer to the source tensor. Supported data type: same as @p mm_result_ptr
 * @param[in] sum_row_stride_x                        Stride of the source tensor in X dimension (in bytes)
 * @param[in] sum_row_step_x                          sum_row_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in] sum_row_stride_y                        Stride of the source tensor in Y dimension (in bytes)
 * @param[in] sum_row_step_y                          sum_row_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in] sum_row_offset_first_element_in_bytes   The offset of the first element in the source tensor
 */
__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)
#if defined(A_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_col)
#endif // defined(A_OFFSET)
#if defined(B_OFFSET)
                                           ,
                                           IMAGE_DECLARATION(sum_row)
#endif // defined(B_OFFSET)
                                          )
{
    Tensor3D mm_result = CONVERT_TO_TENSOR3D_STRUCT(mm_result);

    int16 a_offset_s32 = (int16)0;
    int16 b_offset_s32 = (int16)0;

#if defined(A_OFFSET)
    Image sum_col = CONVERT_TO_IMAGE_STRUCT(sum_col);

    // Compute the offset contribution due to A_OFFSET
#if defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload16(0, (__global int *)(sum_col.ptr + get_global_id(2) * sum_col_stride_y));
#else  // defined(SUM_COL_HAS_BATCHES)
    a_offset_s32 = vload16(0, (__global int *)(sum_col.ptr));
#endif // defined(SUM_COL_HAS_BATCHES)

    a_offset_s32 *= (int16)A_OFFSET;
#endif // defined(A_OFFSET)

#if defined(B_OFFSET)
    Image sum_row = CONVERT_TO_IMAGE_STRUCT(sum_row);

    // Compute the offset contribution due to B_OFFSET
    b_offset_s32 = (int16) * (((__global int *)(sum_row.ptr + get_global_id(2) * sum_row_stride_y)) + get_global_id(1));
    b_offset_s32 *= (int16)B_OFFSET;
#endif // defined(B_OFFSET)

    const int16 offset_term_s32 = (int16)K_OFFSET + a_offset_s32 + b_offset_s32;

    int16 in_s32 = vload16(0, (__global int *)mm_result.ptr);

    // Add the offset terms to GEMM's result
    in_s32 += offset_term_s32;

    // Store the result with the offset contribution
    vstore16(in_s32, 0, (__global int *)mm_result.ptr);
}
#endif // defined(K_OFFSET)
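
/* Worked example of the formula above (hypothetical values): take k = 2, A_OFFSET = 10,
 * B_OFFSET = 20, one row of A = [1, 2] and one column of B = [3, 4]. Then
 * mm_result = 1*3 + 2*4 = 11, sum_col = 3 + 4 = 7, sum_row = 1 + 2 = 3 and
 * K_OFFSET = 10 * 20 * 2 = 400, so the corrected value is
 * 11 + (7 * 10) + (3 * 20) + 400 = 541, which matches the direct computation
 * (1 + 10) * (3 + 20) + (2 + 10) * (4 + 20) = 253 + 288 = 541.
 */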

#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 * -# Add offset terms to the final result
 * -# Add bias to the final result (if -DADD_BIAS is passed at compile time)
 * -# Multiply each entry of the result by result_mult_int
 * -# Shift the int32 accumulator by result_shift
 * -# Clamp the resulting int32 values to the [0..255] range and cast them to QASYMM8
 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 *
 * @attention The offset, scalar scale factor and number of bits to shift right of the output tensor must be passed at compile time using -DRESULT_OFFSET, -DRESULT_MULT_INT and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                  VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                  TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

    // Add the offset terms to GEMM's result
    input_values += (int16)RESULT_OFFSET;

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_mult_int and shift
    input_values *= RESULT_MULT_INT;

    input_values >>= RESULT_SHIFT;

    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)
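
/* Worked example (hypothetical values): with -DRESULT_OFFSET=-1000, -DRESULT_MULT_INT=2 and
 * -DRESULT_SHIFT=8, an accumulator value of 25000 (and no bias) becomes
 * ((25000 - 1000) * 2) >> 8 = 48000 >> 8 = 187, which lies in [0, 255] and is stored as the
 * QASYMM8 value 187; values outside that range are saturated by convert_uchar16_sat and, if
 * -DMIN_BOUND/-DMAX_BOUND are defined, clamped further.
 */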

#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8
 *
 * This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.
 * The following computations will be performed by the kernel:
 *
 * -# Add bias to the final result if the bias tensor is not a nullptr
 * -# Compute the fixed-point multiplication of each entry of the input by result_fixedpoint_multiplier
 * -# Round to the nearest division by a power-of-two using result_shift
 * -# Add offset to each result
 * -# Clamp the resulting int32 values to the [0..255] range and cast them to QASYMM8
 * -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)
 *
 * @attention The offset after shift, fixed-point multiplier and number of bits to shift right of the output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT
 *
 * @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time
 * @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.
 *       These values can be used to implement "rectified linear unit" activation functions
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data type: S32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[in]  biases_ptr                           Pointer to the biases tensor. Supported data type: same as @p src_ptr
 * @param[in]  biases_stride_x                      Stride of the biases tensor in X dimension (in bytes)
 * @param[in]  biases_step_x                        biases_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  biases_offset_first_element_in_bytes The offset of the first element in the biases tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Supported data type: QASYMM8
 * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_stride_z                         Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                           dst_stride_z * number of elements along Z processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 */
__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),
#if defined(ADD_BIAS)
                                                             VECTOR_DECLARATION(biases),
#endif // defined(ADD_BIAS)
                                                             TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
#if defined(ADD_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
#endif // defined(ADD_BIAS)

    int16 input_values = vload16(0, (__global int *)src.ptr);

#if defined(ADD_BIAS)
    // Add bias
    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
    input_values += (int16)biases_values;
#endif // defined(ADD_BIAS)

    // Multiply by result_fixedpoint_multiplier and shift
    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 16);

    // Add the offset terms to GEMM's result
    input_values += (int16)RESULT_OFFSET_AFTER_SHIFT;

    uchar16 res = convert_uchar16_sat(input_values);

#if defined(MIN_BOUND)
    res = max(res, (uchar16)MIN_BOUND);
#endif // defined(MIN_BOUND)
#if defined(MAX_BOUND)
    res = min(res, (uchar16)MAX_BOUND);
#endif // defined(MAX_BOUND)

    // Store the result
    vstore16(res, 0, dst.ptr);
}
#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)
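
/* Sketch of the fixed-point rescale above, assuming the gemmlowp convention: the requantization
 * scale is expressed as M = M0 * 2^(-RESULT_SHIFT) with M0 in [0.5, 1) stored as a Q31 integer,
 * and ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, M0, shift, 16) roughly computes
 * round(x * M0 / 2^31) followed by a rounding right shift. For example, x = 1000 with
 * M0 = 2^30 (i.e. 0.5 in Q31) and shift = 1 gives round(1000 * 0.5) = 500, then a rounding
 * shift by 1 yields 250, i.e. x scaled by 0.25.
 */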