Blame - src/core/CL/cl_kernels/gemmlowp.cl - ml/ComputeLibrary

2017-11-21 10:57:50 +0000

[diff] [blame]

199

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

200

* @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024

201

* @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)

202

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

203

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

204

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

205

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

206

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

207

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

208

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

209

*

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

210

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

211

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

212

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

213

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

214

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

215

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

216

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

217

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

218

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

219

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

220

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

221

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

222

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

223

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

224

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

225

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

226

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

227

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

228

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

229

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

230

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

231

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

232

*/

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

233

__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),

234

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

235

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_OUTPUT_AS_3D)

240

,

241

uint cross_plane_pad

242

#endif // REINTERPRET_OUTPUT_AS_3D

243

)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

244

{

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

245

const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;

246

const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

247

const int z = get_global_id(2);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

248

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

249

// Offset

250

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

251

const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

252

253

// src_addr_a = address of matrix A

254

// src_addr_b = address of matrix B

Isabella Gottardi

b92805b

2018-09-28 18:24:27 +0100

[diff] [blame]

255

__global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

256

__global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

257

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

258

#if defined(MATRIX_B_DEPTH)

259

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

260

src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;

261

#else // defined(MATRIX_B_DEPTH)

262

src_addr_b += z * src1_stride_z;

263

#endif // defined(MATRIX_B_DEPTH)

264

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

265

// Compute end row address for matrix B

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

266

__global uchar *src_end_addr_b = src_addr_b + COLS_B;

267

268

src_addr_a += offset_row_a;

269

src_addr_b += offset_row_b;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

270

271

// Reset accumulators

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

int4 c00 = 0;

int4 c10 = 0;

int4 c20 = 0;

int4 c30 = 0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

276

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

277

for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

278

{

279

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

280

int4 a0 = convert_int4(vload4(0, src_addr_a));

281

int4 b0 = convert_int4(vload4(0, src_addr_b));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

282

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

283

c00 += (int4)a0.s0 * b0;

284

c10 += (int4)a0.s1 * b0;

285

c20 += (int4)a0.s2 * b0;

286

c30 += (int4)a0.s3 * b0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

287

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

288

a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));

289

b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

290

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

291

c00 += (int4)a0.s0 * b0;

292

c10 += (int4)a0.s1 * b0;

293

c20 += (int4)a0.s2 * b0;

294

c30 += (int4)a0.s3 * b0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

295

}

296

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

297

for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

298

{

299

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

300

int4 a0 = convert_int4(vload4(0, src_addr_a));

301

int4 b0 = convert_int4(vload4(0, src_addr_b));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

302

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

303

c00 += (int4)a0.s0 * b0;

304

c10 += (int4)a0.s1 * b0;

305

c20 += (int4)a0.s2 * b0;

306

c30 += (int4)a0.s3 * b0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

307

}

308

309

// Compute destination address

310

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

311

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

312

#if defined(REINTERPRET_OUTPUT_AS_3D)

313

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

314

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

320

// |******************|

321

// | cross_plane_pad |

322

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

327

328

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

329

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

330

zout = min(DEPTH_GEMM3D - 1, zout);

331

332

// Add offset due to the cross plane paddings

333

zout *= (cross_plane_pad * dst_stride_y);

334

335

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

336

// multiply dst_stride_z by DEPTH_GEMM3D

337

dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;

338

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

339

// Store 4x4 block

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

340

vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

341

vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

342

vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

343

vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

344

345

#else // defined(REINTERPRET_OUTPUT_AS_3D)

346

// Add offset for batched GEMM

347

dst.ptr += z * dst_stride_z;

348

349

// Store 4x4 block

350

vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

351

vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

352

vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

353

vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

354

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

355

}

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

356

#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

357

358

#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)

359

#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)

360

#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)

361

#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)

362

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

363

*

364

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

365

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

366

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

367

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

368

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

369

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

370

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

371

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

372

*

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

373

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

374

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

375

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

376

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

377

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

378

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

379

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

380

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

381

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

382

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

383

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

384

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

385

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

386

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

387

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

388

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

389

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

390

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

391

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

392

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

393

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

394

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

395

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

396

*/

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

397

__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),

398

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

399

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

404

,

405

uint src_cross_plane_pad

406

#endif // REINTERPRET_INPUT_AS_3D

407

#if defined(REINTERPRET_OUTPUT_AS_3D)

408

,

409

uint dst_cross_plane_pad

410

#endif // REINTERPRET_OUTPUT_AS_3D

411

)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

412

{

413

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

414

415

// Compute starting address for matrix A and Matrix B

416

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

417

418

// Update address for the matrix A

419

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

420

421

// Update address for the matrix B

422

src_addr.s1 += idx;

423

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

424

#if defined(REINTERPRET_INPUT_AS_3D)

425

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

426

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

432

// |******************|

433

// | cross_plane_pad |

434

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

439

440

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

441

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

442

zin = min(DEPTH_GEMM3D - 1, zin);

443

444

// Add offset due to the cross plane paddings

445

zin *= (src_cross_plane_pad * src0_stride_y);

446

447

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

448

// multiply src0_stride_z by DEPTH_GEMM3D

449

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

450

451

#else // defined(REINTERPRET_INPUT_AS_3D)

452

453

// Add offset for batched GEMM

454

src_addr.s0 += get_global_id(2) * src0_stride_z;

455

456

#endif // defined(REINTERPRET_INPUT_AS_3D)

457

458

#if defined(MATRIX_B_DEPTH)

459

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

460

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

461

#else // defined(MATRIX_B_DEPTH)

462

src_addr.s1 += get_global_id(2) * src1_stride_z;

463

#endif // defined(MATRIX_B_DEPTH)

464

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

465

int end_row_vec_a = src_addr.s0 + COLS_A;

466

467

VECTOR_UINT acc0 = 0;

468

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

469

VECTOR_UINT acc1 = 0;

470

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

471

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

472

VECTOR_UINT acc2 = 0;

473

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

474

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

475

VECTOR_UINT acc3 = 0;

476

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

477

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

478

VECTOR_UINT acc4 = 0;

479

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

480

481

for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))

482

{

483

// Load values from matrix A

484

uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);

485

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

486

uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);

487

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

488

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

489

uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);

490

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

491

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

492

uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);

493

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

494

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

495

uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);

496

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

497

// Load values from matrix B

498

VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

499

VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);

500

501

// Accumulate

502

acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;

503

acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;

504

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

505

acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;

506

acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;

507

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

508

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

509

acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;

510

acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;

511

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

512

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

513

acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;

514

acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;

515

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

516

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

517

acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;

518

acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;

519

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

520

}

521

522

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))

523

{

524

// Load values from matrix A

525

uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);

526

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

527

uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);

528

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

529

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

530

uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);

531

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

532

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

533

uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);

534

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

535

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

536

uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);

537

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

538

// Load values from matrix B

539

VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

540

541

// Accumulate

542

acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;

543

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

544

acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;

545

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

546

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

547

acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;

548

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

549

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

550

acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;

551

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

552

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

553

acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;

554

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

555

}

556

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

557

const int z = get_global_id(2);

558

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

559

// Compute destination address

560

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

561

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

562

#if defined(REINTERPRET_OUTPUT_AS_3D)

563

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

564

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

570

// |******************|

571

// | cross_plane_pad |

572

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

577

578

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

579

uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;

580

zout = min(DEPTH_GEMM3D - 1, zout);

581

582

// Add offset due to the cross plane paddings

583

zout *= (dst_cross_plane_pad * dst_stride_y);

584

585

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

586

// multiply dst_stride_z by DEPTH_GEMM3D

587

dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;

588

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

589

// Store the result

590

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

591

(CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

592

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

593

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

594

(CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

595

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

596

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

597

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

598

(CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

599

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

600

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

601

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

602

(CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

603

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

604

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

605

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

606

(CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

607

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

608

609

#else // defined(REINTERPRET_OUTPUT_AS_3D)

610

// Add offset for batched GEMM

611

dst.ptr += z * dst_stride_z;

612

613

// Store the result

614

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

615

(CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

616

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

617

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

618

(CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

619

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

620

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

621

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

622

(CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

623

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

624

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

625

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

626

(CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

627

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

628

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

629

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

630

(CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));

631

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

632

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

633

}

634

635

/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

636

*

637

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

638

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

639

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

640

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

641

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

642

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

643

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

644

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

645

*

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

646

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

647

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

648

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

649

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

650

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

651

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

652

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

653

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

654

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

655

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

656

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

657

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

658

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

659

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

660

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

661

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

662

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

663

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

664

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

665

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

666

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

667

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

668

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

669

*/

670

__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),

671

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

672

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

677

,

678

uint src_cross_plane_pad

679

#endif // REINTERPRET_INPUT_AS_3D

680

#if defined(REINTERPRET_OUTPUT_AS_3D)

681

,

682

uint dst_cross_plane_pad

683

#endif // REINTERPRET_OUTPUT_AS_3D

684

)

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

685

{

686

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

687

688

// Compute starting address for matrix A and Matrix B

689

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

690

691

// Update address for the matrix A

692

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

693

694

// Update address for the matrix B

695

src_addr.s1 += idx;

696

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

697

#if defined(REINTERPRET_INPUT_AS_3D)

698

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

699

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

705

// |******************|

706

// | cross_plane_pad |

707

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

712

713

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

714

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

715

zin = min(DEPTH_GEMM3D - 1, zin);

716

717

// Add offset due to the cross plane paddings

718

zin *= (src_cross_plane_pad * src0_stride_y);

719

720

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

721

// multiply src0_stride_z by DEPTH_GEMM3D

722

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

723

724

#else // defined(REINTERPRET_INPUT_AS_3D)

725

726

// Add offset for batched GEMM

727

src_addr.s0 += get_global_id(2) * src0_stride_z;

728

729

#endif // defined(REINTERPRET_INPUT_AS_3D)

730

731

#if defined(MATRIX_B_DEPTH)

732

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

733

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

734

#else // defined(MATRIX_B_DEPTH)

735

src_addr.s1 += get_global_id(2) * src1_stride_z;

736

#endif // defined(MATRIX_B_DEPTH)

737

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

738

int end_row_vec_a = src_addr.s0 + COLS_A;

uint acc00 = 0;

uint acc01 = 0;

uint acc02 = 0;

uint acc03 = 0;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

uint acc10 = 0;

uint acc11 = 0;

uint acc12 = 0;

uint acc13 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

750

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

uint acc20 = 0;

uint acc21 = 0;

uint acc22 = 0;

uint acc23 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

756

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

uint acc30 = 0;

uint acc31 = 0;

uint acc32 = 0;

uint acc33 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

762

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

uint acc40 = 0;

uint acc41 = 0;

uint acc42 = 0;

uint acc43 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

768

769

for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))

770

{

771

// Load values from matrix A

772

uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);

773

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

774

uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);

775

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

776

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

777

uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);

778

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

779

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

780

uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);

781

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

782

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

783

uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);

784

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

785

// Load values from matrix B

786

uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);

787

uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);

788

uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);

789

uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

{

// Accumulate

ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;

794

ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;

795

ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;

796

ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;

797

798

ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;

799

ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;

800

ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;

801

ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;

802

803

ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;

804

ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;

805

ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;

806

ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;

807

808

ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;

809

ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;

810

ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;

811

ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;

812

813

acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

814

acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

815

acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

816

acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

817

}

818

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

819

{

820

// Accumulate

821

ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;

822

ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;

823

ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;

824

ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;

825

826

ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;

827

ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;

828

ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;

829

ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;

830

831

ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;

832

ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;

833

ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;

834

ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;

835

836

ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;

837

ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;

838

ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;

839

ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;

840

841

acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

842

acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

843

acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

844

acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

845

}

846

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

847

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

848

{

849

// Accumulate

850

ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;

851

ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;

852

ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;

853

ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;

854

855

ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;

856

ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;

857

ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;

858

ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;

859

860

ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;

861

ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;

862

ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;

863

ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;

864

865

ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;

866

ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;

867

ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;

868

ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;

869

870

acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

871

acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

872

acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

873

acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

874

}

875

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

876

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

877

{

878

// Accumulate

879

ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;

880

ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;

881

ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;

882

ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;

883

884

ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;

885

ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;

886

ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;

887

ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;

888

889

ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;

890

ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;

891

ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;

892

ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;

893

894

ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;

895

ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;

896

ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;

897

ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;

898

899

acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

900

acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

901

acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

902

acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

903

}

904

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

905

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

906

{

907

// Accumulate

908

ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;

909

ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;

910

ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;

911

ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;

912

913

ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;

914

ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;

915

ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;

916

ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;

917

918

ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;

919

ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;

920

ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;

921

ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;

922

923

ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;

924

ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;

925

ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;

926

ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;

927

928

acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

929

acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

930

acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

931

acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

932

}

933

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

934

}

935

936

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))

937

{

938

// Load values from matrix A

939

uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);

940

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

941

uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);

942

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

943

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

944

uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);

945

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

946

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

947

uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);

948

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

949

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

950

uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);

951

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

952

// Load values from matrix B

953

uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);

// Accumulate

{

// Accumulate

ushort tmp0 = (ushort)b0.s0 * (ushort)a0;

959

ushort tmp1 = (ushort)b0.s1 * (ushort)a0;

960

ushort tmp2 = (ushort)b0.s2 * (ushort)a0;

961

ushort tmp3 = (ushort)b0.s3 * (ushort)a0;

962

963

acc00 += ((uint)tmp0);

964

acc01 += ((uint)tmp1);

965

acc02 += ((uint)tmp2);

966

acc03 += ((uint)tmp3);

967

}

968

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

969

{

970

// Accumulate

971

ushort tmp0 = (ushort)b0.s0 * (ushort)a1;

972

ushort tmp1 = (ushort)b0.s1 * (ushort)a1;

973

ushort tmp2 = (ushort)b0.s2 * (ushort)a1;

974

ushort tmp3 = (ushort)b0.s3 * (ushort)a1;

975

976

acc10 += ((uint)tmp0);

977

acc11 += ((uint)tmp1);

978

acc12 += ((uint)tmp2);

979

acc13 += ((uint)tmp3);

980

}

981

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

982

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

983

{

984

// Accumulate

985

ushort tmp0 = (ushort)b0.s0 * (ushort)a2;

986

ushort tmp1 = (ushort)b0.s1 * (ushort)a2;

987

ushort tmp2 = (ushort)b0.s2 * (ushort)a2;

988

ushort tmp3 = (ushort)b0.s3 * (ushort)a2;

989

990

acc20 += ((uint)tmp0);

991

acc21 += ((uint)tmp1);

992

acc22 += ((uint)tmp2);

993

acc23 += ((uint)tmp3);

994

}

995

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

996

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

997

{

998

// Accumulate

999

ushort tmp0 = (ushort)b0.s0 * (ushort)a3;

1000

ushort tmp1 = (ushort)b0.s1 * (ushort)a3;

1001

ushort tmp2 = (ushort)b0.s2 * (ushort)a3;

1002

ushort tmp3 = (ushort)b0.s3 * (ushort)a3;

1003

1004

acc30 += ((uint)tmp0);

1005

acc31 += ((uint)tmp1);

1006

acc32 += ((uint)tmp2);

1007

acc33 += ((uint)tmp3);

1008

}

1009

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1010

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1011

{

1012

// Accumulate

1013

ushort tmp0 = (ushort)b0.s0 * (ushort)a4;

1014

ushort tmp1 = (ushort)b0.s1 * (ushort)a4;

1015

ushort tmp2 = (ushort)b0.s2 * (ushort)a4;

1016

ushort tmp3 = (ushort)b0.s3 * (ushort)a4;

1017

1018

acc40 += ((uint)tmp0);

1019

acc41 += ((uint)tmp1);

1020

acc42 += ((uint)tmp2);

1021

acc43 += ((uint)tmp3);

1022

}

1023

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1024

}

1025

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1026

const int z = get_global_id(2);

1027

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1028

// Compute destination address

1029

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1030

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1031

#if defined(REINTERPRET_OUTPUT_AS_3D)

1032

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

1033

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1039

// |******************|

1040

// | cross_plane_pad |

1041

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1046

1047

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1048

uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;

1049

zout = min(DEPTH_GEMM3D - 1, zout);

1050

1051

// Add offset due to the cross plane paddings

1052

zout *= (dst_cross_plane_pad * dst_stride_y);

1053

1054

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1055

// multiply dst_stride_z by DEPTH_GEMM3D

1056

dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;

1057

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1058

// Store the result

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1059

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1060

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1061

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1062

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1063

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1064

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1065

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1066

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1067

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1068

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1069

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1070

vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1071

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1072

1073

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1074

// Add offset for batched GEMM

1075

dst.ptr += z * dst_stride_z;

1076

1077

// Store the result

1078

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

1079

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1080

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

1081

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1082

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1083

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

1084

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1085

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1086

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

1087

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1088

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1089

vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));

1090

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1091

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1092

}

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1093

Georgios Pinitas

daa3855

2018-08-28 17:43:18 +0100

[diff] [blame]

1094

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1095

/** OpenCL kernel optimized to use dot product that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

1096

*

1097

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

1098

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1099

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1100

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1101

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1102

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1103

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1104

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

1105

*

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1106

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

1107

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1108

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1109

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1110

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1111

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1112

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

1113

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1114

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1115

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1116

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1117

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1118

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

1119

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1120

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1121

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1122

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1123

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1124

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

1125

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1126

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1127

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

1128

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1129

*/

1130

__kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),

1131

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1132

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

1137

,

1138

uint src_cross_plane_pad

1139

#endif // REINTERPRET_INPUT_AS_3D

1140

#if defined(REINTERPRET_OUTPUT_AS_3D)

1141

,

1142

uint dst_cross_plane_pad

1143

#endif // REINTERPRET_OUTPUT_AS_3D)

1144

)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1145

{

1146

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1147

1148

// Compute starting address for matrix A and Matrix B

1149

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1150

1151

// Update address for the matrix A

1152

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1153

1154

// Update address for the matrix B

1155

src_addr.s1 += idx;

1156

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1157

#if defined(REINTERPRET_INPUT_AS_3D)

1158

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

1159

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1165

// |******************|

1166

// | cross_plane_pad |

1167

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1172

1173

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1174

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1175

zin = min(DEPTH_GEMM3D - 1, zin);

1176

1177

// Add offset due to the cross plane paddings

1178

zin *= (src_cross_plane_pad * src0_stride_y);

1179

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1180

zin += ((uint4)(0, 1, 2, 3)) * src0_stride_y;

1181

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1182

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1183

// multiply src0_stride_z by DEPTH_GEMM3D

1184

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

1185

1186

#else // defined(REINTERPRET_INPUT_AS_3D)

1187

1188

// Add offset for batched GEMM

1189

src_addr.s0 += get_global_id(2) * src0_stride_z;

1190

1191

#endif // defined(REINTERPRET_INPUT_AS_3D)

1192

1193

#if defined(MATRIX_B_DEPTH)

1194

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1195

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1196

#else // defined(MATRIX_B_DEPTH)

1197

src_addr.s1 += get_global_id(2) * src1_stride_z;

1198

#endif // defined(MATRIX_B_DEPTH)

1199

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

uint acc00 = 0;

uint acc01 = 0;

uint acc02 = 0;

uint acc03 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc04 = 0;

uint acc05 = 0;

uint acc06 = 0;

uint acc07 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1208

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

uint acc10 = 0;

uint acc11 = 0;

uint acc12 = 0;

uint acc13 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc14 = 0;

uint acc15 = 0;

uint acc16 = 0;

uint acc17 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1217

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1218

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

uint acc20 = 0;

uint acc21 = 0;

uint acc22 = 0;

uint acc23 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc24 = 0;

uint acc25 = 0;

uint acc26 = 0;

uint acc27 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1227

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1228

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

uint acc30 = 0;

uint acc31 = 0;

uint acc32 = 0;

uint acc33 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc34 = 0;

uint acc35 = 0;

uint acc36 = 0;

uint acc37 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1237

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1238

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1239

// A and B src indices get incremented at the same time.

1240

int i = 0;

1241

for(; i <= ((int)COLS_A - 8); i += 8)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1242

{

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1243

#if defined(REINTERPRET_INPUT_AS_3D)

1244

// Load values from matrix A and matrix B

1245

uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1246

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1247

uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1248

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1249

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1250

uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1251

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1252

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1253

uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1254

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1255

#else // defined(REINTERPRET_INPUT_AS_3D)

1256

// Load values from matrix A and matrix B

1257

uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1258

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1259

uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1260

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1261

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1262

uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1263

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1264

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1265

uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1266

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1267

#endif // defined(REINTERPRET_INPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1268

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1269

uchar8 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);

1270

uchar8 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);

1271

uchar8 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);

1272

uchar8 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

1273

src_addr.s1 += 4 * src1_stride_y;

1274

1275

ARM_DOT(a0.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);

1276

ARM_DOT(a0.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);

1277

ARM_DOT(a0.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);

1278

ARM_DOT(a0.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);

1279

ARM_DOT(a0.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);

1280

ARM_DOT(a0.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);

1281

ARM_DOT(a0.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);

1282

ARM_DOT(a0.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);

1283

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1284

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1285

ARM_DOT(a1.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);

1286

ARM_DOT(a1.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);

1287

ARM_DOT(a1.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);

1288

ARM_DOT(a1.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);

1289

ARM_DOT(a1.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);

1290

ARM_DOT(a1.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);

1291

ARM_DOT(a1.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);

1292

ARM_DOT(a1.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1293

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1294

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1295

ARM_DOT(a2.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);

1296

ARM_DOT(a2.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);

1297

ARM_DOT(a2.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);

1298

ARM_DOT(a2.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);

1299

ARM_DOT(a2.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);

1300

ARM_DOT(a2.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);

1301

ARM_DOT(a2.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);

1302

ARM_DOT(a2.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1303

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1304

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1305

ARM_DOT(a3.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);

1306

ARM_DOT(a3.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);

1307

ARM_DOT(a3.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);

1308

ARM_DOT(a3.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);

1309

ARM_DOT(a3.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);

1310

ARM_DOT(a3.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);

1311

ARM_DOT(a3.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);

1312

ARM_DOT(a3.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1313

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1314

1315

b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);

1316

b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);

1317

b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);

1318

b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

1319

src_addr.s1 += 4 * src1_stride_y;

1320

1321

ARM_DOT(a0.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);

1322

ARM_DOT(a0.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);

1323

ARM_DOT(a0.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);

1324

ARM_DOT(a0.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);

1325

ARM_DOT(a0.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);

1326

ARM_DOT(a0.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);

1327

ARM_DOT(a0.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);

1328

ARM_DOT(a0.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);

1329

1330

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1331

ARM_DOT(a1.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);

1332

ARM_DOT(a1.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);

1333

ARM_DOT(a1.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);

1334

ARM_DOT(a1.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);

1335

ARM_DOT(a1.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);

1336

ARM_DOT(a1.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);

1337

ARM_DOT(a1.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);

1338

ARM_DOT(a1.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);

1339

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1340

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1341

ARM_DOT(a2.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);

1342

ARM_DOT(a2.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);

1343

ARM_DOT(a2.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);

1344

ARM_DOT(a2.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);

1345

ARM_DOT(a2.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);

1346

ARM_DOT(a2.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);

1347

ARM_DOT(a2.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);

1348

ARM_DOT(a2.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);

1349

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1350

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1351

ARM_DOT(a3.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);

1352

ARM_DOT(a3.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);

1353

ARM_DOT(a3.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);

1354

ARM_DOT(a3.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);

1355

ARM_DOT(a3.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);

1356

ARM_DOT(a3.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);

1357

ARM_DOT(a3.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);

1358

ARM_DOT(a3.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);

1359

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1360

1361

src_addr.s0 += 8;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1362

}

1363

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1364

for(; i < (int)COLS_A; ++i)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1365

{

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1366

#if defined(REINTERPRET_INPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1367

// Load values from matrix A

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1368

uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1369

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1370

uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1371

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1372

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1373

uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1374

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1375

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1376

uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1377

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1378

#else // defined(REINTERPRET_INPUT_AS_3D)

1379

// Load values from matrix A

1380

uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1381

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1382

uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1383

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1384

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1385

uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1386

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1387

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1388

uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1389

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1390

#endif // defined(REINTERPRET_INPUT_AS_3D)

1391

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1392

// Load values from matrix B

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1393

uchar8 b0 = vload8(0, src1_ptr + src_addr.s1);

1394

src_addr.s1 += src1_stride_y;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1395

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1396

acc00 += (uint)a0 * b0.s0;

1397

acc01 += (uint)a0 * b0.s1;

1398

acc02 += (uint)a0 * b0.s2;

1399

acc03 += (uint)a0 * b0.s3;

1400

acc04 += (uint)a0 * b0.s4;

1401

acc05 += (uint)a0 * b0.s5;

1402

acc06 += (uint)a0 * b0.s6;

1403

acc07 += (uint)a0 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1404

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1405

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1406

acc10 += (uint)a1 * b0.s0;

1407

acc11 += (uint)a1 * b0.s1;

1408

acc12 += (uint)a1 * b0.s2;

1409

acc13 += (uint)a1 * b0.s3;

1410

acc14 += (uint)a1 * b0.s4;

1411

acc15 += (uint)a1 * b0.s5;

1412

acc16 += (uint)a1 * b0.s6;

1413

acc17 += (uint)a1 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1414

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1415

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1416

acc20 += (uint)a2 * b0.s0;

1417

acc21 += (uint)a2 * b0.s1;

1418

acc22 += (uint)a2 * b0.s2;

1419

acc23 += (uint)a2 * b0.s3;

1420

acc24 += (uint)a2 * b0.s4;

1421

acc25 += (uint)a2 * b0.s5;

1422

acc26 += (uint)a2 * b0.s6;

1423

acc27 += (uint)a2 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1424

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1425

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1426

acc30 += (uint)a3 * b0.s0;

1427

acc31 += (uint)a3 * b0.s1;

1428

acc32 += (uint)a3 * b0.s2;

1429

acc33 += (uint)a3 * b0.s3;

1430

acc34 += (uint)a3 * b0.s4;

1431

acc35 += (uint)a3 * b0.s5;

1432

acc36 += (uint)a3 * b0.s6;

1433

acc37 += (uint)a3 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1434

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1435

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1436

src_addr.s0 += 1;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1437

}

1438

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1439

int z = get_global_id(2);

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1440

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1441

// Compute destination address

1442

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1443

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1444

// Compute dst address

1445

__global uchar *dst_addr = dst.ptr;

1446

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1447

#if defined(REINTERPRET_OUTPUT_AS_3D)

1448

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

1449

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1455

// |******************|

1456

// | cross_plane_pad |

1457

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1462

1463

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1464

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1465

zout = min(DEPTH_GEMM3D - 1, zout);

1466

1467

// Add offset due to the cross plane paddings

1468

zout *= (dst_cross_plane_pad * dst_stride_y);

1469

1470

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1471

// multiply dst_stride_z by DEPTH_GEMM3D

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1472

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1473

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1474

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1475

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));

1476

vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1477

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1478

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));

1479

vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1480

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1481

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1482

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));

1483

vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1484

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1485

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1486

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));

1487

vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1488

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1489

1490

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1491

// Add offset for batched GEMM

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1492

dst_addr += z * dst_stride_z;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1493

1494

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1495

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y));

1496

vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1497

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1498

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y));

1499

vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1500

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1501

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1502

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y));

1503

vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1504

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1505

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1506

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y));

1507

vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1508

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1509

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1510

}

Georgios Pinitas

daa3855

2018-08-28 17:43:18 +0100

[diff] [blame]

1511

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1512

#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)

1513

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1514

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1515

/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM data type.

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1516

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

1517

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

1518

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1519

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

1520

* @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1521

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).

1522

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)

1523

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)

1524

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.

1525

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.

1526

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1527

* - M0 = 2, 3, 4, 5, 6, 7, 8

1528

* - N0 = 2, 3, 4, 8, 16

1529

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1530

* - V0 >= 1

1531

* - H0 >= 1

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1532

*

1533

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

1534

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1535

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1536

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1537

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

1538

*

1539

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8

1540

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

1541

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1542

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

1543

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1544

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

1545

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1546

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1547

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1548

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1549

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1550

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

1551

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1552

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1553

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1554

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1555

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1556

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1557

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

1558

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

1559

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1560

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1561

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

1562

*/

1563

__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

1564

IMAGE_DECLARATION(rhs),

1565

IMAGE_DECLARATION(dst),

uint k,

uint lhs_stride_z,

uint rhs_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_OUTPUT_AS_3D)

1571

,

1572

uint dst_cross_plane_pad

1573

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

1578

1579

#if defined(LHS_INTERLEAVE)

1580

#define LHS_OFFSET_X (K0)

1581

#define LHS_STEP_X ((K0) * (V0))

1582

#define LHS_STEP_LOOP (1)

1583

#else // defined(INTERLEAVE)

1584

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

1585

#define LHS_STEP_X (K0)

1586

#define LHS_STEP_LOOP (V0)

1587

#endif // defined(INTERLEAVE)

1588

1589

// Block size

1590

#define RHS_BLOCK_SIZE ((K0) * (N0))

1591

1592

// RHS offset and step X

1593

#if defined(RHS_INTERLEAVE)

1594

#define RHS_OFFSET_X (K0)

1595

#define RHS_STEP_X ((K0) * (H0))

1596

#define RHS_STEP_LOOP (1)

1597

#else // defined(RHS_INTERLEAVE)

1598

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1599

#define RHS_STEP_X (K0)

1600

#define RHS_STEP_LOOP (H0)

1601

#endif // defined(RHS_INTERLEAVE)

1602

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1603

uint x = get_global_id(0);

1604

uint y = get_global_id(1);

1605

uint z = get_global_id(2);

1606

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1607

#if defined(DUMMY_WORK_ITEMS)

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1608

if((x * N0 >= N) || (y * M0 >= M))

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1613

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1614

// Compute LHS matrix address

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1615

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1616

1617

// Compute RHS matrix address

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1618

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1619

1620

#if defined(MATRIX_B_DEPTH)

1621

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1622

rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1623

#else // defined(MATRIX_B_DEPTH)

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1624

rhs_addr += z * rhs_stride_z;

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1625

#endif // defined(MATRIX_B_DEPTH)

1626

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1627

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1628

REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

1629

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1630

// Initialize the accumulators

1631

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1632

1633

for(int i = 0; i < k; i += K0)

1634

{

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1635

// Load values from LHS matrix

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1636

LOAD_BLOCK(M0, K0, uchar, a, lhs_addr, 0, LHS_STEP_X, zlhs);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1637

1638

// Load values from RHS matrix

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1639

LOAD_BLOCK(N0, K0, uchar, b, rhs_addr, 0, RHS_STEP_X, zrhs);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1640

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1641

// Partial matrix multiplication M0,N0,K0

1642

ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1643

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1644

// Update address

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1645

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);

1646

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);

1647

}

1648

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1649

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1650

1651

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1652

1653

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1654

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1655

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1656

1657

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1658

// multiply dst_stride_z by DEPTH_GEMM3D

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1659

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1660

1661

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1662

1663

// Add offset for batched GEMM

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1664

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1665

1666

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1667

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1668

// Convert and store output block

1669

CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1670

1671

#undef LHS_BLOCK_SIZE

1672

#undef LHS_OFFSET_X

1673

#undef LHS_STEP_X

1674

#undef RHS_BLOCK_SIZE

1675

#undef RHS_OFFSET_X

1676

#undef RHS_STEP_X

1677

}

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1678

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K)

1679

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1680

#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)

1681

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1682

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1683

* The LHS matrix is NOT reshaped

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1684

* The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1685

*

1686

* @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)

1687

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).

1688

* @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)

1689

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)

1690

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.

1691

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1692

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1693

* - N0 = 2, 3, 4, 8, 16

1694

* - K0 = 2, 3, 4, 8, 16

1695

* - H0 >= 1

1696

*

1697

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1698

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1699

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1700

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1701

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1702

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1703

*

1704

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

1705

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

1706

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1707

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

1708

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1709

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

1710

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1711

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1712

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1713

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1714

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1715

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

1716

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1717

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1718

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1719

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1720

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1721

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1722

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

1723

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1724

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1725

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1726

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

1727

*/

1728

__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),

1729

IMAGE_DECLARATION(rhs),

1730

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

1735

,

1736

uint lhs_cross_plane_pad

1737

#endif // REINTERPRET_INPUT_AS_3D

1738

#if defined(REINTERPRET_OUTPUT_AS_3D)

1739

,

1740

uint dst_cross_plane_pad

1741

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

1746

1747

// RHS offset and step X

1748

#if defined(RHS_INTERLEAVE)

1749

#define RHS_OFFSET_X (K0)

1750

#define RHS_STEP_X ((K0) * (H0))

1751

#define RHS_STEP_LOOP (1)

1752

#else // defined(RHS_INTERLEAVE)

1753

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1754

#define RHS_STEP_X (K0)

1755

#define RHS_STEP_LOOP (H0)

1756

#endif // defined(RHS_INTERLEAVE)

1757

1758

uint x = get_global_id(0);

1759

uint y = get_global_id(1);

1760

uint z = get_global_id(2);

1761

Gian Marco Iodice

86cfffe

2019-04-02 11:02:20 +0100

[diff] [blame]

1762

#if defined(DUMMY_WORK_ITEMS)

1763

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1768

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1769

// Compute LHS matrix address

1770

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1771

1772

// Compute RHS matrix address

1773

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;

1774

1775

#if defined(MATRIX_B_DEPTH)

1776

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1777

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1778

#else // defined(MATRIX_B_DEPTH)

1779

rhs_offset += z * rhs_stride_z;

1780

#endif // defined(MATRIX_B_DEPTH)

1781

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1782

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1783

REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1784

1785

#if defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1786

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1787

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1788

1789

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1790

// multiply lhs_stride_z by DEPTH_GEMM3D

1791

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1792

1793

#else // defined(REINTERPRET_INPUT_AS_3D)

1794

1795

// Add offset for batched GEMM

1796

lhs_offset += z * lhs_stride_z;

1797

1798

#endif // defined(REINTERPRET_INPUT_AS_3D)

1799

1800

// Initialize the accumulators

1801

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;

1802

1803

for(int i = 0; i < K; i += K0)

1804

{

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1805

// Load values from LHS matrix

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1806

LOAD_BLOCK(M0, K0, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1807

1808

// Load values from RHS matrix

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1809

LOAD_BLOCK(N0, K0, uchar, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1810

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1811

// Partial matrix multiplication M0,N0,K0

1812

ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1813

1814

lhs_offset += K0;

1815

rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;

1816

}

1817

1818

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);

1819

1820

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1821

1822

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1823

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1824

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1825

1826

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1827

// multiply dst_stride_z by DEPTH_GEMM3D

1828

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1829

1830

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1831

1832

// Add offset for batched GEMM

1833

dst_addr += z * dst_stride_z;

1834

1835

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1836

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame]

1837

// Convert and store output block

1838

CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1839

1840

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(K)

1845

Gian Marco Iodice

e751062

2019-06-03 17:28:17 +0100

[diff] [blame]

1846

#if defined(M0) && defined(N0) && defined(K0) && defined(K)

1847

1848

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1849

* The LHS matrix is NOT reshaped

1850

* The RHS matrix is NOT reshaped

1851

*

1852

* @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)

1853

* @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)

1854

* @note The number of N0 columns to process must be passed at compile time using -DN0 (i.e. -DN0=2)

1855

* @note The number of K0 partial accumulations must be passed at compile time using -DK0 (i.e., -DK0=2)

1856

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1857

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1858

* - N0 = 2, 3, 4, 8, 16

1859

* - K0 = 2, 3, 4, 8, 16

1860

*

1861

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1862

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1863

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1864

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1865

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1866

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1867

*

1868

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

1869

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

1870

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1871

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

1872

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1873

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

1874

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1875

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1876

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1877

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1878

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1879

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

1880

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1881

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1882

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1883

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1884

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1885

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1886

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

1887

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1888

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1889

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1890

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

1891

*/

1892

__kernel void gemmlowp_mm_native(IMAGE_DECLARATION(lhs),

1893

IMAGE_DECLARATION(rhs),

1894

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

1899

,

1900

uint lhs_cross_plane_pad

1901

#endif // REINTERPRET_INPUT_AS_3D

1902

#if defined(REINTERPRET_OUTPUT_AS_3D)

1903

,

1904

uint dst_cross_plane_pad

1905

#endif // REINTERPRET_OUTPUT_AS_3D

1906

)

1907

{

1908

uint x = get_global_id(0);

1909

uint y = get_global_id(1);

1910

uint z = get_global_id(2);

1911

1912

#if defined(DUMMY_WORK_ITEMS)

1913

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1918

1919

// Compute LHS matrix address

1920

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1921

1922

// Compute RHS matrix address

1923

uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0;

1924

1925

#if defined(MATRIX_B_DEPTH)

1926

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1927

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1928

#else // defined(MATRIX_B_DEPTH)

1929

rhs_offset += z * rhs_stride_z;

1930

#endif // defined(MATRIX_B_DEPTH)

1931

1932

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0);

1933

REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

1934

1935

#if defined(REINTERPRET_INPUT_AS_3D)

1936

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1937

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

1938

1939

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1940

// multiply lhs_stride_z by DEPTH_GEMM3D

1941

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1942

1943

#else // defined(REINTERPRET_INPUT_AS_3D)

1944

1945

// Add offset for batched GEMM

1946

lhs_offset += z * lhs_stride_z;

1947

1948

#endif // defined(REINTERPRET_INPUT_AS_3D)

1949

1950

// Initialize the accumulators

1951

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

int i = 0;

for(; i <= (K - K0); i += K0)

1956

{

1957

// Load values from LHS matrix

1958

LOAD_BLOCK(M0, K0, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

1959

1960

// Load values from RHS matrix

1961

LOAD_BLOCK(K0, N0, uchar, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);

1962

1963

// Transpose the values from RHS matrix

1964

TRANSPOSE_K0XN0(K0, N0, b_t, b);

1965

1966

// Partial matrix multiplication M0,N0,K0

1967

ARM_MM_K0XN0XM0(M0, N0, K0, a, b_t, c);

// Update the offset

lhs_offset += K0;

rhs_offset += K0 * rhs_stride_y;

1972

}

1973

1974

// Left-over for loop

1975

for(; i < K; ++i)

1976

{

1977

// Load values from LHS matrix

1978

LOAD_BLOCK(M0, 1, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

1979

1980

// Load values from RHS matrix

1981

LOAD_BLOCK(1, N0, uchar, b, rhs_ptr, rhs_offset, rhs_stride_y, zrhs);

1982

1983

// Transpose the values from RHS matrix

1984

TRANSPOSE_K0XN0(1, N0, b_t, b);

1985

1986

// Partial matrix multiplication M0,N0,1

1987

ARM_MM_K0XN0XM0(M0, N0, 1, a, b_t, c);

// Update the offset

lhs_offset += 1;

rhs_offset += rhs_stride_y;

1992

}

1993

1994

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);

1995

1996

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1997

1998

#if defined(REINTERPRET_OUTPUT_AS_3D)

1999

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

2000

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

2001

2002

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

2003

// multiply dst_stride_z by DEPTH_GEMM3D

2004

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

2005

2006

#else // defined(REINTERPRET_OUTPUT_AS_3D)

2007

2008

// Add offset for batched GEMM

2009

dst_addr += z * dst_stride_z;

2010

2011

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

2012

2013

// Convert and store output block

2014

CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);

2015

}

2016

#endif // defined(M0) && defined(N0) && defined(K0) && defined(K)

2017

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2018

#if defined(COLS_A)

2019

/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.

2020

*

2021

* @note This stage is needed to handle the offset of matrix product

2022

* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md

2023

*

2024

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

2025

*

2026

* @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8

2027

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2028

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2029

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2030

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2031

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2032

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2033

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2034

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32

2035

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2036

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2037

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2038

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2039

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2040

*/

2041

__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),

2042

IMAGE_DECLARATION(dst))

2043

{

2044

// Compute source and destination addresses

2045

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

2046

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2047

2048

uint4 sum_row_u32 = (uint4)0;

2049

uint sum_row = 0;

2050

2051

__global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

int i = 0;

// This for loop performs 16 accumulations

2056

for(; i <= ((int)COLS_A - 16); i += 16)

2057

{

2058

const uchar16 a0_u8 = vload16(0, matrix_a + i);

2059

2060

sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);

2061

}

2062

2063

// This for loop performs the leftover accumulations

2064

for(; i < COLS_A; ++i)

2065

{

2066

sum_row += matrix_a[i];

2067

}

2068

2069

sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;

2070

2071

*((__global int *)dst.ptr) = (int)sum_row;

2072

}

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2073

2074

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

2075

/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction

2076

*

2077

* @note This stage is needed to handle the offset of matrix product

2078

* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md

2079

*

2080

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

2081

*

2082

* @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8

2083

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2084

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2085

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2086

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2087

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2088

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2089

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2090

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32

2091

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2092

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2093

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2094

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2095

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2096

*/

2097

__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),

2098

IMAGE_DECLARATION(dst))

2099

{

2100

// Compute source and destination addresses

2101

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

2102

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

uint sum_row = 0;

__global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

int i = 0;

// This for loop performs 16 accumulations

2111

for(; i <= ((int)COLS_A - 32); i += 32)

2112

{

2113

uchar16 a0_u8 = vload16(0, matrix_a + i);

2114

2115

sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));

2116

sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));

2117

sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));

2118

sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));

2119

2120

a0_u8 = vload16(1, matrix_a + i);

2121

2122

sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));

2123

sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));

2124

sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));

2125

sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));

2126

}

2127

2128

// This for loop performs the leftover accumulations

2129

for(; i < COLS_A; ++i)

2130

{

2131

sum_row += matrix_a[i];

2132

}

2133

2134

*((__global int *)dst.ptr) = (int)sum_row;

2135

}

2136

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2137

#endif // defined(COLS_A)

2138

2139

#if defined(COLS_B) && defined(ROWS_B)

2140

/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.

2141

*

2142

* @note This stage is needed to handle the offset of matrix product

2143

* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md

2144

*

2145

* @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B

2146

*

2147

* @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8

2148

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2149

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2150

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2151

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2152

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2153

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2154

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2155

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32

2156

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2157

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2158

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2159

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2160

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2161

*/

2162

__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),

2163

IMAGE_DECLARATION(dst))

2164

{

2165

// Compute source and destination addresses

2166

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

2167

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2168

2169

uint16 sum_col_u32 = (uint16)0;

2170

2171

__global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

2172

2173

int i = 0;

2174

// This for loop performs 4 accumulations

2175

for(; i <= ((int)ROWS_B - 4); i += 4)

2176

{

2177

const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);

2178

const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);

2179

const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);

2180

const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);

2181

2182

sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);

2183

2184

matrix_b += 4 * src_stride_y;

2185

}

2186

2187

// This for loop perfoms the leftover accumulations

2188

for(; i < (int)ROWS_B; ++i)

2189

{

2190

const uchar16 b0_u8 = vload16(0, matrix_b);

2191

2192

sum_col_u32 += convert_uint16(b0_u8);

2193

2194

matrix_b += src_stride_y;

2195

}

2196

2197

vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);

2198

}

2199

#endif // defined(COLS_B) && defined(ROWS_B)

2200

2201

#if defined(K_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2202

2203

/* Helper function used to calculate the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel.

2204

*

2205

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),

2206

* and calculates the offset contribution of matrix A and matrix B.

2207

*

2208

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2209

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2210

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

2211

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

2212

*

2213

* @param[in] x get_global_id(0) * 4

2214

* @param[in] y get_global_id(1)

2215

* @param[in] z get_global_id(2)

2216

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2217

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2218

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2219

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2220

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2221

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2222

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2223

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2224

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2225

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2226

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2227

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2228

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2229

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2230

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2231

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

2232

*/

2233

inline int4 offset_contribution(

int x,

int y,

int z

#if defined(A_OFFSET)

2238

,

2239

IMAGE_DECLARATION(sum_col)

2240

#endif // defined(A_OFFSET)

2241

#if defined(B_OFFSET)

2242

,

2243

IMAGE_DECLARATION(sum_row)

2244

#endif // defined(B_OFFSET)

2245

#if defined(ADD_BIAS)

2246

,

2247

VECTOR_DECLARATION(biases)

2248

#endif // defined(ADD_BIAS)

2249

)

2250

{

2251

int4 a_offset_s32 = (int4)0;

2252

int4 b_offset_s32 = (int4)0;

2253

2254

int batch_id = z;

2255

#if defined(DEPTH_INPUT3D)

2256

batch_id /= (int)DEPTH_INPUT3D;

2257

#endif // defined(DEPTH_INPUT3D)

2258

2259

#if defined(A_OFFSET)

2260

// Compute the offset contribution due to A_OFFSET

2261

__global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);

2262

2263

// Compute the offset contribution due to A_OFFSET

2264

#if defined(SUM_COL_HAS_BATCHES)

2265

a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));

2266

#else // defined(SUM_COL_HAS_BATCHES)

2267

a_offset_s32 = vload4(0, (__global int *)sum_col_addr);

2268

#endif // defined(SUM_COL_HAS_BATCHES)

2269

2270

a_offset_s32 *= (int4)A_OFFSET;

2271

#endif // defined(A_OFFSET)

2272

2273

#if defined(B_OFFSET)

2274

// Compute the offset contribution due to A_OFFSET

2275

__global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);

2276

2277

// Compute the offset contribution due to B_OFFSET

2278

#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)

2279

b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);

2280

#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)

2281

b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));

2282

#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)

2283

b_offset_s32 *= (int4)B_OFFSET;

2284

#endif // defined(B_OFFSET)

2285

2286

#if defined(ADD_BIAS)

2287

// Add bias

2288

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2289

2290

int4 biases_values = vload4(0, (__global int *)bias_addr);

2291

b_offset_s32 += (int4)biases_values;

2292

#endif // defined(ADD_BIAS)

2293

2294

return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;

2295

}

2296

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2297

/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place

2298

*

2299

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),

2300

* and adds to it the offset contribution of matrix A and matrix B in-place.

2301

*

2302

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2303

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2304

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

Chunosov

5124be5

2017-11-22 20:42:13 +0700

[diff] [blame]

2305

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2306

*

2307

* The final result is:

2308

*

2309

* mm_result[i][k] = mm_result[i][k] +

2310

* (sum_col[k] * A_OFFSET) +

2311

* (sum_row[i] * B_OFFSET) +

2312

* (K_OFFSET)

2313

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

2314

* @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32

2315

* @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)

2316

* @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)

2317

* @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)

2318

* @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)

2319

* @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)

2320

* @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)

2321

* @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2322

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2323

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2324

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2325

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2326

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2327

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2328

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2329

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2330

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2331

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2332

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2333

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2334

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2335

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2336

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2337

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2338

*/

2339

__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)

2340

#if defined(A_OFFSET)

2341

,

2342

IMAGE_DECLARATION(sum_col)

2343

#endif // defined(A_OFFSET)

2344

#if defined(B_OFFSET)

2345

,

2346

IMAGE_DECLARATION(sum_row)

2347

#endif // defined(B_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2348

#if defined(ADD_BIAS)

2349

,

2350

VECTOR_DECLARATION(biases)

2351

#endif // defined(ADD_BIAS))

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2352

)

2353

{

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2354

const int x = get_global_id(0) * 4;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

2355

const int y = get_global_id(1);

2356

const int z = get_global_id(2);

2357

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2358

// Compute offset contribution

2359

int4 offset_term_s32 = offset_contribution(

2360

x, y, z

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2361

#if defined(A_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

,

sum_col_ptr,

sum_col_stride_x,

sum_col_step_x,

sum_col_stride_y,

sum_col_step_y,

sum_col_offset_first_element_in_bytes

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2369

#endif // defined(A_OFFSET)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2370

#if defined(B_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

,

sum_row_ptr,

sum_row_stride_x,

sum_row_step_x,

sum_row_stride_y,

sum_row_step_y,

sum_row_offset_first_element_in_bytes

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2378

#endif // defined(B_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2379

#if defined(ADD_BIAS)

,

biases_ptr,

biases_stride_x,

biases_step_x,

biases_offset_first_element_in_bytes

2385

#endif // defined(ADD_BIAS)

2386

);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2387

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2388

__global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2389

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2390

int4 in_s32 = vload4(0, (__global int *)mm_result_addr);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2391

2392

// Add the offset terms to GEMM's result

2393

in_s32 += offset_term_s32;

2394

2395

// Store the result with the offset contribution

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2396

vstore4(in_s32, 0, (__global int *)mm_result_addr);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2397

}

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2398

2399

#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)

2400

/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.

2401

*

2402

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.

2403

*

2404

*

2405

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2406

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2407

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

2408

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

2409

*

2410

* The result before the output stage is:

2411

*

2412

* mm_result[i][k] = mm_result[i][k] +

2413

* (sum_col[k] * A_OFFSET) +

2414

* (sum_row[i] * B_OFFSET) +

2415

* (K_OFFSET)

2416

*

2417

* This result is quantized down to uint8 using the output stage. The output stage computes the following operations:

2418

*

2419

* -# Add offset terms to final result

2420

* -# Multiply each entry of result by result_mult_int

2421

* -# Add bias to final result (if -DADD_BIAS is passed at compile time)

2422

* -# Shift the int32 accumulator by result_shift

2423

* -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)

2424

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2425

*

2426

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT

2427

*

2428

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2429

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2430

* These values can be used to implement "rectified linear unit" activation functions

2431

*

2432

* @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32

2433

* @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)

2434

* @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)

2435

* @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)

2436

* @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)

2437

* @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)

2438

* @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)

2439

* @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor

2440

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2441

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2442

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2443

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2444

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2445

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2446

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2447

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2448

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2449

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2450

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2451

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2452

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2453

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2454

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2455

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

2456

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2457

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2458

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2459

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2460

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2461

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2462

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2463

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2464

*/

2465

__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)

2466

#if defined(A_OFFSET)

2467

,

2468

IMAGE_DECLARATION(sum_col)

2469

#endif // defined(A_OFFSET)

2470

#if defined(B_OFFSET)

2471

,

2472

IMAGE_DECLARATION(sum_row)

2473

#endif // defined(B_OFFSET)

2474

,

2475

#if defined(ADD_BIAS)

2476

VECTOR_DECLARATION(biases),

2477

#endif // defined(ADD_BIAS)

2478

TENSOR3D_DECLARATION(dst))

2479

{

2480

const int x = get_global_id(0) * 4;

2481

const int y = get_global_id(1);

2482

const int z = get_global_id(2);

2483

2484

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2485

2486

// Compute offset contribution

2487

int4 offset_term_s32 = offset_contribution(

2488

x, y, z

2489

#if defined(A_OFFSET)

,

sum_col_ptr,

sum_col_stride_x,

sum_col_step_x,

sum_col_stride_y,

sum_col_step_y,

sum_col_offset_first_element_in_bytes

2497

#endif // defined(A_OFFSET)

2498

#if defined(B_OFFSET)

,

sum_row_ptr,

sum_row_stride_x,

sum_row_step_x,

sum_row_stride_y,

sum_row_step_y,

sum_row_offset_first_element_in_bytes

2506

#endif // defined(B_OFFSET)

2507

#if defined(ADD_BIAS)

,

biases_ptr,

biases_stride_x,

biases_step_x,

biases_offset_first_element_in_bytes

2513

#endif // defined(ADD_BIAS)

2514

);

2515

2516

__global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

2517

2518

int4 in_s32 = vload4(0, (__global int *)mm_result_addr);

2519

2520

// Add the offset terms to GEMM's result

2521

in_s32 += offset_term_s32;

2522

2523

// -------------- OUTPUT STAGE

2524

2525

// Add the offset terms to GEMM's result

2526

in_s32 += (int4)RESULT_OFFSET;

2527

2528

// Multiply by result_mult_int and shift

2529

in_s32 *= RESULT_MULTIPLIER;

2530

2531

in_s32 >>= RESULT_SHIFT;

2532

2533

uchar4 res = convert_uchar4_sat(in_s32);

2534

2535

#if defined(MIN_BOUND)

2536

res = max(res, (uchar4)MIN_BOUND);

2537

#endif // defined(MIN_BOUND)

2538

#if defined(MAX_BOUND)

2539

res = min(res, (uchar4)MAX_BOUND);

2540

#endif // defined(MAX_BOUND)

2541

2542

// Store the result

2543

vstore4(res, 0, dst_addr);

2544

}

2545

2546

/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.

2547

*

2548

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.

2549

*

2550

*

2551

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2552

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2553

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

2554

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

2555

*

2556

* The result before the output stage is:

2557

*

2558

* mm_result[i][k] = mm_result[i][k] +

2559

* (sum_col[k] * A_OFFSET) +

2560

* (sum_row[i] * B_OFFSET) +

2561

* (K_OFFSET)

2562

*

2563

* This result is quantized down to uint8 using the output stage. The output stage computes the following operations:

2564

*

2565

* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier

2566

* -# Add bias to final result if bias tensor is not a nullptr

2567

* -# Round to nearest division by a power-of-two using result_shift

2568

* -# Add offset to each result

2569

* -# Clamp the value between the specified min and max bounds

2570

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2571

*

2572

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT

2573

*

2574

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2575

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2576

* These values can be used to implement "rectified linear unit" activation functions

2577

*

2578

* @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32

2579

* @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)

2580

* @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)

2581

* @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)

2582

* @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)

2583

* @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)

2584

* @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)

2585

* @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor

2586

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2587

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2588

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2589

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2590

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2591

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2592

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2593

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2594

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2595

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2596

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2597

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2598

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2599

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2600

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2601

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

2602

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2603

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2604

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2605

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2606

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2607

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2608

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2609

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2610

*/

2611

__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)

2612

#if defined(A_OFFSET)

2613

,

2614

IMAGE_DECLARATION(sum_col)

2615

#endif // defined(A_OFFSET)

2616

#if defined(B_OFFSET)

2617

,

2618

IMAGE_DECLARATION(sum_row)

2619

#endif // defined(B_OFFSET)

2620

,

2621

#if defined(ADD_BIAS)

2622

VECTOR_DECLARATION(biases),

2623

#endif // defined(ADD_BIAS)

2624

TENSOR3D_DECLARATION(dst))

2625

{

2626

const int x = get_global_id(0) * 4;

2627

const int y = get_global_id(1);

2628

const int z = get_global_id(2);

2629

2630

// Compute offset contribution

2631

int4 offset_term_s32 = offset_contribution(

2632

x, y, z

2633

#if defined(A_OFFSET)

,

sum_col_ptr,

sum_col_stride_x,

sum_col_step_x,

sum_col_stride_y,

sum_col_step_y,

sum_col_offset_first_element_in_bytes

2641

#endif // defined(A_OFFSET)

2642

#if defined(B_OFFSET)

,

sum_row_ptr,

sum_row_stride_x,

sum_row_step_x,

sum_row_stride_y,

sum_row_step_y,

sum_row_offset_first_element_in_bytes

2650

#endif // defined(B_OFFSET)

2651

#if defined(ADD_BIAS)

,

biases_ptr,

biases_stride_x,

biases_step_x,

biases_offset_first_element_in_bytes

2657

#endif // defined(ADD_BIAS)

2658

);

2659

2660

__global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

2661

2662

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2663

2664

int4 in_s32 = vload4(0, (__global int *)mm_result_addr);

2665

2666

// Add the offset terms to GEMM's result

2667

in_s32 += offset_term_s32;

2668

2669

// -------------- OUTPUT STAGE

2670

2671

// Multiply by result_mult_int and shift

2672

in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);

2673

2674

// Add the offset terms to GEMM's result

2675

in_s32 += (int4)RESULT_OFFSET;

2676

2677

uchar4 res = convert_uchar4_sat(in_s32);

2678

2679

#if defined(MIN_BOUND)

2680

res = max(res, (uchar4)MIN_BOUND);

2681

#endif // defined(MIN_BOUND)

2682

#if defined(MAX_BOUND)

2683

res = min(res, (uchar4)MAX_BOUND);

2684

#endif // defined(MAX_BOUND)

2685

2686

// Store the result

2687

vstore4(res, 0, dst_addr);

2688

}

2689

#endif // defined(K_OFFSET) && defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2690

#endif // defined(K_OFFSET)

2691

2692

#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)

2693

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

2694

*

2695

* This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.

2696

* The following computations will be performed by the kernel:

2697

*

2698

* -# Add offset terms to final result

2699

* -# Multiply each entry of result by result_mult_int

2700

* -# Add bias to final result (if -DADD_BIAS is passed at compile time)

2701

* -# Shift the int32 accumulator by result_shift

2702

* -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)

2703

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2704

*

2705

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT

2706

*

2707

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2708

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2709

* These values can be used to implement "rectified linear unit" activation functions

2710

*

2711

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

2712

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2713

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2714

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2715

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2716

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2717

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2718

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2719

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2720

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2721

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2722

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2723

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2724

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2725

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2726

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2727

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2728

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2729

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2730

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2731

*/

2732

__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),

2733

#if defined(ADD_BIAS)

2734

VECTOR_DECLARATION(biases),

2735

#endif // defined(ADD_BIAS)

2736

TENSOR3D_DECLARATION(dst))

2737

{

2738

// Compute source and destination addresses

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2739

int x = get_global_id(0) * 4;

2740

int y = get_global_id(1);

2741

int z = get_global_id(2);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2742

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2743

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2744

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2745

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2746

2747

int4 input_values = vload4(0, (__global int *)src_addr);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2748

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2749

#if defined(ADD_BIAS)

2750

// Add bias

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2751

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2752

2753

int4 biases_values = vload4(0, (__global int *)bias_addr);

2754

input_values += (int4)biases_values;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2755

#endif // defined(ADD_BIAS)

2756

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2757

// Add the offset terms to GEMM's result

2758

input_values += (int4)RESULT_OFFSET;

2759

Georgios Pinitas

45bcc3a

2017-11-29 11:06:49 +0000

[diff] [blame]

2760

// Multiply by result_mult_int and shift

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2761

input_values *= RESULT_MULT_INT;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2762

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2763

input_values >>= RESULT_SHIFT;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2764

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2765

uchar4 res = convert_uchar4_sat(input_values);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2766

2767

#if defined(MIN_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2768

res = max(res, (uchar4)MIN_BOUND);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2769

#endif // defined(MIN_BOUND)

2770

#if defined(MAX_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2771

res = min(res, (uchar4)MAX_BOUND);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2772

#endif // defined(MAX_BOUND)

2773

2774

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2775

vstore4(res, 0, dst_addr);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2776

}

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2777

#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)

2778

2779

#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)

2780

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

2781

*

2782

* This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.

2783

* The following computations will be performed by the kernel:

2784

*

2785

* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier

2786

* -# Add bias to final result if bias tensor is not a nullptr

2787

* -# Round to nearest division by a power-of-two using result_shift

2788

* -# Add offset to each result

2789

* -# Clamp the value between the specified min and max bounds

2790

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2791

*

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2792

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2793

*

2794

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2795

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2796

* These values can be used to implement "rectified linear unit" activation functions

2797

*

2798

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

2799

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2800

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2801

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2802

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2803

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2804

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2805

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2806

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2807

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2808

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2809

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2810

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2811

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2812

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2813

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2814

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2815

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2816

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2817

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2818

*/

2819

__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),

2820

#if defined(ADD_BIAS)

2821

VECTOR_DECLARATION(biases),

2822

#endif // defined(ADD_BIAS)

2823

TENSOR3D_DECLARATION(dst))

2824

{

2825

// Compute source and destination addresses

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2826

int x = get_global_id(0) * 4;

2827

int y = get_global_id(1);

2828

int z = get_global_id(2);

Georgios Pinitas

932491f

2018-09-21 16:33:15 +0100

[diff] [blame]

2829

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2830

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2831

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2832

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2833

2834

int4 input_values = vload4(0, (__global int *)src_addr);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2835

2836

#if defined(ADD_BIAS)

2837

// Add bias

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2838

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2839

2840

int4 biases_values = vload4(0, (__global int *)bias_addr);

2841

input_values += (int4)biases_values;

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2842

#endif // defined(ADD_BIAS)

2843

2844

// Multiply by result_mult_int and shift

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2845

input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2846

2847

// Add the offset terms to GEMM's result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2848

input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2849

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2850

uchar4 res = convert_uchar4_sat(input_values);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2851

2852

#if defined(MIN_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2853

res = max(res, (uchar4)MIN_BOUND);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2854

#endif // defined(MIN_BOUND)

2855

#if defined(MAX_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2856

res = min(res, (uchar4)MAX_BOUND);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2857

#endif // defined(MAX_BOUND)

2858

2859

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2860

vstore4(res, 0, dst_addr);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2861

}

Chunosov

5124be5

2017-11-22 20:42:13 +0700

[diff] [blame]

2862

#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2863

2864

#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)

2865

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

2866

*

2867

* This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.

2868

* The following computations will be performed by the kernel:

2869

*

2870

* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier

2871

* -# Add bias to final result if bias tensor is not a nullptr

2872

* -# Requantize

2873

* -# Add offset to each result

2874

* -# Clamp the value between the specified min and max bounds

2875

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2876

*

2877

* @attention The offset and scalar scale factor must be passed at compile time using -DRESULT_OFFSET, -DREAL_MULTIPLIER

2878

*

2879

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2880

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2881

* These values can be used to implement "rectified linear unit" activation functions

2882

*

2883

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

2884

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2885

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2886

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2887

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2888

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2889

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2890

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2891

* @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr

2892

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

2893

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

2894

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

2895

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2896

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2897

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2898

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2899

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2900

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2901

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2902

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2903

* @param[in] dst_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

2904

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2905

*/

2906

__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),

2907

#if defined(ADD_BIAS)

2908

VECTOR_DECLARATION(biases),

2909

#endif // defined(ADD_BIAS)

2910

#if defined(DST_HEIGHT)

2911

TENSOR4D_DECLARATION(dst))

2912

#else // defined(DST_HEIGHT)

2913

TENSOR3D_DECLARATION(dst))

2914

#endif // defined(DST_HEIGHT)

2915

{

2916

// Compute source and destination addresses

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2917

int x = get_global_id(0) * 4;

2918

int y = get_global_id(1);

2919

int z = get_global_id(2);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2920

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2921

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2922

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2923

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2924

2925

int4 input_values = vload4(0, (__global int *)src_addr);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2926

2927

#if defined(ADD_BIAS)

2928

// Add bias

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2929

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2930

2931

int4 biases_values = vload4(0, (__global int *)bias_addr);

2932

input_values += (int4)biases_values;

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2933

#endif // defined(ADD_BIAS)

2934

2935

// Convert to float

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2936

float16 input_values_f = convert_float4(input_values);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2937

input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);

2938

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2939

uchar4 res = convert_uchar4_sat(input_values_f);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2940

2941

#if defined(MIN_BOUND)

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2942

res = max(res, (uchar4)MIN_BOUND);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2943

#endif // defined(MIN_BOUND)

2944

#if defined(MAX_BOUND)

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2945

res = min(res, (uchar4)MAX_BOUND);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2946

#endif // defined(MAX_BOUND)

2947

2948

// Store the result

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2949

vstore4(res, 0, dst_addr);

Georgios Pinitas