Blame - src/core/CL/cl_kernels/gemmlowp.cl - ml/ComputeLibrary

2017-11-21 10:57:50 +0000

[diff] [blame]

199

*

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

200

* @note The number of matrix B columns needs to be passed at compile time using -DCOLS_B: e.g. -DCOLS_B=1024

201

* @note The transposition width step (mult_transpose1xW_width * 4) must be passed at compile time using -DTRANSPOSE1XW_WIDTH_STEP (i.e. -DTRANSPOSE1XW_WIDTH_STEP=2)

202

* @note The multiplication factor for the height of the 4x4 interleaved block must be passed at compile time using -DMULT_INTERLEAVE4X4_HEIGHT (i.e. -DMULT_INTERLEAVE4X4_HEIGHT=2)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

203

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

204

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

205

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

206

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

207

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

208

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

209

*

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

210

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

211

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

212

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

213

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

214

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

215

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

216

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

217

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

218

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

219

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

220

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

221

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

222

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

223

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

224

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

225

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

226

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

227

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

228

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

229

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

230

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

231

* @param[in] cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

232

*/

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

233

__kernel void gemmlowp_mm_interleaved_transposed_midgard(IMAGE_DECLARATION(src0),

234

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

235

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_OUTPUT_AS_3D)

240

,

241

uint cross_plane_pad

242

#endif // REINTERPRET_OUTPUT_AS_3D

243

)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

244

{

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

245

const int x = get_global_id(0) / TRANSPOSE1XW_WIDTH_STEP;

246

const int y = get_global_id(1) / MULT_INTERLEAVE4X4_HEIGHT;

247

const int z = get_global_id(2);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

248

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

249

// Offset

250

const int offset_row_a = (get_global_id(1) % MULT_INTERLEAVE4X4_HEIGHT) * 4;

251

const int offset_row_b = (get_global_id(0) % TRANSPOSE1XW_WIDTH_STEP) * 4;

252

253

// src_addr_a = address of matrix A

254

// src_addr_b = address of matrix B

Isabella Gottardi

b92805b

2018-09-28 18:24:27 +0100

[diff] [blame]

255

__global uchar *src_addr_a = (__global uchar *)(src0_ptr + z * src0_stride_z + y * src0_stride_y + src0_offset_first_element_in_bytes);

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

256

__global uchar *src_addr_b = (__global uchar *)(src1_ptr + x * src1_stride_y + src1_offset_first_element_in_bytes);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

257

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

258

#if defined(MATRIX_B_DEPTH)

259

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

260

src_addr_b += (z % MATRIX_B_DEPTH) * src1_stride_z;

261

#else // defined(MATRIX_B_DEPTH)

262

src_addr_b += z * src1_stride_z;

263

#endif // defined(MATRIX_B_DEPTH)

264

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

265

// Compute end row address for matrix B

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

266

__global uchar *src_end_addr_b = src_addr_b + COLS_B;

267

268

src_addr_a += offset_row_a;

269

src_addr_b += offset_row_b;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

270

271

// Reset accumulators

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

int4 c00 = 0;

int4 c10 = 0;

int4 c20 = 0;

int4 c30 = 0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

276

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

277

for(; src_addr_b <= (src_end_addr_b - (int)(8 * TRANSPOSE1XW_WIDTH_STEP)); src_addr_a += 8 * MULT_INTERLEAVE4X4_HEIGHT, src_addr_b += 8 * TRANSPOSE1XW_WIDTH_STEP)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

278

{

279

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

280

int4 a0 = convert_int4(vload4(0, src_addr_a));

281

int4 b0 = convert_int4(vload4(0, src_addr_b));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

282

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

283

c00 += (int4)a0.s0 * b0;

284

c10 += (int4)a0.s1 * b0;

285

c20 += (int4)a0.s2 * b0;

286

c30 += (int4)a0.s3 * b0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

287

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

288

a0 = convert_int4(vload4(0, src_addr_a + 4 * MULT_INTERLEAVE4X4_HEIGHT));

289

b0 = convert_int4(vload4(0, src_addr_b + 4 * TRANSPOSE1XW_WIDTH_STEP));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

290

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

291

c00 += (int4)a0.s0 * b0;

292

c10 += (int4)a0.s1 * b0;

293

c20 += (int4)a0.s2 * b0;

294

c30 += (int4)a0.s3 * b0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

295

}

296

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

297

for(; src_addr_b < src_end_addr_b; src_addr_a += (4 * MULT_INTERLEAVE4X4_HEIGHT), src_addr_b += (4 * TRANSPOSE1XW_WIDTH_STEP))

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

298

{

299

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

300

int4 a0 = convert_int4(vload4(0, src_addr_a));

301

int4 b0 = convert_int4(vload4(0, src_addr_b));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

302

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

303

c00 += (int4)a0.s0 * b0;

304

c10 += (int4)a0.s1 * b0;

305

c20 += (int4)a0.s2 * b0;

306

c30 += (int4)a0.s3 * b0;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

307

}

308

309

// Compute destination address

310

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

311

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

312

#if defined(REINTERPRET_OUTPUT_AS_3D)

313

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

314

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

320

// |******************|

321

// | cross_plane_pad |

322

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

327

328

// The plane (zout) is calculated dividing M (get_global_id(1) * 4) by HEIGHT_GEMM3D

329

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * 4)) / (uint4)HEIGHT_GEMM3D;

330

zout = min(DEPTH_GEMM3D - 1, zout);

331

332

// Add offset due to the cross plane paddings

333

zout *= (cross_plane_pad * dst_stride_y);

334

335

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

336

// multiply dst_stride_z by DEPTH_GEMM3D

337

dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;

338

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

339

// Store 4x4 block

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

340

vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

341

vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

342

vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

343

vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

344

345

#else // defined(REINTERPRET_OUTPUT_AS_3D)

346

// Add offset for batched GEMM

347

dst.ptr += z * dst_stride_z;

348

349

// Store 4x4 block

350

vstore4(c00, 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

351

vstore4(c10, 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

352

vstore4(c20, 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

353

vstore4(c30, 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

354

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

355

}

Gian Marco

2018-01-30 13:35:54 +0000

[diff] [blame]

356

#endif // defined(COLS_B) && defined(MULT_INTERLEAVE4X4_HEIGHT) && defined(TRANSPOSE1XW_WIDTH_STEP)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

357

358

#if defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)

359

#define VECTOR_UCHAR VEC_DATA_TYPE(uchar, NUM_ELEMS_PROCESSED_PER_THREAD_X)

360

#define VECTOR_UINT VEC_DATA_TYPE(uint, NUM_ELEMS_PROCESSED_PER_THREAD_X)

361

#define VECTOR_INT VEC_DATA_TYPE(int, NUM_ELEMS_PROCESSED_PER_THREAD_X)

362

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

363

*

364

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

365

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

366

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

367

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

368

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

369

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

370

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

371

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

372

*

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

373

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

374

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

375

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

376

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

377

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

378

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

379

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

380

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

381

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

382

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

383

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

384

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

385

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

386

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

387

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

388

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

389

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

390

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

391

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

392

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

393

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

394

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

395

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

396

*/

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

397

__kernel void gemmlowp_mm_midgard(IMAGE_DECLARATION(src0),

398

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

399

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

404

,

405

uint src_cross_plane_pad

406

#endif // REINTERPRET_INPUT_AS_3D

407

#if defined(REINTERPRET_OUTPUT_AS_3D)

408

,

409

uint dst_cross_plane_pad

410

#endif // REINTERPRET_OUTPUT_AS_3D

411

)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

412

{

413

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

414

415

// Compute starting address for matrix A and Matrix B

416

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

417

418

// Update address for the matrix A

419

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

420

421

// Update address for the matrix B

422

src_addr.s1 += idx;

423

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

424

#if defined(REINTERPRET_INPUT_AS_3D)

425

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

426

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

432

// |******************|

433

// | cross_plane_pad |

434

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

439

440

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

441

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

442

zin = min(DEPTH_GEMM3D - 1, zin);

443

444

// Add offset due to the cross plane paddings

445

zin *= (src_cross_plane_pad * src0_stride_y);

446

447

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

448

// multiply src0_stride_z by DEPTH_GEMM3D

449

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

450

451

#else // defined(REINTERPRET_INPUT_AS_3D)

452

453

// Add offset for batched GEMM

454

src_addr.s0 += get_global_id(2) * src0_stride_z;

455

456

#endif // defined(REINTERPRET_INPUT_AS_3D)

457

458

#if defined(MATRIX_B_DEPTH)

459

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

460

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

461

#else // defined(MATRIX_B_DEPTH)

462

src_addr.s1 += get_global_id(2) * src1_stride_z;

463

#endif // defined(MATRIX_B_DEPTH)

464

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

465

int end_row_vec_a = src_addr.s0 + COLS_A;

466

467

VECTOR_UINT acc0 = 0;

468

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

469

VECTOR_UINT acc1 = 0;

470

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

471

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

472

VECTOR_UINT acc2 = 0;

473

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

474

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

475

VECTOR_UINT acc3 = 0;

476

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

477

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

478

VECTOR_UINT acc4 = 0;

479

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

480

481

for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))

482

{

483

// Load values from matrix A

484

uchar2 a0 = vload2(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);

485

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

486

uchar2 a1 = vload2(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);

487

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

488

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

489

uchar2 a2 = vload2(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);

490

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

491

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

492

uchar2 a3 = vload2(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);

493

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

494

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

495

uchar2 a4 = vload2(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);

496

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

497

// Load values from matrix B

498

VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

499

VECTOR_UCHAR b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1 + src1_stride_y);

500

501

// Accumulate

502

acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0.s0;

503

acc0 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a0.s1;

504

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

505

acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1.s0;

506

acc1 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a1.s1;

507

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

508

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

509

acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2.s0;

510

acc2 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a2.s1;

511

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

512

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

513

acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3.s0;

514

acc3 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a3.s1;

515

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

516

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

517

acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4.s0;

518

acc4 += CONVERT(b1, VECTOR_UINT) * (VECTOR_UINT)a4.s1;

519

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

520

}

521

522

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))

523

{

524

// Load values from matrix A

525

uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);

526

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

527

uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);

528

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

529

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

530

uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);

531

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

532

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

533

uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);

534

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

535

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

536

uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);

537

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

538

// Load values from matrix B

539

VECTOR_UCHAR b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, src1_ptr + src_addr.s1);

540

541

// Accumulate

542

acc0 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a0;

543

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

544

acc1 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a1;

545

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

546

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

547

acc2 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a2;

548

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

549

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

550

acc3 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a3;

551

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

552

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

553

acc4 += CONVERT(b0, VECTOR_UINT) * (VECTOR_UINT)a4;

554

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

555

}

556

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

557

const int z = get_global_id(2);

558

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

559

// Compute destination address

560

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

561

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

562

#if defined(REINTERPRET_OUTPUT_AS_3D)

563

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

564

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

570

// |******************|

571

// | cross_plane_pad |

572

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

577

578

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

579

uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;

580

zout = min(DEPTH_GEMM3D - 1, zout);

581

582

// Add offset due to the cross plane paddings

583

zout *= (dst_cross_plane_pad * dst_stride_y);

584

585

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

586

// multiply dst_stride_z by DEPTH_GEMM3D

587

dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;

588

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

589

// Store the result

590

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

591

(CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

592

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

593

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

594

(CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

595

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

596

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

597

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

598

(CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

599

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

600

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

601

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

602

(CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

603

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

604

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

605

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

606

(CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

607

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

608

609

#else // defined(REINTERPRET_OUTPUT_AS_3D)

610

// Add offset for batched GEMM

611

dst.ptr += z * dst_stride_z;

612

613

// Store the result

614

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

615

(CONVERT(acc0, VECTOR_INT), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

616

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

617

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

618

(CONVERT(acc1, VECTOR_INT), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

619

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

620

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

621

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

622

(CONVERT(acc2, VECTOR_INT), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

623

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

624

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

625

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

626

(CONVERT(acc3, VECTOR_INT), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

627

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

628

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

629

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

630

(CONVERT(acc4, VECTOR_INT), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));

631

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

632

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

633

}

634

635

/** OpenCL kernel optimized for Bifrost architectures that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

636

*

637

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

638

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

639

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

640

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

641

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

642

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

643

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

644

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

645

*

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

646

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

647

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

648

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

649

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

650

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

651

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

652

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

653

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

654

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

655

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

656

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

657

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

658

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

659

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

660

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

661

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

662

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

663

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

664

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

665

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

666

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

667

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

668

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

669

*/

670

__kernel void gemmlowp_mm_bifrost(IMAGE_DECLARATION(src0),

671

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

672

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

677

,

678

uint src_cross_plane_pad

679

#endif // REINTERPRET_INPUT_AS_3D

680

#if defined(REINTERPRET_OUTPUT_AS_3D)

681

,

682

uint dst_cross_plane_pad

683

#endif // REINTERPRET_OUTPUT_AS_3D

684

)

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

685

{

686

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

687

688

// Compute starting address for matrix A and Matrix B

689

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

690

691

// Update address for the matrix A

692

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

693

694

// Update address for the matrix B

695

src_addr.s1 += idx;

696

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

697

#if defined(REINTERPRET_INPUT_AS_3D)

698

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

699

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

705

// |******************|

706

// | cross_plane_pad |

707

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

712

713

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

714

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

715

zin = min(DEPTH_GEMM3D - 1, zin);

716

717

// Add offset due to the cross plane paddings

718

zin *= (src_cross_plane_pad * src0_stride_y);

719

720

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

721

// multiply src0_stride_z by DEPTH_GEMM3D

722

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

723

724

#else // defined(REINTERPRET_INPUT_AS_3D)

725

726

// Add offset for batched GEMM

727

src_addr.s0 += get_global_id(2) * src0_stride_z;

728

729

#endif // defined(REINTERPRET_INPUT_AS_3D)

730

731

#if defined(MATRIX_B_DEPTH)

732

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

733

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

734

#else // defined(MATRIX_B_DEPTH)

735

src_addr.s1 += get_global_id(2) * src1_stride_z;

736

#endif // defined(MATRIX_B_DEPTH)

737

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

738

int end_row_vec_a = src_addr.s0 + COLS_A;

uint acc00 = 0;

uint acc01 = 0;

uint acc02 = 0;

uint acc03 = 0;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

uint acc10 = 0;

uint acc11 = 0;

uint acc12 = 0;

uint acc13 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

750

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

uint acc20 = 0;

uint acc21 = 0;

uint acc22 = 0;

uint acc23 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

756

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

uint acc30 = 0;

uint acc31 = 0;

uint acc32 = 0;

uint acc33 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

762

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

uint acc40 = 0;

uint acc41 = 0;

uint acc42 = 0;

uint acc43 = 0;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

768

769

for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))

770

{

771

// Load values from matrix A

772

uchar4 a0 = vload4(0, src0_ptr + src_addr.s0 + 0 * src0_stride_y);

773

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

774

uchar4 a1 = vload4(0, src0_ptr + src_addr.s0 + 1 * src0_stride_y);

775

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

776

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

777

uchar4 a2 = vload4(0, src0_ptr + src_addr.s0 + 2 * src0_stride_y);

778

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

779

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

780

uchar4 a3 = vload4(0, src0_ptr + src_addr.s0 + 3 * src0_stride_y);

781

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

782

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

783

uchar4 a4 = vload4(0, src0_ptr + src_addr.s0 + 4 * src0_stride_y);

784

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

785

// Load values from matrix B

786

uchar4 b0 = vload4(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);

787

uchar4 b1 = vload4(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);

788

uchar4 b2 = vload4(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);

789

uchar4 b3 = vload4(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

{

// Accumulate

ushort tmp0 = (ushort)b0.s0 * (ushort)a0.s0;

794

ushort tmp1 = (ushort)b0.s1 * (ushort)a0.s0;

795

ushort tmp2 = (ushort)b0.s2 * (ushort)a0.s0;

796

ushort tmp3 = (ushort)b0.s3 * (ushort)a0.s0;

797

798

ushort tmp4 = (ushort)b1.s0 * (ushort)a0.s1;

799

ushort tmp5 = (ushort)b1.s1 * (ushort)a0.s1;

800

ushort tmp6 = (ushort)b1.s2 * (ushort)a0.s1;

801

ushort tmp7 = (ushort)b1.s3 * (ushort)a0.s1;

802

803

ushort tmp8 = (ushort)b2.s0 * (ushort)a0.s2;

804

ushort tmp9 = (ushort)b2.s1 * (ushort)a0.s2;

805

ushort tmpA = (ushort)b2.s2 * (ushort)a0.s2;

806

ushort tmpB = (ushort)b2.s3 * (ushort)a0.s2;

807

808

ushort tmpC = (ushort)b3.s0 * (ushort)a0.s3;

809

ushort tmpD = (ushort)b3.s1 * (ushort)a0.s3;

810

ushort tmpE = (ushort)b3.s2 * (ushort)a0.s3;

811

ushort tmpF = (ushort)b3.s3 * (ushort)a0.s3;

812

813

acc00 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

814

acc01 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

815

acc02 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

816

acc03 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

817

}

818

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

819

{

820

// Accumulate

821

ushort tmp0 = (ushort)b0.s0 * (ushort)a1.s0;

822

ushort tmp1 = (ushort)b0.s1 * (ushort)a1.s0;

823

ushort tmp2 = (ushort)b0.s2 * (ushort)a1.s0;

824

ushort tmp3 = (ushort)b0.s3 * (ushort)a1.s0;

825

826

ushort tmp4 = (ushort)b1.s0 * (ushort)a1.s1;

827

ushort tmp5 = (ushort)b1.s1 * (ushort)a1.s1;

828

ushort tmp6 = (ushort)b1.s2 * (ushort)a1.s1;

829

ushort tmp7 = (ushort)b1.s3 * (ushort)a1.s1;

830

831

ushort tmp8 = (ushort)b2.s0 * (ushort)a1.s2;

832

ushort tmp9 = (ushort)b2.s1 * (ushort)a1.s2;

833

ushort tmpA = (ushort)b2.s2 * (ushort)a1.s2;

834

ushort tmpB = (ushort)b2.s3 * (ushort)a1.s2;

835

836

ushort tmpC = (ushort)b3.s0 * (ushort)a1.s3;

837

ushort tmpD = (ushort)b3.s1 * (ushort)a1.s3;

838

ushort tmpE = (ushort)b3.s2 * (ushort)a1.s3;

839

ushort tmpF = (ushort)b3.s3 * (ushort)a1.s3;

840

841

acc10 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

842

acc11 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

843

acc12 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

844

acc13 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

845

}

846

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

847

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

848

{

849

// Accumulate

850

ushort tmp0 = (ushort)b0.s0 * (ushort)a2.s0;

851

ushort tmp1 = (ushort)b0.s1 * (ushort)a2.s0;

852

ushort tmp2 = (ushort)b0.s2 * (ushort)a2.s0;

853

ushort tmp3 = (ushort)b0.s3 * (ushort)a2.s0;

854

855

ushort tmp4 = (ushort)b1.s0 * (ushort)a2.s1;

856

ushort tmp5 = (ushort)b1.s1 * (ushort)a2.s1;

857

ushort tmp6 = (ushort)b1.s2 * (ushort)a2.s1;

858

ushort tmp7 = (ushort)b1.s3 * (ushort)a2.s1;

859

860

ushort tmp8 = (ushort)b2.s0 * (ushort)a2.s2;

861

ushort tmp9 = (ushort)b2.s1 * (ushort)a2.s2;

862

ushort tmpA = (ushort)b2.s2 * (ushort)a2.s2;

863

ushort tmpB = (ushort)b2.s3 * (ushort)a2.s2;

864

865

ushort tmpC = (ushort)b3.s0 * (ushort)a2.s3;

866

ushort tmpD = (ushort)b3.s1 * (ushort)a2.s3;

867

ushort tmpE = (ushort)b3.s2 * (ushort)a2.s3;

868

ushort tmpF = (ushort)b3.s3 * (ushort)a2.s3;

869

870

acc20 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

871

acc21 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

872

acc22 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

873

acc23 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

874

}

875

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

876

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

877

{

878

// Accumulate

879

ushort tmp0 = (ushort)b0.s0 * (ushort)a3.s0;

880

ushort tmp1 = (ushort)b0.s1 * (ushort)a3.s0;

881

ushort tmp2 = (ushort)b0.s2 * (ushort)a3.s0;

882

ushort tmp3 = (ushort)b0.s3 * (ushort)a3.s0;

883

884

ushort tmp4 = (ushort)b1.s0 * (ushort)a3.s1;

885

ushort tmp5 = (ushort)b1.s1 * (ushort)a3.s1;

886

ushort tmp6 = (ushort)b1.s2 * (ushort)a3.s1;

887

ushort tmp7 = (ushort)b1.s3 * (ushort)a3.s1;

888

889

ushort tmp8 = (ushort)b2.s0 * (ushort)a3.s2;

890

ushort tmp9 = (ushort)b2.s1 * (ushort)a3.s2;

891

ushort tmpA = (ushort)b2.s2 * (ushort)a3.s2;

892

ushort tmpB = (ushort)b2.s3 * (ushort)a3.s2;

893

894

ushort tmpC = (ushort)b3.s0 * (ushort)a3.s3;

895

ushort tmpD = (ushort)b3.s1 * (ushort)a3.s3;

896

ushort tmpE = (ushort)b3.s2 * (ushort)a3.s3;

897

ushort tmpF = (ushort)b3.s3 * (ushort)a3.s3;

898

899

acc30 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

900

acc31 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

901

acc32 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

902

acc33 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

903

}

904

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

905

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

906

{

907

// Accumulate

908

ushort tmp0 = (ushort)b0.s0 * (ushort)a4.s0;

909

ushort tmp1 = (ushort)b0.s1 * (ushort)a4.s0;

910

ushort tmp2 = (ushort)b0.s2 * (ushort)a4.s0;

911

ushort tmp3 = (ushort)b0.s3 * (ushort)a4.s0;

912

913

ushort tmp4 = (ushort)b1.s0 * (ushort)a4.s1;

914

ushort tmp5 = (ushort)b1.s1 * (ushort)a4.s1;

915

ushort tmp6 = (ushort)b1.s2 * (ushort)a4.s1;

916

ushort tmp7 = (ushort)b1.s3 * (ushort)a4.s1;

917

918

ushort tmp8 = (ushort)b2.s0 * (ushort)a4.s2;

919

ushort tmp9 = (ushort)b2.s1 * (ushort)a4.s2;

920

ushort tmpA = (ushort)b2.s2 * (ushort)a4.s2;

921

ushort tmpB = (ushort)b2.s3 * (ushort)a4.s2;

922

923

ushort tmpC = (ushort)b3.s0 * (ushort)a4.s3;

924

ushort tmpD = (ushort)b3.s1 * (ushort)a4.s3;

925

ushort tmpE = (ushort)b3.s2 * (ushort)a4.s3;

926

ushort tmpF = (ushort)b3.s3 * (ushort)a4.s3;

927

928

acc40 += ((uint)tmp0 + (uint)tmp4 + (uint)tmp8 + (uint)tmpC);

929

acc41 += ((uint)tmp1 + (uint)tmp5 + (uint)tmp9 + (uint)tmpD);

930

acc42 += ((uint)tmp2 + (uint)tmp6 + (uint)tmpA + (uint)tmpE);

931

acc43 += ((uint)tmp3 + (uint)tmp7 + (uint)tmpB + (uint)tmpF);

932

}

933

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

934

}

935

936

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))

937

{

938

// Load values from matrix A

939

uchar a0 = *(src0_ptr + src_addr.s0 + 0 * src0_stride_y);

940

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

941

uchar a1 = *(src0_ptr + src_addr.s0 + 1 * src0_stride_y);

942

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

943

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

944

uchar a2 = *(src0_ptr + src_addr.s0 + 2 * src0_stride_y);

945

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

946

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

947

uchar a3 = *(src0_ptr + src_addr.s0 + 3 * src0_stride_y);

948

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

949

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

950

uchar a4 = *(src0_ptr + src_addr.s0 + 4 * src0_stride_y);

951

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

952

// Load values from matrix B

953

uchar4 b0 = vload4(0, src1_ptr + src_addr.s1);

// Accumulate

{

// Accumulate

ushort tmp0 = (ushort)b0.s0 * (ushort)a0;

959

ushort tmp1 = (ushort)b0.s1 * (ushort)a0;

960

ushort tmp2 = (ushort)b0.s2 * (ushort)a0;

961

ushort tmp3 = (ushort)b0.s3 * (ushort)a0;

962

963

acc00 += ((uint)tmp0);

964

acc01 += ((uint)tmp1);

965

acc02 += ((uint)tmp2);

966

acc03 += ((uint)tmp3);

967

}

968

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

969

{

970

// Accumulate

971

ushort tmp0 = (ushort)b0.s0 * (ushort)a1;

972

ushort tmp1 = (ushort)b0.s1 * (ushort)a1;

973

ushort tmp2 = (ushort)b0.s2 * (ushort)a1;

974

ushort tmp3 = (ushort)b0.s3 * (ushort)a1;

975

976

acc10 += ((uint)tmp0);

977

acc11 += ((uint)tmp1);

978

acc12 += ((uint)tmp2);

979

acc13 += ((uint)tmp3);

980

}

981

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

982

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

983

{

984

// Accumulate

985

ushort tmp0 = (ushort)b0.s0 * (ushort)a2;

986

ushort tmp1 = (ushort)b0.s1 * (ushort)a2;

987

ushort tmp2 = (ushort)b0.s2 * (ushort)a2;

988

ushort tmp3 = (ushort)b0.s3 * (ushort)a2;

989

990

acc20 += ((uint)tmp0);

991

acc21 += ((uint)tmp1);

992

acc22 += ((uint)tmp2);

993

acc23 += ((uint)tmp3);

994

}

995

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

996

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

997

{

998

// Accumulate

999

ushort tmp0 = (ushort)b0.s0 * (ushort)a3;

1000

ushort tmp1 = (ushort)b0.s1 * (ushort)a3;

1001

ushort tmp2 = (ushort)b0.s2 * (ushort)a3;

1002

ushort tmp3 = (ushort)b0.s3 * (ushort)a3;

1003

1004

acc30 += ((uint)tmp0);

1005

acc31 += ((uint)tmp1);

1006

acc32 += ((uint)tmp2);

1007

acc33 += ((uint)tmp3);

1008

}

1009

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1010

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1011

{

1012

// Accumulate

1013

ushort tmp0 = (ushort)b0.s0 * (ushort)a4;

1014

ushort tmp1 = (ushort)b0.s1 * (ushort)a4;

1015

ushort tmp2 = (ushort)b0.s2 * (ushort)a4;

1016

ushort tmp3 = (ushort)b0.s3 * (ushort)a4;

1017

1018

acc40 += ((uint)tmp0);

1019

acc41 += ((uint)tmp1);

1020

acc42 += ((uint)tmp2);

1021

acc43 += ((uint)tmp3);

1022

}

1023

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1024

}

1025

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1026

const int z = get_global_id(2);

1027

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1028

// Compute destination address

1029

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1030

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1031

#if defined(REINTERPRET_OUTPUT_AS_3D)

1032

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

1033

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1039

// |******************|

1040

// | cross_plane_pad |

1041

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1046

1047

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1048

uint8 zout = ((uint8)(0, 1, 2, 3, 4, 5, 6, 7) + (uint8)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint8)HEIGHT_GEMM3D;

1049

zout = min(DEPTH_GEMM3D - 1, zout);

1050

1051

// Add offset due to the cross plane paddings

1052

zout *= (dst_cross_plane_pad * dst_stride_y);

1053

1054

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1055

// multiply dst_stride_z by DEPTH_GEMM3D

1056

dst.ptr += z * dst_stride_z * DEPTH_GEMM3D;

1057

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1058

// Store the result

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1059

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y + zout.s0));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1060

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1061

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y + zout.s1));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1062

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1063

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1064

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y + zout.s2));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1065

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1066

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1067

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y + zout.s3));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1068

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1069

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1070

vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y + zout.s4));

Gian Marco

2018-01-10 15:56:30 +0000

[diff] [blame]

1071

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1072

1073

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1074

// Add offset for batched GEMM

1075

dst.ptr += z * dst_stride_z;

1076

1077

// Store the result

1078

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst.ptr + 0 * dst_stride_y));

1079

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1080

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst.ptr + 1 * dst_stride_y));

1081

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1082

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1083

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst.ptr + 2 * dst_stride_y));

1084

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1085

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1086

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst.ptr + 3 * dst_stride_y));

1087

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1088

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1089

vstore4((int4)(acc40, acc41, acc42, acc43), 0, (__global int *)(dst.ptr + 4 * dst_stride_y));

1090

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 4

1091

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1092

}

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1093

Georgios Pinitas

daa3855

2018-08-28 17:43:18 +0100

[diff] [blame]

1094

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1095

/** OpenCL kernel optimized to use dot product that computes the matrix multiplication between matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

1096

*

1097

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

1098

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1099

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1100

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1101

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1102

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1103

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1104

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns matrix A NOT reshaped

1105

*

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1106

* @param[in] src0_ptr Pointer to the source matrix. Supported data type: QASYMM8

1107

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1108

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1109

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1110

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1111

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1112

* @param[in] src1_ptr Pointer to the source matrix. Supported data type: same as @p src0_ptr

1113

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1114

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1115

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1116

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1117

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1118

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: S32

1119

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1120

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1121

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1122

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1123

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1124

* @param[in] src0_stride_z Stride of the source matrix in Z dimension (in bytes)

1125

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1126

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1127

* @param[in] src_cross_plane_pad (Optional) Bottom paddings in unit of elements for the input tensor (only if defined REINTERPRET_INPUT_AS_3D)

1128

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements for the output tensor (only if defined REINTERPRET_OUTPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1129

*/

1130

__kernel void gemmlowp_mm_bifrost_dot8(IMAGE_DECLARATION(src0),

1131

IMAGE_DECLARATION(src1),

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1132

IMAGE_DECLARATION(dst),

uint src0_stride_z,

uint src1_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

1137

,

1138

uint src_cross_plane_pad

1139

#endif // REINTERPRET_INPUT_AS_3D

1140

#if defined(REINTERPRET_OUTPUT_AS_3D)

1141

,

1142

uint dst_cross_plane_pad

1143

#endif // REINTERPRET_OUTPUT_AS_3D)

1144

)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1145

{

1146

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1147

1148

// Compute starting address for matrix A and Matrix B

1149

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1150

1151

// Update address for the matrix A

1152

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1153

1154

// Update address for the matrix B

1155

src_addr.s1 += idx;

1156

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1157

#if defined(REINTERPRET_INPUT_AS_3D)

1158

// Since we load a 2D input tile from a 3D tensor, we need to check when the plane changes across the z dimension

1159

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1165

// |******************|

1166

// | cross_plane_pad |

1167

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1172

1173

// The plane (zin) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

1174

uint4 zin = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

1175

zin = min(DEPTH_GEMM3D - 1, zin);

1176

1177

// Add offset due to the cross plane paddings

1178

zin *= (src_cross_plane_pad * src0_stride_y);

1179

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1180

zin += ((uint4)(0, 1, 2, 3)) * src0_stride_y;

1181

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1182

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1183

// multiply src0_stride_z by DEPTH_GEMM3D

1184

src_addr.s0 += get_global_id(2) * src0_stride_z * DEPTH_GEMM3D;

1185

1186

#else // defined(REINTERPRET_INPUT_AS_3D)

1187

1188

// Add offset for batched GEMM

1189

src_addr.s0 += get_global_id(2) * src0_stride_z;

1190

1191

#endif // defined(REINTERPRET_INPUT_AS_3D)

1192

1193

#if defined(MATRIX_B_DEPTH)

1194

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1195

src_addr.s1 += (get_global_id(2) % MATRIX_B_DEPTH) * src1_stride_z;

1196

#else // defined(MATRIX_B_DEPTH)

1197

src_addr.s1 += get_global_id(2) * src1_stride_z;

1198

#endif // defined(MATRIX_B_DEPTH)

1199

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

uint acc00 = 0;

uint acc01 = 0;

uint acc02 = 0;

uint acc03 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc04 = 0;

uint acc05 = 0;

uint acc06 = 0;

uint acc07 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1208

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

uint acc10 = 0;

uint acc11 = 0;

uint acc12 = 0;

uint acc13 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc14 = 0;

uint acc15 = 0;

uint acc16 = 0;

uint acc17 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1217

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1218

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

uint acc20 = 0;

uint acc21 = 0;

uint acc22 = 0;

uint acc23 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc24 = 0;

uint acc25 = 0;

uint acc26 = 0;

uint acc27 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1227

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1228

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

uint acc30 = 0;

uint acc31 = 0;

uint acc32 = 0;

uint acc33 = 0;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

uint acc34 = 0;

uint acc35 = 0;

uint acc36 = 0;

uint acc37 = 0;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1237

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1238

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1239

// A and B src indices get incremented at the same time.

1240

int i = 0;

1241

for(; i <= ((int)COLS_A - 8); i += 8)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1242

{

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1243

#if defined(REINTERPRET_INPUT_AS_3D)

1244

// Load values from matrix A and matrix B

1245

uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1246

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1247

uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1248

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1249

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1250

uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1251

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1252

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1253

uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1254

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1255

#else // defined(REINTERPRET_INPUT_AS_3D)

1256

// Load values from matrix A and matrix B

1257

uchar8 a0 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1258

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1259

uchar8 a1 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1260

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1261

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1262

uchar8 a2 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1263

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1264

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1265

uchar8 a3 = vload8(0, (__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1266

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1267

#endif // defined(REINTERPRET_INPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1268

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1269

uchar8 b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);

1270

uchar8 b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);

1271

uchar8 b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);

1272

uchar8 b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

1273

src_addr.s1 += 4 * src1_stride_y;

1274

1275

ARM_DOT(a0.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);

1276

ARM_DOT(a0.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);

1277

ARM_DOT(a0.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);

1278

ARM_DOT(a0.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);

1279

ARM_DOT(a0.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);

1280

ARM_DOT(a0.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);

1281

ARM_DOT(a0.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);

1282

ARM_DOT(a0.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);

1283

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1284

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1285

ARM_DOT(a1.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);

1286

ARM_DOT(a1.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);

1287

ARM_DOT(a1.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);

1288

ARM_DOT(a1.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);

1289

ARM_DOT(a1.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);

1290

ARM_DOT(a1.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);

1291

ARM_DOT(a1.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);

1292

ARM_DOT(a1.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1293

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1294

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1295

ARM_DOT(a2.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);

1296

ARM_DOT(a2.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);

1297

ARM_DOT(a2.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);

1298

ARM_DOT(a2.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);

1299

ARM_DOT(a2.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);

1300

ARM_DOT(a2.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);

1301

ARM_DOT(a2.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);

1302

ARM_DOT(a2.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1303

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1304

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1305

ARM_DOT(a3.s0123, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);

1306

ARM_DOT(a3.s0123, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);

1307

ARM_DOT(a3.s0123, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);

1308

ARM_DOT(a3.s0123, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);

1309

ARM_DOT(a3.s0123, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);

1310

ARM_DOT(a3.s0123, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);

1311

ARM_DOT(a3.s0123, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);

1312

ARM_DOT(a3.s0123, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1313

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1314

1315

b0 = vload8(0, src1_ptr + src_addr.s1 + 0 * src1_stride_y);

1316

b1 = vload8(0, src1_ptr + src_addr.s1 + 1 * src1_stride_y);

1317

b2 = vload8(0, src1_ptr + src_addr.s1 + 2 * src1_stride_y);

1318

b3 = vload8(0, src1_ptr + src_addr.s1 + 3 * src1_stride_y);

1319

src_addr.s1 += 4 * src1_stride_y;

1320

1321

ARM_DOT(a0.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc00);

1322

ARM_DOT(a0.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc01);

1323

ARM_DOT(a0.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc02);

1324

ARM_DOT(a0.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc03);

1325

ARM_DOT(a0.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc04);

1326

ARM_DOT(a0.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc05);

1327

ARM_DOT(a0.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc06);

1328

ARM_DOT(a0.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc07);

1329

1330

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1331

ARM_DOT(a1.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc10);

1332

ARM_DOT(a1.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc11);

1333

ARM_DOT(a1.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc12);

1334

ARM_DOT(a1.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc13);

1335

ARM_DOT(a1.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc14);

1336

ARM_DOT(a1.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc15);

1337

ARM_DOT(a1.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc16);

1338

ARM_DOT(a1.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc17);

1339

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1340

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1341

ARM_DOT(a2.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc20);

1342

ARM_DOT(a2.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc21);

1343

ARM_DOT(a2.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc22);

1344

ARM_DOT(a2.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc23);

1345

ARM_DOT(a2.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc24);

1346

ARM_DOT(a2.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc25);

1347

ARM_DOT(a2.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc26);

1348

ARM_DOT(a2.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc27);

1349

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1350

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1351

ARM_DOT(a3.s4567, (uchar4)(b0.s0, b1.s0, b2.s0, b3.s0), acc30);

1352

ARM_DOT(a3.s4567, (uchar4)(b0.s1, b1.s1, b2.s1, b3.s1), acc31);

1353

ARM_DOT(a3.s4567, (uchar4)(b0.s2, b1.s2, b2.s2, b3.s2), acc32);

1354

ARM_DOT(a3.s4567, (uchar4)(b0.s3, b1.s3, b2.s3, b3.s3), acc33);

1355

ARM_DOT(a3.s4567, (uchar4)(b0.s4, b1.s4, b2.s4, b3.s4), acc34);

1356

ARM_DOT(a3.s4567, (uchar4)(b0.s5, b1.s5, b2.s5, b3.s5), acc35);

1357

ARM_DOT(a3.s4567, (uchar4)(b0.s6, b1.s6, b2.s6, b3.s6), acc36);

1358

ARM_DOT(a3.s4567, (uchar4)(b0.s7, b1.s7, b2.s7, b3.s7), acc37);

1359

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1360

1361

src_addr.s0 += 8;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1362

}

1363

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1364

for(; i < (int)COLS_A; ++i)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1365

{

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1366

#if defined(REINTERPRET_INPUT_AS_3D)

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1367

// Load values from matrix A

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1368

uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s0));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1369

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1370

uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s1));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1371

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1372

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1373

uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s2));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1374

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1375

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1376

uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + zin.s3));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1377

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1378

#else // defined(REINTERPRET_INPUT_AS_3D)

1379

// Load values from matrix A

1380

uchar a0 = *((__global uchar *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1381

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1382

uchar a1 = *((__global uchar *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1383

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1384

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1385

uchar a2 = *((__global uchar *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1386

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1387

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1388

uchar a3 = *((__global uchar *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1389

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1390

#endif // defined(REINTERPRET_INPUT_AS_3D)

1391

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1392

// Load values from matrix B

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1393

uchar8 b0 = vload8(0, src1_ptr + src_addr.s1);

1394

src_addr.s1 += src1_stride_y;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1395

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1396

acc00 += (uint)a0 * b0.s0;

1397

acc01 += (uint)a0 * b0.s1;

1398

acc02 += (uint)a0 * b0.s2;

1399

acc03 += (uint)a0 * b0.s3;

1400

acc04 += (uint)a0 * b0.s4;

1401

acc05 += (uint)a0 * b0.s5;

1402

acc06 += (uint)a0 * b0.s6;

1403

acc07 += (uint)a0 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1404

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1405

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1406

acc10 += (uint)a1 * b0.s0;

1407

acc11 += (uint)a1 * b0.s1;

1408

acc12 += (uint)a1 * b0.s2;

1409

acc13 += (uint)a1 * b0.s3;

1410

acc14 += (uint)a1 * b0.s4;

1411

acc15 += (uint)a1 * b0.s5;

1412

acc16 += (uint)a1 * b0.s6;

1413

acc17 += (uint)a1 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1414

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1415

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1416

acc20 += (uint)a2 * b0.s0;

1417

acc21 += (uint)a2 * b0.s1;

1418

acc22 += (uint)a2 * b0.s2;

1419

acc23 += (uint)a2 * b0.s3;

1420

acc24 += (uint)a2 * b0.s4;

1421

acc25 += (uint)a2 * b0.s5;

1422

acc26 += (uint)a2 * b0.s6;

1423

acc27 += (uint)a2 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1424

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1425

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1426

acc30 += (uint)a3 * b0.s0;

1427

acc31 += (uint)a3 * b0.s1;

1428

acc32 += (uint)a3 * b0.s2;

1429

acc33 += (uint)a3 * b0.s3;

1430

acc34 += (uint)a3 * b0.s4;

1431

acc35 += (uint)a3 * b0.s5;

1432

acc36 += (uint)a3 * b0.s6;

1433

acc37 += (uint)a3 * b0.s7;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1434

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1435

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1436

src_addr.s0 += 1;

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1437

}

1438

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1439

int z = get_global_id(2);

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1440

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1441

// Compute destination address

1442

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1443

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1444

// Compute dst address

1445

__global uchar *dst_addr = dst.ptr;

1446

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1447

#if defined(REINTERPRET_OUTPUT_AS_3D)

1448

// Since we store a 2D output tile in a 3D tensor, we need to check when the plane changes across the z dimension

1449

// in order to take into account the presence of possible cross plane paddings

//

// | |

// | plane0 |

// | |

// |__________________|

1455

// |******************|

1456

// | cross_plane_pad |

1457

// |******************|

// | |

// | plane1 |

// | |

// |__________________|

1462

1463

// The plane (zout) is calculated dividing M (get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y) by HEIGHT_GEMM3D

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1464

uint4 zout = ((uint4)(0, 1, 2, 3) + (uint4)(get_global_id(1) * NUM_ELEMS_PROCESSED_PER_THREAD_Y)) / (uint4)HEIGHT_GEMM3D;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1465

zout = min(DEPTH_GEMM3D - 1, zout);

1466

1467

// Add offset due to the cross plane paddings

1468

zout *= (dst_cross_plane_pad * dst_stride_y);

1469

1470

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1471

// multiply dst_stride_z by DEPTH_GEMM3D

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1472

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1473

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1474

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1475

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));

1476

vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y + zout.s0));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1477

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1478

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));

1479

vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y + zout.s1));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1480

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1481

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1482

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));

1483

vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y + zout.s2));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1484

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1485

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1486

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));

1487

vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y + zout.s3));

Giorgio Arena

2018-07-06 17:06:36 +0100

[diff] [blame]

1488

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1489

1490

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1491

// Add offset for batched GEMM

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1492

dst_addr += z * dst_stride_z;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1493

1494

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1495

vstore4((int4)(acc00, acc01, acc02, acc03), 0, (__global int *)(dst_addr + 0 * dst_stride_y));

1496

vstore4((int4)(acc04, acc05, acc06, acc07), 1, (__global int *)(dst_addr + 0 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1497

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1498

vstore4((int4)(acc10, acc11, acc12, acc13), 0, (__global int *)(dst_addr + 1 * dst_stride_y));

1499

vstore4((int4)(acc14, acc15, acc16, acc17), 1, (__global int *)(dst_addr + 1 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1500

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1501

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1502

vstore4((int4)(acc20, acc21, acc22, acc23), 0, (__global int *)(dst_addr + 2 * dst_stride_y));

1503

vstore4((int4)(acc24, acc25, acc26, acc27), 1, (__global int *)(dst_addr + 2 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1504

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1505

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1506

vstore4((int4)(acc30, acc31, acc32, acc33), 0, (__global int *)(dst_addr + 3 * dst_stride_y));

1507

vstore4((int4)(acc34, acc35, acc36, acc37), 0, (__global int *)(dst_addr + 3 * dst_stride_y));

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

1508

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1509

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1510

}

Georgios Pinitas

daa3855

2018-08-28 17:43:18 +0100

[diff] [blame]

1511

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1512

#endif // defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_Y) && defined(COLS_A)

1513

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1514

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(M) && defined(N)

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1515

/** This OpenCL kernel computes the matrix multiplication between 2 matrices with QASYMM data type.

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1516

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

1517

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

1518

*

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1519

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

1520

* @note The GEMM's dimensions M and N must be passed at compile time using -DM and -DN (i.e. -DM=52 and -DN=90).

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1521

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (i.e. -DM0=4, -DN0=8, -DK0=4).

1522

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (i.e. -DV0=2)

1523

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)

1524

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must passed at compile time.

1525

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.

1526

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1527

* - M0 = 2, 3, 4, 5, 6, 7, 8

1528

* - N0 = 2, 3, 4, 8, 16

1529

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1530

* - V0 >= 1

1531

* - H0 >= 1

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1532

*

1533

* @note In case the output has to be reinterpreted as a 3D tensor (i.e. output of convolution layer), the following information must be passed at compile time:

1534

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1535

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1536

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1537

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

1538

*

1539

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: QASYMM8

1540

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

1541

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1542

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

1543

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1544

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

1545

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1546

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1547

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1548

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1549

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1550

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

1551

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1552

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1553

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1554

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1555

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1556

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1557

* @param[in] k Number of columns in LHS matrix and rows in RHS matrix not reshaped.

1558

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

1559

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1560

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1561

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

1562

*/

1563

__kernel void gemmlowp_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

1564

IMAGE_DECLARATION(rhs),

1565

IMAGE_DECLARATION(dst),

uint k,

uint lhs_stride_z,

uint rhs_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_OUTPUT_AS_3D)

1571

,

1572

uint dst_cross_plane_pad

1573

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define LHS_BLOCK_SIZE ((K0) * (M0))

1578

1579

#if defined(LHS_INTERLEAVE)

1580

#define LHS_OFFSET_X (K0)

1581

#define LHS_STEP_X ((K0) * (V0))

1582

#define LHS_STEP_LOOP (1)

1583

#else // defined(INTERLEAVE)

1584

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

1585

#define LHS_STEP_X (K0)

1586

#define LHS_STEP_LOOP (V0)

1587

#endif // defined(INTERLEAVE)

1588

1589

// Block size

1590

#define RHS_BLOCK_SIZE ((K0) * (N0))

1591

1592

// RHS offset and step X

1593

#if defined(RHS_INTERLEAVE)

1594

#define RHS_OFFSET_X (K0)

1595

#define RHS_STEP_X ((K0) * (H0))

1596

#define RHS_STEP_LOOP (1)

1597

#else // defined(RHS_INTERLEAVE)

1598

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1599

#define RHS_STEP_X (K0)

1600

#define RHS_STEP_LOOP (H0)

1601

#endif // defined(RHS_INTERLEAVE)

1602

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1603

uint x = get_global_id(0);

1604

uint y = get_global_id(1);

1605

uint z = get_global_id(2);

1606

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

1607

#if defined(DUMMY_WORK_ITEMS)

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1608

if((x * N0 >= N) || (y * M0 >= M))

Gian Marco Iodice

2019-03-15 10:13:05 +0000

[diff] [blame]

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1613

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1614

// Compute LHS matrix address

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1615

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1616

1617

// Compute RHS matrix address

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1618

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1619

1620

#if defined(MATRIX_B_DEPTH)

1621

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1622

rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1623

#else // defined(MATRIX_B_DEPTH)

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1624

rhs_addr += z * rhs_stride_z;

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1625

#endif // defined(MATRIX_B_DEPTH)

1626

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1627

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1628

REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

1629

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1630

// Initialize the accumulators

1631

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(M0-1)=0;

1632

1633

for(int i = 0; i < k; i += K0)

1634

{

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1635

// Load values from LHS matrix

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1636

LOAD_BLOCK(M0, K0, uchar, a, lhs_addr, 0, LHS_STEP_X, zlhs);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1637

1638

// Load values from RHS matrix

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1639

LOAD_BLOCK(N0, K0, uchar, b, rhs_addr, 0, RHS_STEP_X, zrhs);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1640

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1641

// Partial matrix multiplication M0,N0,K0

1642

ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1643

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1644

// Update address

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1645

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP);

1646

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP);

1647

}

1648

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1649

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(int)) + (y * (uint)M0 * dst_stride_y);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1650

1651

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1652

1653

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1654

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1655

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1656

1657

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1658

// multiply dst_stride_z by DEPTH_GEMM3D

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1659

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1660

1661

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1662

1663

// Add offset for batched GEMM

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1664

dst_addr += z * dst_stride_z;

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1665

1666

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1667

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1668

// Convert and store output block

1669

CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1670

1671

#undef LHS_BLOCK_SIZE

1672

#undef LHS_OFFSET_X

1673

#undef LHS_STEP_X

1674

#undef RHS_BLOCK_SIZE

1675

#undef RHS_OFFSET_X

1676

#undef RHS_STEP_X

1677

}

Gian Marco Iodice

2019-01-17 09:47:04 +0000

[diff] [blame]

1678

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(K)

1679

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1680

#if defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(K)

1681

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1682

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1683

* The LHS matrix is NOT reshaped

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1684

* The RHS matrix is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1685

*

1686

* @note The number of columns of LHS matrix must be passed at compile time using -DK (i.e. -DK=64)

1687

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (i.e. -DN0=8, -DK0=4).

1688

* @note The number of M0 rows to process must be passed at compile time using -DM0 (i.e. -DM0=2)

1689

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (i.e. -DH0=2)

1690

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must passed at compile time.

1691

* @note Only the following configurations of M0, N0 and K0 are currently supported:

1692

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

1693

* - N0 = 2, 3, 4, 8, 16

1694

* - K0 = 2, 3, 4, 8, 16

1695

* - H0 >= 1

1696

*

1697

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

1698

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

1699

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1700

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1701

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1702

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

1703

*

1704

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

1705

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

1706

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1707

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

1708

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1709

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

1710

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1711

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1712

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1713

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1714

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1715

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

1716

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1717

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1718

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1719

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1720

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1721

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1722

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

1723

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1724

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1725

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

1726

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

1727

*/

1728

__kernel void gemmlowp_mm_reshaped_only_rhs_t(IMAGE_DECLARATION(lhs),

1729

IMAGE_DECLARATION(rhs),

1730

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

uint dst_stride_z

#if defined(REINTERPRET_INPUT_AS_3D)

1735

,

1736

uint lhs_cross_plane_pad

1737

#endif // REINTERPRET_INPUT_AS_3D

1738

#if defined(REINTERPRET_OUTPUT_AS_3D)

1739

,

1740

uint dst_cross_plane_pad

1741

#endif // REINTERPRET_OUTPUT_AS_3D

)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

1746

1747

// RHS offset and step X

1748

#if defined(RHS_INTERLEAVE)

1749

#define RHS_OFFSET_X (K0)

1750

#define RHS_STEP_X ((K0) * (H0))

1751

#define RHS_STEP_LOOP (1)

1752

#else // defined(RHS_INTERLEAVE)

1753

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1754

#define RHS_STEP_X (K0)

1755

#define RHS_STEP_LOOP (H0)

1756

#endif // defined(RHS_INTERLEAVE)

1757

1758

uint x = get_global_id(0);

1759

uint y = get_global_id(1);

1760

uint z = get_global_id(2);

1761

Gian Marco Iodice

86cfffe

2019-04-02 11:02:20 +0100

[diff] [blame]

1762

#if defined(DUMMY_WORK_ITEMS)

1763

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1768

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1769

// Compute LHS matrix address

1770

uint lhs_offset = lhs_offset_first_element_in_bytes + y * M0 * (uint)lhs_stride_y;

1771

1772

// Compute RHS matrix address

1773

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X + (x / (uint)H0) * rhs_stride_y;

1774

1775

#if defined(MATRIX_B_DEPTH)

1776

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1777

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1778

#else // defined(MATRIX_B_DEPTH)

1779

rhs_offset += z * rhs_stride_z;

1780

#endif // defined(MATRIX_B_DEPTH)

1781

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1782

REPEAT_VAR_INIT_TO_CONST(8, uint, zlhs, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1783

REPEAT_VAR_INIT_TO_CONST(16, uint, zrhs, 0);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1784

1785

#if defined(REINTERPRET_INPUT_AS_3D)

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1786

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1787

CALCULATE_Z_OFFSET(M0, uint, zlhs, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1788

1789

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1790

// multiply lhs_stride_z by DEPTH_GEMM3D

1791

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1792

1793

#else // defined(REINTERPRET_INPUT_AS_3D)

1794

1795

// Add offset for batched GEMM

1796

lhs_offset += z * lhs_stride_z;

1797

1798

#endif // defined(REINTERPRET_INPUT_AS_3D)

1799

1800

// Initialize the accumulators

1801

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(uint, N0), c, 0); //VEC_DATA_TYPE(uint, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;

1802

1803

for(int i = 0; i < K; i += K0)

1804

{

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1805

// Load values from LHS matrix

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1806

LOAD_BLOCK(M0, K0, uchar, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1807

1808

// Load values from RHS matrix

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1809

LOAD_BLOCK(N0, K0, uchar, b, rhs_ptr, rhs_offset, RHS_STEP_X, zrhs);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1810

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1811

// Partial matrix multiplication M0,N0,K0

1812

ARM_MM_K0XN0XM0(M0, N0, K0, a, b, c);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1813

1814

lhs_offset += K0;

1815

rhs_offset += N0 * RHS_STEP_X * RHS_STEP_LOOP;

1816

}

1817

1818

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0) * sizeof(int) + (y * (uint)M0 * dst_stride_y);

1819

1820

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1821

1822

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1823

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1824

CALCULATE_Z_OFFSET(M0, uint, zout, y, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1825

1826

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1827

// multiply dst_stride_z by DEPTH_GEMM3D

1828

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1829

1830

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1831

1832

// Add offset for batched GEMM

1833

dst_addr += z * dst_stride_z;

1834

1835

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1836

Gian Marco Iodice

2019-05-14 10:14:08 +0100

[diff] [blame^]

1837

// Convert and store output block

1838

CONVERT_STORE_BLOCK(M0, N0, int, c, dst_addr, dst_stride_y, zout);

Gian Marco Iodice

2019-03-11 16:07:12 +0000

[diff] [blame]

1839

1840

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

}

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE) && defined(K)

1845

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1846

#if defined(COLS_A)

1847

/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.

1848

*

1849

* @note This stage is needed to handle the offset of matrix product

1850

* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md

1851

*

1852

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

1853

*

1854

* @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8

1855

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1856

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1857

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1858

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1859

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1860

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1861

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1862

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32

1863

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1864

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1865

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1866

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1867

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1868

*/

1869

__kernel void gemmlowp_matrix_a_reduction(TENSOR3D_DECLARATION(src),

1870

IMAGE_DECLARATION(dst))

1871

{

1872

// Compute source and destination addresses

1873

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

1874

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1875

1876

uint4 sum_row_u32 = (uint4)0;

1877

uint sum_row = 0;

1878

1879

__global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

int i = 0;

// This for loop performs 16 accumulations

1884

for(; i <= ((int)COLS_A - 16); i += 16)

1885

{

1886

const uchar16 a0_u8 = vload16(0, matrix_a + i);

1887

1888

sum_row_u32 += convert_uint4(a0_u8.s0123) + convert_uint4(a0_u8.s4567) + convert_uint4(a0_u8.s89AB) + convert_uint4(a0_u8.sCDEF);

1889

}

1890

1891

// This for loop performs the leftover accumulations

1892

for(; i < COLS_A; ++i)

1893

{

1894

sum_row += matrix_a[i];

1895

}

1896

1897

sum_row += sum_row_u32.s0 + sum_row_u32.s1 + sum_row_u32.s2 + sum_row_u32.s3;

1898

1899

*((__global int *)dst.ptr) = (int)sum_row;

1900

}

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1901

1902

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

1903

/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A using the arm dot product instruction

1904

*

1905

* @note This stage is needed to handle the offset of matrix product

1906

* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md

1907

*

1908

* @attention The number of matrix A columns needs to be passed at compile time using -DCOLS_A

1909

*

1910

* @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8

1911

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1912

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1913

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1914

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1915

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1916

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1917

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1918

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32

1919

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1920

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1921

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1922

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1923

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1924

*/

1925

__kernel void gemmlowp_matrix_a_reduction_dot8(TENSOR3D_DECLARATION(src),

1926

IMAGE_DECLARATION(dst))

1927

{

1928

// Compute source and destination addresses

1929

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

1930

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

uint sum_row = 0;

__global const uchar *matrix_a = (__global const uchar *)(src.ptr + get_global_id(0) * src_stride_y + get_global_id(1) * src_stride_z);

int i = 0;

// This for loop performs 16 accumulations

1939

for(; i <= ((int)COLS_A - 32); i += 32)

1940

{

1941

uchar16 a0_u8 = vload16(0, matrix_a + i);

1942

1943

sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));

1944

sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));

1945

sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));

1946

sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));

1947

1948

a0_u8 = vload16(1, matrix_a + i);

1949

1950

sum_row += arm_dot(a0_u8.s0123, (uchar4)(1));

1951

sum_row += arm_dot(a0_u8.s4567, (uchar4)(1));

1952

sum_row += arm_dot(a0_u8.s89AB, (uchar4)(1));

1953

sum_row += arm_dot(a0_u8.sCDEF, (uchar4)(1));

1954

}

1955

1956

// This for loop performs the leftover accumulations

1957

for(; i < COLS_A; ++i)

1958

{

1959

sum_row += matrix_a[i];

1960

}

1961

1962

*((__global int *)dst.ptr) = (int)sum_row;

1963

}

1964

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

1965

#endif // defined(COLS_A)

1966

1967

#if defined(COLS_B) && defined(ROWS_B)

1968

/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.

1969

*

1970

* @note This stage is needed to handle the offset of matrix product

1971

* https://github.com/google/gemmlowp/blob/master/doc/low-precision.md

1972

*

1973

* @attention The number of matrix B columns and rows needs to be passed at compile time using -DCOLS_B and -DROWS_B

1974

*

1975

* @param[in] src_ptr Pointer to the source tensor. Supported data type: QASYMM8

1976

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1977

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1978

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1979

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1980

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1981

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1982

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1983

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: S32

1984

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1985

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1986

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1987

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1988

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1989

*/

1990

__kernel void gemmlowp_matrix_b_reduction(TENSOR3D_DECLARATION(src),

1991

IMAGE_DECLARATION(dst))

1992

{

1993

// Compute source and destination addresses

1994

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

1995

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1996

1997

uint16 sum_col_u32 = (uint16)0;

1998

1999

__global const uchar *matrix_b = (__global const uchar *)(src.ptr + get_global_id(1) * src_stride_z);

2000

2001

int i = 0;

2002

// This for loop performs 4 accumulations

2003

for(; i <= ((int)ROWS_B - 4); i += 4)

2004

{

2005

const uchar16 b0_u8 = vload16(0, matrix_b + 0 * src_stride_y);

2006

const uchar16 b1_u8 = vload16(0, matrix_b + 1 * src_stride_y);

2007

const uchar16 b2_u8 = vload16(0, matrix_b + 2 * src_stride_y);

2008

const uchar16 b3_u8 = vload16(0, matrix_b + 3 * src_stride_y);

2009

2010

sum_col_u32 += convert_uint16(b0_u8) + convert_uint16(b1_u8) + convert_uint16(b2_u8) + convert_uint16(b3_u8);

2011

2012

matrix_b += 4 * src_stride_y;

2013

}

2014

2015

// This for loop perfoms the leftover accumulations

2016

for(; i < (int)ROWS_B; ++i)

2017

{

2018

const uchar16 b0_u8 = vload16(0, matrix_b);

2019

2020

sum_col_u32 += convert_uint16(b0_u8);

2021

2022

matrix_b += src_stride_y;

2023

}

2024

2025

vstore16(convert_int16(sum_col_u32), 0, (__global int *)dst.ptr);

2026

}

2027

#endif // defined(COLS_B) && defined(ROWS_B)

2028

2029

#if defined(K_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2030

2031

/* Helper function used to calculate the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel.

2032

*

2033

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),

2034

* and calculates the offset contribution of matrix A and matrix B.

2035

*

2036

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2037

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2038

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

2039

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

2040

*

2041

* @param[in] x get_global_id(0) * 4

2042

* @param[in] y get_global_id(1)

2043

* @param[in] z get_global_id(2)

2044

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2045

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2046

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2047

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2048

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2049

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2050

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2051

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2052

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2053

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2054

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2055

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2056

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2057

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2058

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2059

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

2060

*/

2061

inline int4 offset_contribution(

int x,

int y,

int z

#if defined(A_OFFSET)

2066

,

2067

IMAGE_DECLARATION(sum_col)

2068

#endif // defined(A_OFFSET)

2069

#if defined(B_OFFSET)

2070

,

2071

IMAGE_DECLARATION(sum_row)

2072

#endif // defined(B_OFFSET)

2073

#if defined(ADD_BIAS)

2074

,

2075

VECTOR_DECLARATION(biases)

2076

#endif // defined(ADD_BIAS)

2077

)

2078

{

2079

int4 a_offset_s32 = (int4)0;

2080

int4 b_offset_s32 = (int4)0;

2081

2082

int batch_id = z;

2083

#if defined(DEPTH_INPUT3D)

2084

batch_id /= (int)DEPTH_INPUT3D;

2085

#endif // defined(DEPTH_INPUT3D)

2086

2087

#if defined(A_OFFSET)

2088

// Compute the offset contribution due to A_OFFSET

2089

__global uchar *sum_col_addr = sum_col_ptr + sum_col_offset_first_element_in_bytes + x * sizeof(int);

2090

2091

// Compute the offset contribution due to A_OFFSET

2092

#if defined(SUM_COL_HAS_BATCHES)

2093

a_offset_s32 = vload4(0, (__global int *)(sum_col_addr + batch_id * sum_col_stride_y));

2094

#else // defined(SUM_COL_HAS_BATCHES)

2095

a_offset_s32 = vload4(0, (__global int *)sum_col_addr);

2096

#endif // defined(SUM_COL_HAS_BATCHES)

2097

2098

a_offset_s32 *= (int4)A_OFFSET;

2099

#endif // defined(A_OFFSET)

2100

2101

#if defined(B_OFFSET)

2102

// Compute the offset contribution due to A_OFFSET

2103

__global uchar *sum_row_addr = sum_row_ptr + sum_row_offset_first_element_in_bytes + y * sizeof(int);

2104

2105

// Compute the offset contribution due to B_OFFSET

2106

#if defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)

2107

b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)) + (z % (int)DEPTH_INPUT3D) * (int)HEIGHT_INPUT3D);

2108

#else // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)

2109

b_offset_s32 = (int4) * (((__global int *)(sum_row_addr + batch_id * sum_row_stride_y)));

2110

#endif // defined(HEIGHT_INPUT3D) && defined(DEPTH_INPUT3D)

2111

b_offset_s32 *= (int4)B_OFFSET;

2112

#endif // defined(B_OFFSET)

2113

2114

#if defined(ADD_BIAS)

2115

// Add bias

2116

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2117

2118

int4 biases_values = vload4(0, (__global int *)bias_addr);

2119

b_offset_s32 += (int4)biases_values;

2120

#endif // defined(ADD_BIAS)

2121

2122

return (int4)K_OFFSET + a_offset_s32 + b_offset_s32;

2123

}

2124

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2125

/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel. The computation is performed in-place

2126

*

2127

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel),

2128

* and adds to it the offset contribution of matrix A and matrix B in-place.

2129

*

2130

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2131

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2132

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

Chunosov

5124be5

2017-11-22 20:42:13 +0700

[diff] [blame]

2133

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2134

*

2135

* The final result is:

2136

*

2137

* mm_result[i][k] = mm_result[i][k] +

2138

* (sum_col[k] * A_OFFSET) +

2139

* (sum_row[i] * B_OFFSET) +

2140

* (K_OFFSET)

2141

*

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

2142

* @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32

2143

* @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)

2144

* @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)

2145

* @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)

2146

* @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)

2147

* @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)

2148

* @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)

2149

* @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2150

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2151

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2152

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2153

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2154

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2155

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2156

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2157

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2158

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2159

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2160

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2161

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2162

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2163

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2164

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2165

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2166

*/

2167

__kernel void gemmlowp_offset_contribution(TENSOR3D_DECLARATION(mm_result)

2168

#if defined(A_OFFSET)

2169

,

2170

IMAGE_DECLARATION(sum_col)

2171

#endif // defined(A_OFFSET)

2172

#if defined(B_OFFSET)

2173

,

2174

IMAGE_DECLARATION(sum_row)

2175

#endif // defined(B_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2176

#if defined(ADD_BIAS)

2177

,

2178

VECTOR_DECLARATION(biases)

2179

#endif // defined(ADD_BIAS))

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2180

)

2181

{

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2182

const int x = get_global_id(0) * 4;

Georgios Pinitas

2018-09-24 16:31:08 +0100

[diff] [blame]

2183

const int y = get_global_id(1);

2184

const int z = get_global_id(2);

2185

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2186

// Compute offset contribution

2187

int4 offset_term_s32 = offset_contribution(

2188

x, y, z

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2189

#if defined(A_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

,

sum_col_ptr,

sum_col_stride_x,

sum_col_step_x,

sum_col_stride_y,

sum_col_step_y,

sum_col_offset_first_element_in_bytes

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2197

#endif // defined(A_OFFSET)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2198

#if defined(B_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

,

sum_row_ptr,

sum_row_stride_x,

sum_row_step_x,

sum_row_stride_y,

sum_row_step_y,

sum_row_offset_first_element_in_bytes

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2206

#endif // defined(B_OFFSET)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2207

#if defined(ADD_BIAS)

,

biases_ptr,

biases_stride_x,

biases_step_x,

biases_offset_first_element_in_bytes

2213

#endif // defined(ADD_BIAS)

2214

);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2215

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2216

__global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2217

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2218

int4 in_s32 = vload4(0, (__global int *)mm_result_addr);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2219

2220

// Add the offset terms to GEMM's result

2221

in_s32 += offset_term_s32;

2222

2223

// Store the result with the offset contribution

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2224

vstore4(in_s32, 0, (__global int *)mm_result_addr);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2225

}

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2226

2227

#if defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)

2228

/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.

2229

*

2230

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.

2231

*

2232

*

2233

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2234

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2235

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

2236

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

2237

*

2238

* The result before the output stage is:

2239

*

2240

* mm_result[i][k] = mm_result[i][k] +

2241

* (sum_col[k] * A_OFFSET) +

2242

* (sum_row[i] * B_OFFSET) +

2243

* (K_OFFSET)

2244

*

2245

* This result is quantized down to uint8 using the output stage. The output stage computes the following operations:

2246

*

2247

* -# Add offset terms to final result

2248

* -# Multiply each entry of result by result_mult_int

2249

* -# Add bias to final result (if -DADD_BIAS is passed at compile time)

2250

* -# Shift the int32 accumulator by result_shift

2251

* -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)

2252

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2253

*

2254

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT

2255

*

2256

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2257

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2258

* These values can be used to implement "rectified linear unit" activation functions

2259

*

2260

* @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32

2261

* @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)

2262

* @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)

2263

* @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)

2264

* @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)

2265

* @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)

2266

* @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)

2267

* @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor

2268

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2269

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2270

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2271

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2272

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2273

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2274

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2275

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2276

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2277

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2278

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2279

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2280

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2281

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2282

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2283

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

2284

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2285

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2286

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2287

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2288

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2289

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2290

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2291

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2292

*/

2293

__kernel void gemmlowp_offset_contribution_quantize_down(TENSOR3D_DECLARATION(mm_result)

2294

#if defined(A_OFFSET)

2295

,

2296

IMAGE_DECLARATION(sum_col)

2297

#endif // defined(A_OFFSET)

2298

#if defined(B_OFFSET)

2299

,

2300

IMAGE_DECLARATION(sum_row)

2301

#endif // defined(B_OFFSET)

2302

,

2303

#if defined(ADD_BIAS)

2304

VECTOR_DECLARATION(biases),

2305

#endif // defined(ADD_BIAS)

2306

TENSOR3D_DECLARATION(dst))

2307

{

2308

const int x = get_global_id(0) * 4;

2309

const int y = get_global_id(1);

2310

const int z = get_global_id(2);

2311

2312

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2313

2314

// Compute offset contribution

2315

int4 offset_term_s32 = offset_contribution(

2316

x, y, z

2317

#if defined(A_OFFSET)

,

sum_col_ptr,

sum_col_stride_x,

sum_col_step_x,

sum_col_stride_y,

sum_col_step_y,

sum_col_offset_first_element_in_bytes

2325

#endif // defined(A_OFFSET)

2326

#if defined(B_OFFSET)

,

sum_row_ptr,

sum_row_stride_x,

sum_row_step_x,

sum_row_stride_y,

sum_row_step_y,

sum_row_offset_first_element_in_bytes

2334

#endif // defined(B_OFFSET)

2335

#if defined(ADD_BIAS)

,

biases_ptr,

biases_stride_x,

biases_step_x,

biases_offset_first_element_in_bytes

2341

#endif // defined(ADD_BIAS)

2342

);

2343

2344

__global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

2345

2346

int4 in_s32 = vload4(0, (__global int *)mm_result_addr);

2347

2348

// Add the offset terms to GEMM's result

2349

in_s32 += offset_term_s32;

2350

2351

// -------------- OUTPUT STAGE

2352

2353

// Add the offset terms to GEMM's result

2354

in_s32 += (int4)RESULT_OFFSET;

2355

2356

// Multiply by result_mult_int and shift

2357

in_s32 *= RESULT_MULTIPLIER;

2358

2359

in_s32 >>= RESULT_SHIFT;

2360

2361

uchar4 res = convert_uchar4_sat(in_s32);

2362

2363

#if defined(MIN_BOUND)

2364

res = max(res, (uchar4)MIN_BOUND);

2365

#endif // defined(MIN_BOUND)

2366

#if defined(MAX_BOUND)

2367

res = min(res, (uchar4)MAX_BOUND);

2368

#endif // defined(MAX_BOUND)

2369

2370

// Store the result

2371

vstore4(res, 0, dst_addr);

2372

}

2373

2374

/* OpenCL kernel used to add the offset contribution after @ref CLGEMMLowpMatrixMultiplyKernel and it quantizes down to uint8.

2375

*

2376

* This kernel takes a final int32 accumulator value (the output of @CLGEMMLowpMatrixMultiplyKernel), adds to it the offset contribution of matrix A and matrix B and quantizes to uint8 through the output stage.

2377

*

2378

*

2379

* @attention The k_offset = a_offset * b_offset * k (where k is the number of matrix A columns) needs to be passed at compile time using -DK_OFFSET (i.e. -DK_OFFSET=1200)

2380

* @note In case the offset contribution due to a_offset is required, a_offset needs to be passed at compile time using -DA_OFFSET (i.e. -DA_OFFSET=1)

2381

* @note In case the offset contribution due to b_offset is required, b_offset needs to be passed at compile time using -DB_OFFSET (i.e. -DB_OFFSET=6)

2382

* @note In case sum_col has batches, -DSUM_COL_HAS_BATCHES must be passed at compile time. Usually if gemmlowp is used to accelerate convolution layer, sum_col will not have batches

2383

*

2384

* The result before the output stage is:

2385

*

2386

* mm_result[i][k] = mm_result[i][k] +

2387

* (sum_col[k] * A_OFFSET) +

2388

* (sum_row[i] * B_OFFSET) +

2389

* (K_OFFSET)

2390

*

2391

* This result is quantized down to uint8 using the output stage. The output stage computes the following operations:

2392

*

2393

* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier

2394

* -# Add bias to final result if bias tensor is not a nullptr

2395

* -# Round to nearest division by a power-of-two using result_shift

2396

* -# Add offset to each result

2397

* -# Clamp the value between the specified min and max bounds

2398

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2399

*

2400

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT

2401

*

2402

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2403

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2404

* These values can be used to implement "rectified linear unit" activation functions

2405

*

2406

* @param[in] mm_result_ptr Pointer to the source tensor. Supported data type: S32

2407

* @param[in] mm_result_stride_x Stride of the source tensor in X dimension (in bytes)

2408

* @param[in] mm_result_step_x mm_result_stride_x * number of elements along X processed per workitem(in bytes)

2409

* @param[in] mm_result_stride_y Stride of the source tensor in Y dimension (in bytes)

2410

* @param[in] mm_result_step_y mm_result_stride_y * number of elements along Y processed per workitem(in bytes)

2411

* @param[in] mm_result_stride_z Stride of the source tensor in Z dimension (in bytes)

2412

* @param[in] mm_result_step_z mm_result_stride_z * number of elements along Z processed per workitem(in bytes)

2413

* @param[in] mm_result_offset_first_element_in_bytes The offset of the first element in the source tensor

2414

* @param[in] sum_col_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2415

* @param[in] sum_col_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2416

* @param[in] sum_col_step_x (Optional) sum_col_stride_x * number of elements along X processed per workitem(in bytes)

2417

* @param[in] sum_col_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2418

* @param[in] sum_col_step_y (Optional) sum_col_stride_y * number of elements along Y processed per workitem(in bytes)

2419

* @param[in] sum_col_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2420

* @param[in] sum_row_ptr (Optional) Pointer to the source tensor. Supported data type: same as @p mm_result_ptr

2421

* @param[in] sum_row_stride_x (Optional) Stride of the source tensor in X dimension (in bytes)

2422

* @param[in] sum_row_step_x (Optional) sum_row_stride_x * number of elements along X processed per workitem(in bytes)

2423

* @param[in] sum_row_stride_y (Optional) Stride of the source tensor in Y dimension (in bytes)

2424

* @param[in] sum_row_step_y (Optional) sum_row_stride_y * number of elements along Y processed per workitem(in bytes)

2425

* @param[in] sum_row_offset_first_element_in_bytes (Optional) The offset of the first element in the source tensor

2426

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2427

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2428

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2429

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

2430

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2431

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2432

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2433

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2434

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2435

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2436

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2437

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2438

*/

2439

__kernel void gemmlowp_offset_contribution_quantize_down_fixedpoint(TENSOR3D_DECLARATION(mm_result)

2440

#if defined(A_OFFSET)

2441

,

2442

IMAGE_DECLARATION(sum_col)

2443

#endif // defined(A_OFFSET)

2444

#if defined(B_OFFSET)

2445

,

2446

IMAGE_DECLARATION(sum_row)

2447

#endif // defined(B_OFFSET)

2448

,

2449

#if defined(ADD_BIAS)

2450

VECTOR_DECLARATION(biases),

2451

#endif // defined(ADD_BIAS)

2452

TENSOR3D_DECLARATION(dst))

2453

{

2454

const int x = get_global_id(0) * 4;

2455

const int y = get_global_id(1);

2456

const int z = get_global_id(2);

2457

2458

// Compute offset contribution

2459

int4 offset_term_s32 = offset_contribution(

2460

x, y, z

2461

#if defined(A_OFFSET)

,

sum_col_ptr,

sum_col_stride_x,

sum_col_step_x,

sum_col_stride_y,

sum_col_step_y,

sum_col_offset_first_element_in_bytes

2469

#endif // defined(A_OFFSET)

2470

#if defined(B_OFFSET)

,

sum_row_ptr,

sum_row_stride_x,

sum_row_step_x,

sum_row_stride_y,

sum_row_step_y,

sum_row_offset_first_element_in_bytes

2478

#endif // defined(B_OFFSET)

2479

#if defined(ADD_BIAS)

,

biases_ptr,

biases_stride_x,

biases_step_x,

biases_offset_first_element_in_bytes

2485

#endif // defined(ADD_BIAS)

2486

);

2487

2488

__global uchar *mm_result_addr = mm_result_ptr + mm_result_offset_first_element_in_bytes + x * sizeof(int) + y * mm_result_stride_y + z * mm_result_stride_z;

2489

2490

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2491

2492

int4 in_s32 = vload4(0, (__global int *)mm_result_addr);

2493

2494

// Add the offset terms to GEMM's result

2495

in_s32 += offset_term_s32;

2496

2497

// -------------- OUTPUT STAGE

2498

2499

// Multiply by result_mult_int and shift

2500

in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);

2501

2502

// Add the offset terms to GEMM's result

2503

in_s32 += (int4)RESULT_OFFSET;

2504

2505

uchar4 res = convert_uchar4_sat(in_s32);

2506

2507

#if defined(MIN_BOUND)

2508

res = max(res, (uchar4)MIN_BOUND);

2509

#endif // defined(MIN_BOUND)

2510

#if defined(MAX_BOUND)

2511

res = min(res, (uchar4)MAX_BOUND);

2512

#endif // defined(MAX_BOUND)

2513

2514

// Store the result

2515

vstore4(res, 0, dst_addr);

2516

}

2517

#endif // defined(K_OFFSET) && defined(RESULT_OFFSET) && defined(RESULT_MULTIPLIER) && defined(RESULT_SHIFT)

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2518

#endif // defined(K_OFFSET)

2519

2520

#if defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)

2521

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

2522

*

2523

* This kernel takes a final int32 accumulator value and processes it to obtain the final QASYMM8 value.

2524

* The following computations will be performed by the kernel:

2525

*

2526

* -# Add offset terms to final result

2527

* -# Multiply each entry of result by result_mult_int

2528

* -# Add bias to final result (if -DADD_BIAS is passed at compile time)

2529

* -# Shift the int32 accumulator by result_shift

2530

* -# Clamp the value between the specified min and max bounds (if -DMIN_BOUND and/or -DMAX_BOUND are passed at compile time)

2531

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2532

*

2533

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET, -RESULT_MULT_INT and -DRESULT_SHIFT

2534

*

2535

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2536

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2537

* These values can be used to implement "rectified linear unit" activation functions

2538

*

2539

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

2540

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2541

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2542

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2543

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2544

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2545

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2546

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2547

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2548

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2549

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2550

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2551

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2552

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2553

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2554

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2555

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2556

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2557

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2558

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2559

*/

2560

__kernel void gemmlowp_output_stage_quantize_down(TENSOR3D_DECLARATION(src),

2561

#if defined(ADD_BIAS)

2562

VECTOR_DECLARATION(biases),

2563

#endif // defined(ADD_BIAS)

2564

TENSOR3D_DECLARATION(dst))

2565

{

2566

// Compute source and destination addresses

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2567

int x = get_global_id(0) * 4;

2568

int y = get_global_id(1);

2569

int z = get_global_id(2);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2570

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2571

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2572

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2573

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2574

2575

int4 input_values = vload4(0, (__global int *)src_addr);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2576

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2577

#if defined(ADD_BIAS)

2578

// Add bias

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2579

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2580

2581

int4 biases_values = vload4(0, (__global int *)bias_addr);

2582

input_values += (int4)biases_values;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2583

#endif // defined(ADD_BIAS)

2584

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2585

// Add the offset terms to GEMM's result

2586

input_values += (int4)RESULT_OFFSET;

2587

Georgios Pinitas

45bcc3a

2017-11-29 11:06:49 +0000

[diff] [blame]

2588

// Multiply by result_mult_int and shift

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2589

input_values *= RESULT_MULT_INT;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2590

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2591

input_values >>= RESULT_SHIFT;

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2592

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2593

uchar4 res = convert_uchar4_sat(input_values);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2594

2595

#if defined(MIN_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2596

res = max(res, (uchar4)MIN_BOUND);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2597

#endif // defined(MIN_BOUND)

2598

#if defined(MAX_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2599

res = min(res, (uchar4)MAX_BOUND);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2600

#endif // defined(MAX_BOUND)

2601

2602

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2603

vstore4(res, 0, dst_addr);

Gian Marco

2017-11-21 10:57:50 +0000

[diff] [blame]

2604

}

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2605

#endif // defined(RESULT_OFFSET) && defined(RESULT_MULT_INT) && defined(RESULT_SHIFT)

2606

2607

#if defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)

2608

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

2609

*

2610

* This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.

2611

* The following computations will be performed by the kernel:

2612

*

2613

* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier

2614

* -# Add bias to final result if bias tensor is not a nullptr

2615

* -# Round to nearest division by a power-of-two using result_shift

2616

* -# Add offset to each result

2617

* -# Clamp the value between the specified min and max bounds

2618

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2619

*

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2620

* @attention The offset, scalar scale factor and number of bits to shift right of output tensor must be passed at compile time using -DRESULT_OFFSET_AFTER_SHIFT, -DRESULT_FIXEDPOINT_MULTIPLIER and -DRESULT_SHIFT

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2621

*

2622

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2623

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2624

* These values can be used to implement "rectified linear unit" activation functions

2625

*

2626

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

2627

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2628

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2629

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2630

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2631

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2632

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2633

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2634

* @param[in] biases_ptr (Optional) Pointer to the biases tensor. Supported data type: same as @p src_ptr

2635

* @param[in] biases_stride_x (Optional) Stride of the biases tensor in X dimension (in bytes)

2636

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

2637

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases tensor

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2638

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2639

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2640

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2641

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2642

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2643

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2644

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2645

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2646

*/

2647

__kernel void gemmlowp_output_stage_quantize_down_fixedpoint(TENSOR3D_DECLARATION(src),

2648

#if defined(ADD_BIAS)

2649

VECTOR_DECLARATION(biases),

2650

#endif // defined(ADD_BIAS)

2651

TENSOR3D_DECLARATION(dst))

2652

{

2653

// Compute source and destination addresses

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2654

int x = get_global_id(0) * 4;

2655

int y = get_global_id(1);

2656

int z = get_global_id(2);

Georgios Pinitas

932491f

2018-09-21 16:33:15 +0100

[diff] [blame]

2657

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2658

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2659

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2660

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2661

2662

int4 input_values = vload4(0, (__global int *)src_addr);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2663

2664

#if defined(ADD_BIAS)

2665

// Add bias

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2666

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2667

2668

int4 biases_values = vload4(0, (__global int *)bias_addr);

2669

input_values += (int4)biases_values;

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2670

#endif // defined(ADD_BIAS)

2671

2672

// Multiply by result_mult_int and shift

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2673

input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2674

2675

// Add the offset terms to GEMM's result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2676

input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2677

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2678

uchar4 res = convert_uchar4_sat(input_values);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2679

2680

#if defined(MIN_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2681

res = max(res, (uchar4)MIN_BOUND);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2682

#endif // defined(MIN_BOUND)

2683

#if defined(MAX_BOUND)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2684

res = min(res, (uchar4)MAX_BOUND);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2685

#endif // defined(MAX_BOUND)

2686

2687

// Store the result

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

2688

vstore4(res, 0, dst_addr);

Gian Marco

2017-11-28 09:10:03 +0000

[diff] [blame]

2689

}

Chunosov

5124be5

2017-11-22 20:42:13 +0700

[diff] [blame]

2690

#endif // defined(RESULT_OFFSET_AFTER_SHIFT) && defined(RESULT_FIXEDPOINT_MULTIPLIER) && defined(RESULT_SHIFT)

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2691

2692

#if defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)

2693

/** This OpenCL kernel is used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8

2694

*

2695

* This kernel takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyKernel), and processes it to obtain the final QASYMM8 value.

2696

* The following computations will be performed by the kernel:

2697

*

2698

* -# Compute fixed point multiplication between each entry of input by result_fixedpoint_multiplier

2699

* -# Add bias to final result if bias tensor is not a nullptr

2700

* -# Requantize

2701

* -# Add offset to each result

2702

* -# Clamp the value between the specified min and max bounds

2703

* -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.

2704

*

2705

* @attention The offset and scalar scale factor must be passed at compile time using -DRESULT_OFFSET, -DREAL_MULTIPLIER

2706

*

2707

* @note In case the addition of int32 biases is required, -DADD_BIAS should be passed at compile time

2708

* @note In case the clamping of the result is required, the min and max bounds can be passed at compile time using -DMIN_BOUND and -DMAX_BOUND.

2709

* These values can be used to implement "rectified linear unit" activation functions

2710

*

2711

* @param[in] src_ptr Pointer to the source tensor. Supported data type: S32

2712

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

2713

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2714

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

2715

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2716

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

2717

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2718

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

2719

* @param[in] biases_ptr Pointer to the biases tensor. Supported data type: same as @p src_ptr

2720

* @param[in] biases_stride_x Stride of the biases tensor in X dimension (in bytes)

2721

* @param[in] biases_step_x biases_stride_x * number of elements along X processed per workitem(in bytes)

2722

* @param[in] biases_offset_first_element_in_bytes The offset of the first element in the biases tensor

2723

* @param[out] dst_ptr Pointer to the destination tensor Supported data type: QASYMM8

2724

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

2725

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2726

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

2727

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2728

* @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)

2729

* @param[in] dst_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2730

* @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)

2731

* @param[in] dst_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

2732

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

2733

*/

2734

__kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src),

2735

#if defined(ADD_BIAS)

2736

VECTOR_DECLARATION(biases),

2737

#endif // defined(ADD_BIAS)

2738

#if defined(DST_HEIGHT)

2739

TENSOR4D_DECLARATION(dst))

2740

#else // defined(DST_HEIGHT)

2741

TENSOR3D_DECLARATION(dst))

2742

#endif // defined(DST_HEIGHT)

2743

{

2744

// Compute source and destination addresses

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2745

int x = get_global_id(0) * 4;

2746

int y = get_global_id(1);

2747

int z = get_global_id(2);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2748

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2749

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2750

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2751

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;

2752

2753

int4 input_values = vload4(0, (__global int *)src_addr);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2754

2755

#if defined(ADD_BIAS)

2756

// Add bias

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2757

__global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);

2758

2759

int4 biases_values = vload4(0, (__global int *)bias_addr);

2760

input_values += (int4)biases_values;

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2761

#endif // defined(ADD_BIAS)

2762

2763

// Convert to float

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2764

float16 input_values_f = convert_float4(input_values);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2765

input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);

2766

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2767

uchar4 res = convert_uchar4_sat(input_values_f);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2768

2769

#if defined(MIN_BOUND)

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2770

res = max(res, (uchar4)MIN_BOUND);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2771

#endif // defined(MIN_BOUND)

2772

#if defined(MAX_BOUND)

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2773

res = min(res, (uchar4)MAX_BOUND);

Georgios Pinitas

2018-10-22 13:49:08 +0100

[diff] [blame]

2774

#endif // defined(MAX_BOUND)

2775

2776

// Store the result

Gian Marco Iodice

2018-10-30 12:20:03 +0000

[diff] [blame]

2777

vstore4(res, 0, dst_addr);

Georgios Pinitas