Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2017-09-04 18:44:23 +0100

[diff] [blame]

256

*

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

257

* @attention The number of matrix B columns needs to be passed at compile time using -DCOLS_B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

258

*

259

* @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8

260

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

261

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

262

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

263

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

264

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

265

* @param[in] src1_ptr Pointer to the source matrix. Supported formats: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

266

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

267

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

268

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

269

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

270

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

271

* @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

272

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

273

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

274

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

275

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

276

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

277

* @param[in] a_offset Offset to be added to each element of the matrix A

278

* @param[in] b_offset Offset to be added to each element of the matrix B.

279

* @param[in] c_offset Offset to be added to each element of the matrix C.

280

* @param[in] c_mult_int Multiplied with each element of the matrix C.

281

* @param[in] shift Number of bits to shift right the result.

282

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

283

__kernel void gemm_mm_interleaved_transposed_u8(IMAGE_DECLARATION(src0),

284

IMAGE_DECLARATION(src1),

285

IMAGE_DECLARATION(dst),

int a_offset,

int b_offset,

int c_offset,

int c_mult_int,

int shift)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

291

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

292

// src_addr.s0 = address of matrix A

293

// src_addr.s1 = address of matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

294

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

295

// Compute address for matrix A and B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

296

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

297

(src1_stride_y));

298

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

299

// Add offset_first_element_in_bytes

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

300

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

301

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

302

// Compute end row address for matrix B

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

303

int end_row_mtx_b = src_addr.s1 + COLS_B;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

304

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

305

// Reset accumulators

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

int16 c00 = 0.0f;

int16 c10 = 0.0f;

int16 c20 = 0.0f;

int16 c30 = 0.0f;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))

312

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

313

// Load values from matrix A (interleaved) and matrix B (transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

314

int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));

315

int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

316

317

c00 += (int16)a0.s0 * b0;

318

c10 += (int16)a0.s1 * b0;

319

c20 += (int16)a0.s2 * b0;

320

c30 += (int16)a0.s3 * b0;

321

322

int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));

323

324

c00 += (int16)a0.s4 * b1;

325

c10 += (int16)a0.s5 * b1;

326

c20 += (int16)a0.s6 * b1;

327

c30 += (int16)a0.s7 * b1;

328

}

329

330

for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))

331

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

332

// Load values from matrix A (interleaved) and matrix B (transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

333

int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));

334

int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

335

336

c00 += (int16)a0.s0 * b0;

337

c10 += (int16)a0.s1 * b0;

338

c20 += (int16)a0.s2 * b0;

339

c30 += (int16)a0.s3 * b0;

340

}

341

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

342

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

343

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

344

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

345

// Multiply by the weight of matrix product

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

346

c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;

347

c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;

348

c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;

349

c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;

350

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

351

// Store 4x16 block

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

352

vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));

353

vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));

354

vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));

355

vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));

356

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

357

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

358

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

359

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

360

*

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

361

* @attention The number of columns of matrix B and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

362

*

363

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

364

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

365

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

366

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

367

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

368

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

369

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

370

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

371

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

372

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

373

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

374

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

375

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

376

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

377

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

378

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

379

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

380

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

381

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

382

__kernel void gemm_mm_interleaved_transposed_f32_midgard(IMAGE_DECLARATION(src0),

383

IMAGE_DECLARATION(src1),

384

IMAGE_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

385

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

386

// src_addr.s0 = address of matrix A

387

// src_addr.s1 = address of matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

388

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

389

// Compute address for matrix A and B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

390

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

391

(src1_stride_y));

392

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

393

// Add offset_first_element_in_bytes

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

394

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

395

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

396

// Divide by 4 in order to get the src_addr in unit of float

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

397

src_addr = src_addr >> 2;

398

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

399

// Compute end row address for matrix B

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

400

int end_row_mtx_b = src_addr.s1 + COLS_B;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

401

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

402

// Reset accumulators

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

float4 c00 = 0.0f;

float4 c10 = 0.0f;

float4 c20 = 0.0f;

float4 c30 = 0.0f;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))

409

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

410

// Load values from matrix A (interleaved) and matrix B (transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

411

float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);

412

float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);

413

414

c00 += (float4)a0.s0 * b0;

415

c10 += (float4)a0.s1 * b0;

416

c20 += (float4)a0.s2 * b0;

417

c30 += (float4)a0.s3 * b0;

418

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

419

// Load values from matrix A (interleaved) and matrix B (transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

420

a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);

421

b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);

422

423

c00 += (float4)a0.s0 * b0;

424

c10 += (float4)a0.s1 * b0;

425

c20 += (float4)a0.s2 * b0;

426

c30 += (float4)a0.s3 * b0;

427

}

428

429

for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))

430

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

431

// Load values from matrix A (interleaved) and matrix B (transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

432

float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);

433

float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);

434

435

c00 += (float4)a0.s0 * b0;

436

c10 += (float4)a0.s1 * b0;

437

c20 += (float4)a0.s2 * b0;

438

c30 += (float4)a0.s3 * b0;

439

}

440

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

441

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

442

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

443

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

444

#if defined(ALPHA)

445

// Multiply by the weight of matrix product

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

446

c00 = c00 * (float4)ALPHA;

447

c10 = c10 * (float4)ALPHA;

448

c20 = c20 * (float4)ALPHA;

449

c30 = c30 * (float4)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

450

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

451

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

452

// Store 4x4 block

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

453

vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));

454

vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));

455

vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));

456

vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));

457

}

458

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

459

/** This OpenCL kernel is optimized for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

460

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

461

*

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

462

* @attention The number of matrix B columns and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

463

*

464

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

465

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

466

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

467

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

468

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

469

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

470

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

471

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

472

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

473

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

474

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

475

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

476

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

477

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

478

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

479

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

480

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

481

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

482

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

483

__kernel void gemm_mm_interleaved_transposed_f32_bifrost(IMAGE_DECLARATION(src0),

484

IMAGE_DECLARATION(src1),

485

IMAGE_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

486

{

487

// src_addr_a = address of matrix A

488

// src_addr_b = address of matrix B

489

__global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);

490

__global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);

491

492

// Compute end row address for matrix B

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

493

__global float *src_end_addr_b = src_addr_b + COLS_B;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

494

495

// Reset accumulators

float c00 = 0.0f;

float c01 = 0.0f;

float c02 = 0.0f;

float c03 = 0.0f;

float c10 = 0.0f;

float c11 = 0.0f;

float c12 = 0.0f;

float c13 = 0.0f;

float c20 = 0.0f;

float c21 = 0.0f;

float c22 = 0.0f;

float c23 = 0.0f;

float c30 = 0.0f;

float c31 = 0.0f;

float c32 = 0.0f;

float c33 = 0.0f;

for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)

514

{

515

// Load values from matrix A (interleaved) and matrix B (transposed)

516

float4 a0 = vload4(0, src_addr_a);

517

float4 b0 = vload4(0, src_addr_b);

518

519

c00 = fma(a0.s0, b0.s0, c00);

520

c01 = fma(a0.s0, b0.s1, c01);

521

c02 = fma(a0.s0, b0.s2, c02);

522

c03 = fma(a0.s0, b0.s3, c03);

523

524

c10 = fma(a0.s1, b0.s0, c10);

525

c11 = fma(a0.s1, b0.s1, c11);

526

c12 = fma(a0.s1, b0.s2, c12);

527

c13 = fma(a0.s1, b0.s3, c13);

528

529

c20 = fma(a0.s2, b0.s0, c20);

530

c21 = fma(a0.s2, b0.s1, c21);

531

c22 = fma(a0.s2, b0.s2, c22);

532

c23 = fma(a0.s2, b0.s3, c23);

533

534

c30 = fma(a0.s3, b0.s0, c30);

535

c31 = fma(a0.s3, b0.s1, c31);

536

c32 = fma(a0.s3, b0.s2, c32);

537

c33 = fma(a0.s3, b0.s3, c33);

538

539

// Load values from matrix A (interleaved) and matrix B (transposed)

540

a0 = vload4(0, src_addr_a + 4);

541

b0 = vload4(0, src_addr_b + 4);

542

543

c00 = fma(a0.s0, b0.s0, c00);

544

c01 = fma(a0.s0, b0.s1, c01);

545

c02 = fma(a0.s0, b0.s2, c02);

546

c03 = fma(a0.s0, b0.s3, c03);

547

548

c10 = fma(a0.s1, b0.s0, c10);

549

c11 = fma(a0.s1, b0.s1, c11);

550

c12 = fma(a0.s1, b0.s2, c12);

551

c13 = fma(a0.s1, b0.s3, c13);

552

553

c20 = fma(a0.s2, b0.s0, c20);

554

c21 = fma(a0.s2, b0.s1, c21);

555

c22 = fma(a0.s2, b0.s2, c22);

556

c23 = fma(a0.s2, b0.s3, c23);

557

558

c30 = fma(a0.s3, b0.s0, c30);

559

c31 = fma(a0.s3, b0.s1, c31);

560

c32 = fma(a0.s3, b0.s2, c32);

561

c33 = fma(a0.s3, b0.s3, c33);

562

563

// Load values from matrix A (interleaved) and matrix B (transposed)

564

a0 = vload4(0, src_addr_a + 8);

565

b0 = vload4(0, src_addr_b + 8);

566

567

c00 = fma(a0.s0, b0.s0, c00);

568

c01 = fma(a0.s0, b0.s1, c01);

569

c02 = fma(a0.s0, b0.s2, c02);

570

c03 = fma(a0.s0, b0.s3, c03);

571

572

c10 = fma(a0.s1, b0.s0, c10);

573

c11 = fma(a0.s1, b0.s1, c11);

574

c12 = fma(a0.s1, b0.s2, c12);

575

c13 = fma(a0.s1, b0.s3, c13);

576

577

c20 = fma(a0.s2, b0.s0, c20);

578

c21 = fma(a0.s2, b0.s1, c21);

579

c22 = fma(a0.s2, b0.s2, c22);

580

c23 = fma(a0.s2, b0.s3, c23);

581

582

c30 = fma(a0.s3, b0.s0, c30);

583

c31 = fma(a0.s3, b0.s1, c31);

584

c32 = fma(a0.s3, b0.s2, c32);

585

c33 = fma(a0.s3, b0.s3, c33);

586

587

// Load values from matrix A (interleaved) and matrix B (transposed)

588

a0 = vload4(0, src_addr_a + 12);

589

b0 = vload4(0, src_addr_b + 12);

590

591

c00 = fma(a0.s0, b0.s0, c00);

592

c01 = fma(a0.s0, b0.s1, c01);

593

c02 = fma(a0.s0, b0.s2, c02);

594

c03 = fma(a0.s0, b0.s3, c03);

595

596

c10 = fma(a0.s1, b0.s0, c10);

597

c11 = fma(a0.s1, b0.s1, c11);

598

c12 = fma(a0.s1, b0.s2, c12);

599

c13 = fma(a0.s1, b0.s3, c13);

600

601

c20 = fma(a0.s2, b0.s0, c20);

602

c21 = fma(a0.s2, b0.s1, c21);

603

c22 = fma(a0.s2, b0.s2, c22);

604

c23 = fma(a0.s2, b0.s3, c23);

605

606

c30 = fma(a0.s3, b0.s0, c30);

607

c31 = fma(a0.s3, b0.s1, c31);

608

c32 = fma(a0.s3, b0.s2, c32);

609

c33 = fma(a0.s3, b0.s3, c33);

610

}

611

612

for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)

613

{

614

// Load values from matrix A (interleaved) and matrix B (transposed)

615

float4 a0 = vload4(0, src_addr_a);

616

float4 b0 = vload4(0, src_addr_b);

617

618

c00 = fma(a0.s0, b0.s0, c00);

619

c01 = fma(a0.s0, b0.s1, c01);

620

c02 = fma(a0.s0, b0.s2, c02);

621

c03 = fma(a0.s0, b0.s3, c03);

622

623

c10 = fma(a0.s1, b0.s0, c10);

624

c11 = fma(a0.s1, b0.s1, c11);

625

c12 = fma(a0.s1, b0.s2, c12);

626

c13 = fma(a0.s1, b0.s3, c13);

627

628

c20 = fma(a0.s2, b0.s0, c20);

629

c21 = fma(a0.s2, b0.s1, c21);

630

c22 = fma(a0.s2, b0.s2, c22);

631

c23 = fma(a0.s2, b0.s3, c23);

632

633

c30 = fma(a0.s3, b0.s0, c30);

634

c31 = fma(a0.s3, b0.s1, c31);

635

c32 = fma(a0.s3, b0.s2, c32);

636

c33 = fma(a0.s3, b0.s3, c33);

637

}

638

639

// Compute destination address

640

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

641

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

642

#if defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

643

// Multiply by the weight of matrix product

c00 = c00 * ALPHA;

c01 = c01 * ALPHA;

c02 = c02 * ALPHA;

c03 = c03 * ALPHA;

c10 = c10 * ALPHA;

c11 = c11 * ALPHA;

c12 = c12 * ALPHA;

c13 = c13 * ALPHA;

c20 = c20 * ALPHA;

c21 = c21 * ALPHA;

c22 = c22 * ALPHA;

c23 = c23 * ALPHA;

c30 = c30 * ALPHA;

c31 = c31 * ALPHA;

c32 = c32 * ALPHA;

c33 = c33 * ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

660

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

661

662

barrier(CLK_GLOBAL_MEM_FENCE);

663

664

// Store 4x4 block

665

vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));

666

vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));

667

vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));

668

vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));

669

}

670

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

671

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

672

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

673

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

674

*

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

675

* @attention The number of matrix B columns and the optional alpha's value need to be passed at compile time using -DCOLS_B and -DALPHA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

676

*

677

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

678

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

679

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

680

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

681

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

682

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

683

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

684

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

685

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

686

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

687

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

688

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

689

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

690

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

691

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

692

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

693

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

694

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

695

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

696

__kernel void gemm_mm_interleaved_transposed_f16(IMAGE_DECLARATION(src0),

697

IMAGE_DECLARATION(src1),

698

IMAGE_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

699

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

700

// src_addr.s0 = address of matrix A

701

// src_addr.s1 = address of matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

702

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

703

// Compute address for matrix A and B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

704

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

705

(src1_stride_y));

706

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

707

// Add offset_first_element_in_bytes

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

708

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

709

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

710

// Divide by 2 in order to get the src_addr in unit of half

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

711

src_addr = src_addr >> 1;

712

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

713

// Compute end row address for matrix B

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

714

int end_row_mtx_b = src_addr.s1 + COLS_B;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

715

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

716

// Reset accumulators

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

half8 c00 = 0.0f;

half8 c10 = 0.0f;

half8 c20 = 0.0f;

half8 c30 = 0.0f;

Moritz Pflanzer

2017-07-21 15:55:28 +0100

[diff] [blame]

722

for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(8, 16))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

723

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

724

// Load values from matrix A (interleaved) and matrix B (transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

725

half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);

726

half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);

727

728

c00 += (half8)a0.s0 * b0;

729

c10 += (half8)a0.s1 * b0;

730

c20 += (half8)a0.s2 * b0;

731

c30 += (half8)a0.s3 * b0;

732

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

733

// Load values from matrix A (interleaved) and matrix B (transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

734

a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);

735

b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);

736

737

c00 += (half8)a0.s0 * b0;

738

c10 += (half8)a0.s1 * b0;

739

c20 += (half8)a0.s2 * b0;

740

c30 += (half8)a0.s3 * b0;

741

}

742

743

for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))

744

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

745

// Load values from matrix A (interleaved) and matrix B (transposed)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

746

half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);

747

half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);

748

749

c00 += (half8)a0.s0 * b0;

750

c10 += (half8)a0.s1 * b0;

751

c20 += (half8)a0.s2 * b0;

752

c30 += (half8)a0.s3 * b0;

753

}

754

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

755

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

756

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

757

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

758

#if defined(ALPHA)

759

// Multiply by the weight of matrix product

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

760

c00 = c00 * (half8)ALPHA;

761

c10 = c10 * (half8)ALPHA;

762

c20 = c20 * (half8)ALPHA;

763

c30 = c30 * (half8)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

764

#endif // defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

765

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

766

// Store 4x8 block

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

767

vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));

768

vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));

769

vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));

770

vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));

771

}

Matthew Bentham

6f31f8c

2017-10-27 11:50:06 +0100

[diff] [blame]

772

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

773

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

774

#if defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

775

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision

776

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication

777

*

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

778

* @attention The number of matrix B columns, the optional alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

779

*

780

* @note: ALPHA must be passed in 8 bit fixed point format

781

*

782

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8

783

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

784

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

785

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

786

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

787

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

788

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

789

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

790

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

791

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

792

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

793

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

794

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

795

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

796

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

797

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

798

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

799

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

800

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

801

__kernel void gemm_mm_interleaved_transposed_qs8(IMAGE_DECLARATION(src0),

802

IMAGE_DECLARATION(src1),

803

IMAGE_DECLARATION(dst))

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

804

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

805

// src_addr.s0 = address of matrix A

806

// src_addr.s1 = address of matrix B

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

807

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

808

// Compute address for matrix A and B

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

809

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

810

(src1_stride_y));

811

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

812

// Add offset_first_element_in_bytes

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

813

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

814

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

815

// Compute end row address for matrix B

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

816

int end_row_mtx_b = src_addr.s1 + COLS_B;

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

817

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

818

// Reset accumulators

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

short8 c00 = 0.0f;

short8 c10 = 0.0f;

short8 c20 = 0.0f;

short8 c30 = 0.0f;

short8 c01 = 0.0f;

short8 c11 = 0.0f;

short8 c21 = 0.0f;

short8 c31 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

828

// This for loop performs 1 accumulation for each iteration

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

829

for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))

830

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

831

// Load values from matrix A (interleaved) and matrix B (transposed)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

832

char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);

833

char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);

834

835

c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);

836

c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);

837

c20 = mlal_sat_qs8x8(c20, (char8)a0.s2, b0.s01234567, FIXED_POINT_POSITION);

838

c30 = mlal_sat_qs8x8(c30, (char8)a0.s3, b0.s01234567, FIXED_POINT_POSITION);

839

840

c01 = mlal_sat_qs8x8(c01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);

841

c11 = mlal_sat_qs8x8(c11, (char8)a0.s1, b0.s89ABCDEF, FIXED_POINT_POSITION);

842

c21 = mlal_sat_qs8x8(c21, (char8)a0.s2, b0.s89ABCDEF, FIXED_POINT_POSITION);

843

c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);

844

}

845

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

846

// Compute destination address

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

847

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

848

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

849

// Multiply by the weight of matrix product

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

850

char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));

851

char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));

852

char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));

853

char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));

854

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

855

#if defined(ALPHA)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

856

c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

857

c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

858

c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

859

c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

860

#endif // defined(ALPHA)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

861

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

862

// Store 16x4 block

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

863

vstore16(c00_qs8, 0, (__global char *)(offset(&dst, 0, 0)));

864

vstore16(c10_qs8, 0, (__global char *)(offset(&dst, 0, 1)));

865

vstore16(c20_qs8, 0, (__global char *)(offset(&dst, 0, 2)));

866

vstore16(c30_qs8, 0, (__global char *)(offset(&dst, 0, 3)));

867

}

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

868

869

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision

870

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

871

*

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

872

* @attention The number of matrix B columns, the optional alpha's value and fixed point position need to be passed at compile time using -DCOLS_B -DALPHA and -DFIXED_POINT_POSITION

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

873

*

874

* @note: ALPHA must be passed in 16 bit fixed point format

875

*

876

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16

877

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

878

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

879

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

880

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

881

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

882

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

883

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

884

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

885

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

886

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

887

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

888

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

889

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

890

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

891

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

892

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

893

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

894

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

895

__kernel void gemm_mm_interleaved_transposed_qs16(IMAGE_DECLARATION(src0),

896

IMAGE_DECLARATION(src1),

897

IMAGE_DECLARATION(dst))

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

898

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

899

// src_addr.s0 = address of matrix A

900

// src_addr.s1 = address of matrix B

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

901

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

902

// Compute address for matrix A and B

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

903

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

904

(src1_stride_y));

905

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

906

// Add offset_first_element_in_bytes

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

907

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

908

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

909

// Divide by 2 in order to get the src_addr in unit of short

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

910

src_addr = src_addr >> 1;

911

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

912

// Compute end row address for matrix B

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

913

int end_row_mtx_b = src_addr.s1 + COLS_B;

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

914

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

915

// Reset accumulators

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

int8 c00 = 0.0f;

int8 c10 = 0.0f;

int8 c20 = 0.0f;

int8 c30 = 0.0f;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

921

// This for loop performs 1 accumulation for each iteration

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

922

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))

923

{

924

/* Load values from matrix A (interleaved) and matrix B (transposed) */

925

short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0);

926

short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1);

927

928

c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);

929

c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);

930

c20 = mlal_sat_qs16x8(c20, (short8)a0.s2, b0, FIXED_POINT_POSITION);

931

c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);

932

}

933

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

934

// Compute destination address

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

935

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

936

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

937

// Multiply by the weight of matrix product

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

938

short8 c00_qs16 = convert_short8_sat(c00);

939

short8 c10_qs16 = convert_short8_sat(c10);

940

short8 c20_qs16 = convert_short8_sat(c20);

941

short8 c30_qs16 = convert_short8_sat(c30);

942

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

943

#if defined(ALPHA)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

944

c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

945

c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

946

c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

947

c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

948

#endif // defined(ALPHA)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

949

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

950

// Store 8x4 block

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

951

vstore8(c00_qs16, 0, (__global short *)(offset(&dst, 0, 0)));

952

vstore8(c10_qs16, 0, (__global short *)(offset(&dst, 0, 1)));

953

vstore8(c20_qs16, 0, (__global short *)(offset(&dst, 0, 2)));

954

vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));

955

}

956

#endif // defined(FIXED_POINT_POSITION)

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

957

#endif // defined(COLS_B)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

958

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

959

#if defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

960

#if defined(DATA_TYPE)

961

#define VECTOR_TYPE VEC_DATA_TYPE(DATA_TYPE, NUM_ELEMS_PROCESSED_PER_THREAD_X)

962

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

963

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

964

* @note This OpenCL kernel works with floating point data types (F16/F32)

965

* @note The floating point data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

966

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

967

* @note The number of matrix A columns and the optional alpha's value need to be passed at compile time using -DCOLS_A and -DALPHA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

968

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

969

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

970

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

971

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

972

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

973

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

974

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

975

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

976

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

977

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

978

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

979

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

980

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

981

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

982

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

983

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

984

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

985

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

986

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

987

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

988

__kernel void gemm_mm_floating_point(IMAGE_DECLARATION(src0),

989

IMAGE_DECLARATION(src1),

990

IMAGE_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

991

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

992

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

993

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

994

// Compute starting address for matrix A and Matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

995

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

996

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

997

// Update address for the matrix A

998

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

999

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1000

// Update address for the matrix B

1001

src_addr.s1 += idx * sizeof(DATA_TYPE);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1002

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1003

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(DATA_TYPE));

1004

1005

VECTOR_TYPE acc0 = 0.0f;

1006

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1007

VECTOR_TYPE acc1 = 0.0f;

1008

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1009

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1010

VECTOR_TYPE acc2 = 0.0f;

1011

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1012

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1013

VECTOR_TYPE acc3 = 0.0f;

1014

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1015

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

1016

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(DATA_TYPE)); src_addr += (int2)(2 * sizeof(DATA_TYPE), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1017

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1018

// Load values from matrix A

1019

VEC_DATA_TYPE(DATA_TYPE, 2)

1020

a0 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1021

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1022

VEC_DATA_TYPE(DATA_TYPE, 2)

1023

a1 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1024

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1025

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1026

VEC_DATA_TYPE(DATA_TYPE, 2)

1027

a2 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1028

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1029

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1030

VEC_DATA_TYPE(DATA_TYPE, 2)

1031

a3 = vload2(0, (__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1032

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1033

// Load values from matrix B

1034

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

1035

VECTOR_TYPE b1 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1 + src1_stride_y));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1036

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1037

// Accumulate

1038

acc0 += b0 * (VECTOR_TYPE)a0.s0;

1039

acc0 += b1 * (VECTOR_TYPE)a0.s1;

1040

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1041

acc1 += b0 * (VECTOR_TYPE)a1.s0;

1042

acc1 += b1 * (VECTOR_TYPE)a1.s1;

1043

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1044

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1045

acc2 += b0 * (VECTOR_TYPE)a2.s0;

1046

acc2 += b1 * (VECTOR_TYPE)a2.s1;

1047

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1048

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1049

acc3 += b0 * (VECTOR_TYPE)a3.s0;

1050

acc3 += b1 * (VECTOR_TYPE)a3.s1;

1051

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1052

}

1053

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1054

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(DATA_TYPE), src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1055

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1056

// Load values from matrix A

1057

DATA_TYPE a0 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1058

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1059

DATA_TYPE a1 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1060

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1061

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1062

DATA_TYPE a2 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1063

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1064

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1065

DATA_TYPE a3 = *((__global DATA_TYPE *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1066

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1067

// Load values from matrix B

1068

VECTOR_TYPE b0 = VLOAD(NUM_ELEMS_PROCESSED_PER_THREAD_X)(0, (__global DATA_TYPE *)(src1_ptr + src_addr.s1));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1069

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1070

// Accumulate

1071

acc0 += b0 * (VECTOR_TYPE)a0;

1072

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1073

acc1 += b0 * (VECTOR_TYPE)a1;

1074

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1075

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1076

acc2 += b0 * (VECTOR_TYPE)a2;

1077

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1078

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1079

acc3 += b0 * (VECTOR_TYPE)a3;

1080

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1081

}

1082

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1083

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1084

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1085

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1086

// Multiply by the weight of matrix-matrix product and store the result

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1087

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1088

acc0 = acc0 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1089

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1090

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1091

(acc0, 0, (__global DATA_TYPE *)(offset(&dst, 0, 0)));

1092

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1093

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1094

acc1 = acc1 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1095

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1096

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1097

(acc1, 0, (__global DATA_TYPE *)(offset(&dst, 0, 1)));

1098

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1099

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1100

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1101

acc2 = acc2 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1102

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1103

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1104

(acc2, 0, (__global DATA_TYPE *)(offset(&dst, 0, 2)));

1105

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1106

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1107

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1108

acc3 = acc3 * (VECTOR_TYPE)ALPHA;

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1109

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1110

VSTORE(NUM_ELEMS_PROCESSED_PER_THREAD_X)

1111

(acc3, 0, (__global DATA_TYPE *)(offset(&dst, 0, 3)));

1112

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1113

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1114

#endif // defined(DATA_TYPE)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1115

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1116

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

1117

*

1118

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

1119

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1120

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=4.

1121

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1122

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha

1123

*

1124

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

1125

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1126

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1127

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1128

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1129

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1130

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1131

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1132

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1133

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1134

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1135

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1136

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1137

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1138

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1139

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1140

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1141

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1142

*/

1143

__kernel void gemm_mm_floating_point_f32_bifrost(IMAGE_DECLARATION(src0),

1144

IMAGE_DECLARATION(src1),

1145

IMAGE_DECLARATION(dst))

1146

{

1147

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1148

1149

// Compute starting address for matrix A and matrix B

1150

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1151

1152

// Update address for matrix A

1153

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1154

1155

// Update address for matrix B

1156

src_addr.s1 += idx * sizeof(float);

1157

1158

// Address boundary for matrix A

1159

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));

1160

1161

// Initialize accumulators

float acc00 = 0.0f;

float acc01 = 0.0f;

float acc02 = 0.0f;

float acc03 = 0.0f;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

float acc10 = 0.0f;

float acc11 = 0.0f;

float acc12 = 0.0f;

float acc13 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1173

1174

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

float acc20 = 0.0f;

float acc21 = 0.0f;

float acc22 = 0.0f;

float acc23 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1180

1181

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

float acc30 = 0.0f;

float acc31 = 0.0f;

float acc32 = 0.0f;

float acc33 = 0.0f;

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1187

1188

// A and B src indices get incremented at the same time.

1189

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

1190

{

1191

// Load values from matrix A

1192

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1193

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1194

float2 a1 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1195

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1196

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1197

float2 a2 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1198

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1199

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1200

float2 a3 = vload2(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1201

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1202

// Load values from matrix B

1203

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));

1204

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

1205

1206

// Multiply and accumulate

1207

acc00 = fma(a0.s0, b0.s0, acc00);

1208

acc00 = fma(a0.s1, b1.s0, acc00);

1209

acc01 = fma(a0.s0, b0.s1, acc01);

1210

acc01 = fma(a0.s1, b1.s1, acc01);

1211

acc02 = fma(a0.s0, b0.s2, acc02);

1212

acc02 = fma(a0.s1, b1.s2, acc02);

1213

acc03 = fma(a0.s1, b1.s3, acc03);

1214

acc03 = fma(a0.s0, b0.s3, acc03);

1215

1216

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1217

acc10 = fma(a1.s0, b0.s0, acc10);

1218

acc11 = fma(a1.s0, b0.s1, acc11);

1219

acc12 = fma(a1.s0, b0.s2, acc12);

1220

acc13 = fma(a1.s0, b0.s3, acc13);

1221

1222

acc10 = fma(a1.s1, b1.s0, acc10);

1223

acc11 = fma(a1.s1, b1.s1, acc11);

1224

acc12 = fma(a1.s1, b1.s2, acc12);

1225

acc13 = fma(a1.s1, b1.s3, acc13);

1226

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1227

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1228

acc20 = fma(a2.s0, b0.s0, acc20);

1229

acc21 = fma(a2.s0, b0.s1, acc21);

1230

acc22 = fma(a2.s0, b0.s2, acc22);

1231

acc23 = fma(a2.s0, b0.s3, acc23);

1232

1233

acc20 = fma(a2.s1, b1.s0, acc20);

1234

acc21 = fma(a2.s1, b1.s1, acc21);

1235

acc22 = fma(a2.s1, b1.s2, acc22);

1236

acc23 = fma(a2.s1, b1.s3, acc23);

1237

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1238

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1239

acc30 = fma(a3.s0, b0.s0, acc30);

1240

acc31 = fma(a3.s0, b0.s1, acc31);

1241

acc32 = fma(a3.s0, b0.s2, acc32);

1242

acc33 = fma(a3.s0, b0.s3, acc33);

1243

1244

acc30 = fma(a3.s1, b1.s0, acc30);

1245

acc31 = fma(a3.s1, b1.s1, acc31);

1246

acc32 = fma(a3.s1, b1.s2, acc32);

1247

acc33 = fma(a3.s1, b1.s3, acc33);

1248

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1249

}

1250

1251

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

1252

{

1253

// Load values from matrix A

1254

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1255

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1256

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1257

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1258

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1259

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1260

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1261

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1262

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1263

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1264

// Load values from matrix B

1265

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1266

1267

// Multiply and accumulate

1268

acc00 = fma(a0, b0.s0, acc00);

1269

acc01 = fma(a0, b0.s1, acc01);

1270

acc02 = fma(a0, b0.s2, acc02);

1271

acc03 = fma(a0, b0.s3, acc03);

1272

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1273

acc10 = fma(a1, b0.s0, acc10);

1274

acc11 = fma(a1, b0.s1, acc11);

1275

acc12 = fma(a1, b0.s2, acc12);

1276

acc13 = fma(a1, b0.s3, acc13);

1277

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1278

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1279

acc20 = fma(a2, b0.s0, acc20);

1280

acc21 = fma(a2, b0.s1, acc21);

1281

acc22 = fma(a2, b0.s2, acc22);

1282

acc23 = fma(a2, b0.s3, acc23);

1283

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1284

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1285

acc30 = fma(a3, b0.s0, acc30);

1286

acc31 = fma(a3, b0.s1, acc31);

1287

acc32 = fma(a3, b0.s2, acc32);

1288

acc33 = fma(a3, b0.s3, acc33);

1289

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1290

}

1291

1292

// Compute destination address

1293

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1294

1295

// Multiply by the weight of matrix-matrix product and store the result

1296

#if defined(ALPHA)

1297

acc00 = acc00 * ALPHA;

1298

acc01 = acc01 * ALPHA;

1299

acc02 = acc02 * ALPHA;

1300

acc03 = acc03 * ALPHA;

1301

#endif // defined(ALPHA)

1302

1303

float4 acc0 = ((float4)(acc00, acc01, acc02, acc03));

1304

vstore4(acc0, 0, (__global float *)(offset(&dst, 0, 0)));

1305

1306

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1307

#if defined(ALPHA)

1308

acc10 = acc10 * ALPHA;

1309

acc11 = acc11 * ALPHA;

1310

acc12 = acc12 * ALPHA;

1311

acc13 = acc13 * ALPHA;

1312

#endif // defined(ALPHA)

1313

float4 acc1 = ((float4)(acc10, acc11, acc12, acc13));

1314

vstore4(acc1, 0, (__global float *)(offset(&dst, 0, 1)));

1315

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1316

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1317

#if defined(ALPHA)

1318

acc20 = acc20 * ALPHA;

1319

acc21 = acc21 * ALPHA;

1320

acc22 = acc22 * ALPHA;

1321

acc23 = acc23 * ALPHA;

1322

#endif // defined(ALPHA)

1323

float4 acc2 = ((float4)(acc20, acc21, acc22, acc23));

1324

vstore4(acc2, 0, (__global float *)(offset(&dst, 0, 2)));

1325

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1326

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1327

#if defined(ALPHA)

1328

acc30 = acc30 * ALPHA;

1329

acc31 = acc31 * ALPHA;

1330

acc32 = acc32 * ALPHA;

1331

acc33 = acc33 * ALPHA;

1332

#endif // defined(ALPHA)

1333

float4 acc3 = ((float4)(acc30, acc31, acc32, acc33));

1334

vstore4(acc3, 0, (__global float *)(offset(&dst, 0, 3)));

1335

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1336

}

1337

1338

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not been reshaped

1339

*

1340

* @note This OpenCL kernel works with the 32-bit floating point data type (float) and uses the fma units.

1341

* This OpenCL kernel is optimized for Bifrost when the number of matrix B columns is less or equal to 1000.

1342

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y.

1343

* This kernel optimally uses -DNUM_ELEMS_PROCESSED_PER_THREAD_X=2.

1344

* @note The number of matrix A columns must be passed at compile time using -DCOLS_A.

1345

* @note The optional value of scalar alpha is passed at compile time using -DALPHA=alpha if alpha!=1.0f.

1346

*

1347

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16/F32

1348

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1349

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1350

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1351

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1352

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1353

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1354

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1355

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1356

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1357

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1358

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1359

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1360

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1361

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1362

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1363

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1364

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1365

*/

1366

__kernel void gemm_mm_floating_point_f32_bifrost_1000(IMAGE_DECLARATION(src0),

1367

IMAGE_DECLARATION(src1),

1368

IMAGE_DECLARATION(dst))

1369

{

1370

// Requires 2 NUM_ELEMS_PROCESSED_PER_THREAD_X, C vect2, A vect4, B (2 vload2) // to fix for NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1371

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

1372

1373

// Compute starting address for matrix A and Matrix B

1374

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1375

1376

// Update address for the matrix A

1377

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1378

1379

// Update address for the matrix B

1380

src_addr.s1 += idx * sizeof(float);

1381

1382

// Address boundary for the matrix A

1383

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(float));

1384

1385

// Initialize accumulators

float acc00 = 0.0f;

float acc01 = 0.0f;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1390

float acc10 = 0.0f;

1391

float acc11 = 0.0f;

1392

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1393

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1394

float acc20 = 0.0f;

1395

float acc21 = 0.0f;

1396

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1397

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1398

float acc30 = 0.0f;

1399

float acc31 = 0.0f;

1400

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1401

1402

// A and B src indices get incremented at the same time.

1403

for(; src_addr.s0 <= (end_row_vec_a - 4 * (int)sizeof(float)); src_addr += (int2)(4 * sizeof(float), 4 * src1_stride_y))

1404

{

1405

// Load values from matrix A

1406

float4 a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1407

1408

// Load values from matrix B

1409

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));

1410

float2 b1 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

1411

float2 b2 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));

1412

float2 b3 = vload2(0, (__global float *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));

1413

1414

// Multiply and accumulate

1415

acc00 = fma(a0.s0, b0.s0, acc00);

1416

acc00 = fma(a0.s1, b1.s0, acc00);

1417

acc00 = fma(a0.s2, b2.s0, acc00);

1418

acc00 = fma(a0.s3, b3.s0, acc00);

1419

1420

acc01 = fma(a0.s0, b0.s1, acc01);

1421

acc01 = fma(a0.s1, b1.s1, acc01);

1422

acc01 = fma(a0.s2, b2.s1, acc01);

1423

acc01 = fma(a0.s3, b3.s1, acc01);

1424

1425

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1426

a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1427

acc10 = fma(a0.s0, b0.s0, acc10);

1428

acc10 = fma(a0.s1, b1.s0, acc10);

1429

acc10 = fma(a0.s2, b2.s0, acc10);

1430

acc10 = fma(a0.s3, b3.s0, acc10);

1431

1432

acc11 = fma(a0.s0, b0.s1, acc11);

1433

acc11 = fma(a0.s1, b1.s1, acc11);

1434

acc11 = fma(a0.s2, b2.s1, acc11);

1435

acc11 = fma(a0.s3, b3.s1, acc11);

1436

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1437

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1438

a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1439

acc20 = fma(a0.s0, b0.s0, acc20);

1440

acc20 = fma(a0.s1, b1.s0, acc20);

1441

acc20 = fma(a0.s2, b2.s0, acc20);

1442

acc20 = fma(a0.s3, b3.s0, acc20);

1443

1444

acc21 = fma(a0.s0, b0.s1, acc21);

1445

acc21 = fma(a0.s1, b1.s1, acc21);

1446

acc21 = fma(a0.s2, b2.s1, acc21);

1447

acc21 = fma(a0.s3, b3.s1, acc21);

1448

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1449

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1450

a0 = vload4(0, (__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1451

acc30 = fma(a0.s0, b0.s0, acc30);

1452

acc30 = fma(a0.s1, b1.s0, acc30);

1453

acc30 = fma(a0.s2, b2.s0, acc30);

1454

acc30 = fma(a0.s3, b3.s0, acc30);

1455

1456

acc31 = fma(a0.s0, b0.s1, acc31);

1457

acc31 = fma(a0.s1, b1.s1, acc31);

1458

acc31 = fma(a0.s2, b2.s1, acc31);

1459

acc31 = fma(a0.s3, b3.s1, acc31);

1460

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1461

}

1462

// float size increment

1463

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(4, src1_stride_y))

1464

{

1465

// Load values from matrix A

1466

float a0 = *((__global float *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1467

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1468

float a1 = *((__global float *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1469

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1470

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1471

float a2 = *((__global float *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1472

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1473

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1474

float a3 = *((__global float *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1475

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1476

// Load values from matrix B

1477

float2 b0 = vload2(0, (__global float *)(src1_ptr + src_addr.s1));

1478

1479

// Multiply and accumulate

1480

acc00 = fma(a0, b0.s0, acc00);

1481

acc01 = fma(a0, b0.s1, acc01);

1482

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1483

acc10 = fma(a1, b0.s0, acc10);

1484

acc11 = fma(a1, b0.s1, acc11);

1485

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1486

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1487

acc20 = fma(a2, b0.s0, acc20);

1488

acc21 = fma(a2, b0.s1, acc21);

1489

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1490

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1491

acc30 = fma(a3, b0.s0, acc30);

1492

acc31 = fma(a3, b0.s1, acc31);

1493

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1494

}

1495

1496

// Compute destination address

1497

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1498

1499

// Multiply by the weight of matrix-matrix product and store the result

1500

#if defined(ALPHA)

1501

acc00 = acc00 * ALPHA;

1502

acc01 = acc01 * ALPHA;

1503

#endif // defined(ALPHA)

1504

float2 acc0 = ((float2)(acc00, acc01));

1505

vstore2(acc0, 0, (__global float *)(offset(&dst, 0, 0)));

1506

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1507

#if defined(ALPHA)

1508

acc10 = acc10 * ALPHA;

1509

acc11 = acc11 * ALPHA;

1510

#endif // defined(ALPHA)

1511

float2 acc1 = ((float2)(acc10, acc11));

1512

vstore2(acc1, 0, (__global float *)(offset(&dst, 0, 1)));

1513

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1514

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1515

#if defined(ALPHA)

1516

acc20 = acc20 * ALPHA;

1517

acc21 = acc21 * ALPHA;

1518

#endif // defined(ALPHA)

1519

float2 acc2 = ((float2)(acc20, acc21));

1520

vstore2(acc2, 0, (__global float *)(offset(&dst, 0, 2)));

1521

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1522

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1523

#if defined(ALPHA)

1524

acc30 = acc30 * ALPHA;

1525

acc31 = acc31 * ALPHA;

1526

#endif // defined(ALPHA)

1527

float2 acc3 = (float2)(acc30, acc31);

1528

vstore2(acc3, 0, (__global float *)(offset(&dst, 0, 3)));

1529

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1530

}

1531

1532

#if defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1533

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1534

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1535

* @note This OpenCL kernel works with fixed point data types QS8

1536

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1537

* @note The number matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1538

* @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1539

* @note The optional alpha value must be passed in 8 bit fixed point format using -DALPHA

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1540

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1541

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1542

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1543

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1544

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1545

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1546

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1547

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1548

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1549

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1550

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1551

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1552

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1553

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1554

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1555

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1556

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1557

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1558

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1559

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1560

__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1561

IMAGE_DECLARATION(src1),

1562

IMAGE_DECLARATION(dst))

1563

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1564

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1565

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1566

// Compute starting address for matrix A and Matrix B

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1567

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1568

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1569

// Update address for the matrix A

1570

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1571

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1572

// Update address for the matrix B

1573

src_addr.s1 += idx * sizeof(char);

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1574

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1575

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(char));

short8 acc00 = 0;

short8 acc01 = 0;

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1580

short8 acc10 = 0;

1581

short8 acc11 = 0;

1582

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1583

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1584

short8 acc20 = 0;

1585

short8 acc21 = 0;

1586

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1587

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1588

short8 acc30 = 0;

1589

short8 acc31 = 0;

1590

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1591

1592

// This for loop performs 4 accumulations per iteration

1593

for(; src_addr.s0 <= (end_row_vec_a - 2); src_addr += (int2)(2, 2 * src1_stride_y))

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1594

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1595

char2 a0 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1596

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1597

char2 a1 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1598

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1599

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1600

char2 a2 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1601

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1602

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1603

char2 a3 = vload2(0, (__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1604

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1605

char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));

1606

char16 b1 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1607

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1608

acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);

1609

acc00 = mlal_sat_qs8x8(acc00, (char8)a0.s1, b1.s01234567, FIXED_POINT_POSITION);

1610

acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);

1611

acc01 = mlal_sat_qs8x8(acc01, (char8)a0.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);

1612

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1613

acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s0, b0.s01234567, FIXED_POINT_POSITION);

1614

acc10 = mlal_sat_qs8x8(acc10, (char8)a1.s1, b1.s01234567, FIXED_POINT_POSITION);

1615

acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);

1616

acc11 = mlal_sat_qs8x8(acc11, (char8)a1.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);

1617

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1618

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1619

acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s0, b0.s01234567, FIXED_POINT_POSITION);

1620

acc20 = mlal_sat_qs8x8(acc20, (char8)a2.s1, b1.s01234567, FIXED_POINT_POSITION);

1621

acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);

1622

acc21 = mlal_sat_qs8x8(acc21, (char8)a2.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);

1623

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1624

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1625

acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s0, b0.s01234567, FIXED_POINT_POSITION);

1626

acc30 = mlal_sat_qs8x8(acc30, (char8)a3.s1, b1.s01234567, FIXED_POINT_POSITION);

1627

acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);

1628

acc31 = mlal_sat_qs8x8(acc31, (char8)a3.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);

1629

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1630

}

1631

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1632

// Left-over accumulations

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1633

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))

1634

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1635

char a0 = *((__global char *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1636

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1637

char a1 = *((__global char *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1638

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1639

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1640

char a2 = *((__global char *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1641

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1642

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1643

char a3 = *((__global char *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1644

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1645

char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1));

1646

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1647

acc00 = mlal_sat_qs8x8(acc00, (char8)a0, b0.s01234567, FIXED_POINT_POSITION);

1648

acc01 = mlal_sat_qs8x8(acc01, (char8)a0, b0.s89ABCDEF, FIXED_POINT_POSITION);

1649

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1650

acc10 = mlal_sat_qs8x8(acc10, (char8)a1, b0.s01234567, FIXED_POINT_POSITION);

1651

acc11 = mlal_sat_qs8x8(acc11, (char8)a1, b0.s89ABCDEF, FIXED_POINT_POSITION);

1652

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1653

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1654

acc20 = mlal_sat_qs8x8(acc20, (char8)a2, b0.s01234567, FIXED_POINT_POSITION);

1655

acc21 = mlal_sat_qs8x8(acc21, (char8)a2, b0.s89ABCDEF, FIXED_POINT_POSITION);

1656

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1657

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1658

acc30 = mlal_sat_qs8x8(acc30, (char8)a3, b0.s01234567, FIXED_POINT_POSITION);

1659

acc31 = mlal_sat_qs8x8(acc31, (char8)a3, b0.s89ABCDEF, FIXED_POINT_POSITION);

1660

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1661

}

1662

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1663

// Compute destination address

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1664

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1665

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1666

// Multiply by the weight of matrix product and store the result

1667

char16 acc_qs8;

1668

acc_qs8 = convert_char16_sat((short16)(acc00, acc01));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1669

#if defined(ALPHA)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1670

acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1671

#endif // defined(ALPHA)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1672

vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 0)));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1673

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1674

acc_qs8 = convert_char16_sat((short16)(acc10, acc11));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1675

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1676

acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1677

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1678

vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 1)));

1679

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1680

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1681

acc_qs8 = convert_char16_sat((short16)(acc20, acc21));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1682

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1683

acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1684

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1685

vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 2)));

1686

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1687

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1688

acc_qs8 = convert_char16_sat((short16)(acc30, acc31));

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1689

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1690

acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1691

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1692

vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 3)));

1693

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1694

}

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1695

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1696

/** This OpenCL kernel computes the matrix by matrix multiplication between the matrix A (src0) and matrix B (src1) in case both matrices have not beed reshaped

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1697

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1698

* @note This OpenCL kernel works with fixed point data types QS16

1699

* @note The number of elements processed along the x and y directions must be passed at compile time using -DNUM_ELEMS_PROCESSED_PER_THREAD_X and -DNUM_ELEMS_PROCESSED_PER_THREAD_Y

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1700

* @note The number of matrix A columns, the number of elements processed per thread along the Y direction and the alpha's value need to be passed at compile time using -DCOLS_A, -DNUM_ELEMS_PROCESSED_PER_THREAD_Y and -DALPHA

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1701

* @note The fixed point position need to be passed at compile time using -DFIXED_POINT_POSITION

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1702

* @note The optional alpha value must be passed in 16 bit fixed point format using -DALPHA

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1703

*

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1704

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8/QS16

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1705

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1706

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1707

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1708

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1709

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1710

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1711

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1712

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1713

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1714

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1715

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1716

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1717

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1718

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1719

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1720

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1721

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1722

*/

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1723

__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1724

IMAGE_DECLARATION(src1),

1725

IMAGE_DECLARATION(dst))

1726

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1727

int idx = get_global_id(0) * NUM_ELEMS_PROCESSED_PER_THREAD_X;

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1728

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1729

// Compute starting address for matrix A and Matrix B

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1730

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1731

1732

// Update address for the matrix A

1733

src_addr.s0 += get_global_id(1) * src0_stride_y * NUM_ELEMS_PROCESSED_PER_THREAD_Y;

1734

1735

// Update address for the matrix B

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1736

src_addr.s1 += idx * sizeof(short);

1737

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1738

int end_row_vec_a = src_addr.s0 + (COLS_A * sizeof(short));

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1739

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1740

int8 acc0 = 0;

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1741

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1742

int8 acc1 = 0;

1743

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1744

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1745

int8 acc2 = 0;

1746

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1747

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1748

int8 acc3 = 0;

1749

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1750

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1751

// This for loop performs 4 accumulations per iteration

Georgios Pinitas

96880cf

2017-10-20 18:52:20 +0100

[diff] [blame]

1752

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(short)); src_addr += (int2)(2 * sizeof(short), 2 * src1_stride_y))

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1753

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1754

short2 a0 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1755

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1756

short2 a1 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1757

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1758

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1759

short2 a2 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1760

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1761

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1762

short2 a3 = vload2(0, (__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1763

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1764

short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));

1765

short8 b1 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1766

1767

acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s0, b0, FIXED_POINT_POSITION);

1768

acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s1, b1, FIXED_POINT_POSITION);

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1769

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1770

acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s0, b0, FIXED_POINT_POSITION);

1771

acc1 = mlal_sat_qs16x8(acc1, (short8)a1.s1, b1, FIXED_POINT_POSITION);

1772

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1773

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1774

acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s0, b0, FIXED_POINT_POSITION);

1775

acc2 = mlal_sat_qs16x8(acc2, (short8)a2.s1, b1, FIXED_POINT_POSITION);

1776

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1777

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1778

acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s0, b0, FIXED_POINT_POSITION);

1779

acc3 = mlal_sat_qs16x8(acc3, (short8)a3.s1, b1, FIXED_POINT_POSITION);

1780

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1781

}

1782

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1783

// Left-over accumulations

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1784

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(short), src1_stride_y))

1785

{

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1786

short a0 = *((__global short *)(src0_ptr + src_addr.s0 + 0 * src0_stride_y));

1787

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1788

short a1 = *((__global short *)(src0_ptr + src_addr.s0 + 1 * src0_stride_y));

1789

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1790

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1791

short a2 = *((__global short *)(src0_ptr + src_addr.s0 + 2 * src0_stride_y));

1792

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1793

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1794

short a3 = *((__global short *)(src0_ptr + src_addr.s0 + 3 * src0_stride_y));

1795

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1796

short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1));

1797

1798

acc0 = mlal_sat_qs16x8(acc0, (short8)a0, b0, FIXED_POINT_POSITION);

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1799

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1800

acc1 = mlal_sat_qs16x8(acc1, (short8)a1, b0, FIXED_POINT_POSITION);

1801

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1802

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1803

acc2 = mlal_sat_qs16x8(acc2, (short8)a2, b0, FIXED_POINT_POSITION);

1804

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1805

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1806

acc3 = mlal_sat_qs16x8(acc3, (short8)a3, b0, FIXED_POINT_POSITION);

1807

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1808

}

1809

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1810

// Compute destination address

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1811

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1812

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1813

// Multiply by the weight of matrix product and store the result

1814

short8 acc_qs16;

1815

acc_qs16 = convert_short8_sat(acc0);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1816

#if defined(ALPHA)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1817

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1818

#endif // defined(ALPHA)

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1819

vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 0)));

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1820

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1821

acc_qs16 = convert_short8_sat(acc1);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1822

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1823

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1824

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1825

vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 1)));

1826

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 1

1827

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1828

acc_qs16 = convert_short8_sat(acc2);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1829

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1830

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1831

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1832

vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 2)));

1833

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 2

1834

#if NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

1835

acc_qs16 = convert_short8_sat(acc3);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1836

#if defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1837

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1838

#endif // defined(ALPHA)

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1839

vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 3)));

1840

#endif // NUM_ELEMS_PROCESSED_PER_THREAD_Y > 3

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1841

}

Gian Marco Iodice

2017-08-15 11:45:22 +0100

[diff] [blame]

1842

#endif // defined(FIXED_POINT_POSITION)

1843

#endif // defined(COLS_A) && defined(NUM_ELEMS_PROCESSED_PER_THREAD_X) && (NUM_ELEMS_PROCESSED_PER_THREAD_Y)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1844

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1845

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1846

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

1847

*

1848

* @attention The beta's value need to be passed at compile time using -DBETA

1849

*

1850

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

1851

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

1852

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1853

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

1854

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1855

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1856

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1857

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1858

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1859

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1860

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1861

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1862

*/

1863

__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),

1864

IMAGE_DECLARATION(dst))

1865

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1866

// Compute source and destination addresses

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1867

Image src = CONVERT_TO_IMAGE_STRUCT(src);

1868

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1869

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1870

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1871

float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

1872

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1873

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1874

float4 c = vload4(0, (__global float *)src.ptr);

1875

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1876

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1877

float4 out = alpha_ab + (float4)BETA * c;

1878

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1879

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1880

vstore4(out, 0, (__global float *)dst.ptr);

1881

}

1882

1883

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

1884

*

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1885

* @attention The beta's value need to be passed at compile time using -DBETA

1886

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1887

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

1888

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

1889

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1890

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

1891

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1892

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1893

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1894

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1895

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1896

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1897

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1898

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1899

*/

1900

__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),

1901

IMAGE_DECLARATION(dst))

1902

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1903

// Compute source and destination addresses

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1904

Image src = CONVERT_TO_IMAGE_STRUCT(src);

1905

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1906

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1907

// Load values from A x B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1908

half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

1909

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1910

// Load values from Matrix C

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1911

half8 c = vload8(0, (__global half *)src.ptr);

1912

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1913

// Computes alpha * axb + beta * c

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1914

half8 out = alpha_ab + (half8)BETA * c;

1915

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1916

// Store final result in axb matrix

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1917

vstore8(out, 0, (__global half *)dst.ptr);

1918

}

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1919

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1920

#if defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1921

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 8 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:

1922

*

1923

* @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION

1924

*

1925

* @note: BETA must be passed in 8 bit fixed point format

1926

*

1927

* @param[in] src_ptr Pointer to the source matrix. Supported data types: QS8

1928

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

1929

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1930

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

1931

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1932

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

1933

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

1934

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1935

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1936

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1937

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1938

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1939

*/

1940

__kernel void gemm_ma_qs8(IMAGE_DECLARATION(src),

1941

IMAGE_DECLARATION(dst))

1942

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1943

// Compute source and destination addresses

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1944

Image src = CONVERT_TO_IMAGE_STRUCT(src);

1945

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1946

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1947

// Load values from A x B

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1948

char16 alpha_ab = vload16(0, (__global char *)dst.ptr);

1949

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1950

// Load values from Matrix C

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1951

char16 c = vload16(0, (__global char *)src.ptr);

1952

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1953

// Computes alpha * axb + beta * c

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1954

char16 out = mla_sat_qs8x16(alpha_ab, (char16)BETA, c, FIXED_POINT_POSITION);

1955

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1956

// Store final result in axb matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1957

vstore16(out, 0, (__global char *)dst.ptr);

1958

}

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1959

1960

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 16 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:

1961

*

1962

* @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION

1963

*

1964

* @note: BETA must be passed in 16 bit fixed point format

1965

*

1966

* @param[in] src_ptr Pointer to the source matrix. Supported data types: QS16

1967

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

1968

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1969

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

1970

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1971

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

1972

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

1973

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1974

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1975

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1976

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1977

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1978

*/

1979

__kernel void gemm_ma_qs16(IMAGE_DECLARATION(src),

1980

IMAGE_DECLARATION(dst))

1981

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1982

// Compute source and destination addresses

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1983

Image src = CONVERT_TO_IMAGE_STRUCT(src);

1984

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1985

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1986

// Load values from A x B

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1987

short8 alpha_ab = vload8(0, (__global short *)dst.ptr);

1988

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1989

// Load values from Matrix C

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1990

short8 c = vload8(0, (__global short *)src.ptr);

1991

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1992

// Computes alpha * axb + beta * c

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1993

short8 out = mla_sat_qs16x8(alpha_ab, (short8)BETA, c, FIXED_POINT_POSITION);

1994

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1995

// Store final result in axb matrix

Gian Marco Iodice

2017-07-03 17:41:47 +0100

[diff] [blame]

1996

vstore8(out, 0, (__global short *)dst.ptr);

1997

}

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

1998

#endif // defined(FIXED_POINT_POSITION)

1999

#endif // defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2000

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2001

#if defined(WIDTH_VECTOR_A)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2002

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

2003

*

2004

* @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A

2005

*

2006

* @attention The input A and matrix B must not be reshaped

2007

*

2008

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

2009

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

2010

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2011

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

2012

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2013

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2014

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2015

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

2016

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2017

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

2018

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2019

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

2020

* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

2021

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

2022

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2023

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2024

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

2025

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2026

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

2027

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2028

*/

2029

__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),

2030

TENSOR3D_DECLARATION(src1),

2031

IMAGE_DECLARATION(dst))

2032

{

2033

int idx = get_global_id(0) * 4;

2034

int idy = get_global_id(1);

2035

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2036

// Compute the address for the vector A and matrix B

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2037

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));

2038

src_addr.s1 += idx * sizeof(float);

2039

2040

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

float4 acc = 0.0f;

Georgios Pinitas

2017-10-20 18:52:20 +0100

[diff] [blame]

2044

for(; src_addr.s0 <= (end_row_vec_a - 2 * (int)sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2045

{

2046

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));

2047

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

2048

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

2049

2050

acc += b0 * (float4)a0.s0;

2051

acc += b1 * (float4)a0.s1;

2052

}

2053

2054

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

2055

{

2056

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

2057

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

2058

2059

acc += b0 * (float4)a0;

2060

}

2061

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

2062

// Compute destination address

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

2063

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

2064

2065

vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));

2066

}

Anton Lokhmotov