Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

2017-09-04 18:44:23 +0100

[diff] [blame]

287

*

288

* @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B

289

*

290

* @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8

291

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

292

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

293

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

294

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

295

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

296

* @param[in] src1_ptr Pointer to the source matrix. Supported formats: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

297

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

298

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

299

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

300

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

301

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

302

* @param[out] dst_ptr Pointer to the destination matrix Supported formats: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

303

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

304

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

305

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

306

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

307

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

308

* @param[in] a_offset Offset to be added to each element of the matrix A

309

* @param[in] b_offset Offset to be added to each element of the matrix B.

310

* @param[in] c_offset Offset to be added to each element of the matrix C.

311

* @param[in] c_mult_int Multiplied with each element of the matrix C.

312

* @param[in] shift Number of bits to shift right the result.

313

*/

314

__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),

315

IMAGE_DECLARATION(src1),

316

IMAGE_DECLARATION(dst),

int a_offset,

int b_offset,

int c_offset,

int c_mult_int,

int shift)

{

/* src_addr.s0 = address of matrix A */

324

/* src_addr.s1 = address of matrix B */

325

326

/* Compute address for matrix A and B */

327

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

328

(src1_stride_y));

329

330

/* Add offset_first_element_in_bytes */

331

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

332

333

/* Compute end row address for matrix B */

334

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

335

336

/* Reset accumulators */

int16 c00 = 0.0f;

int16 c10 = 0.0f;

int16 c20 = 0.0f;

int16 c30 = 0.0f;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))

343

{

344

/* Load values from matrix A (interleaved) and matrix B (transposed) */

345

int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));

346

int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

347

348

c00 += (int16)a0.s0 * b0;

349

c10 += (int16)a0.s1 * b0;

350

c20 += (int16)a0.s2 * b0;

351

c30 += (int16)a0.s3 * b0;

352

353

int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));

354

355

c00 += (int16)a0.s4 * b1;

356

c10 += (int16)a0.s5 * b1;

357

c20 += (int16)a0.s6 * b1;

358

c30 += (int16)a0.s7 * b1;

359

}

360

361

for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))

362

{

363

/* Load values from matrix A (interleaved) and matrix B (transposed) */

364

int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));

365

int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

366

367

c00 += (int16)a0.s0 * b0;

368

c10 += (int16)a0.s1 * b0;

369

c20 += (int16)a0.s2 * b0;

370

c30 += (int16)a0.s3 * b0;

371

}

372

373

/* Compute destination address */

374

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

375

376

/* Multiply by the weight of matrix product */

377

c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;

378

c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;

379

c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;

380

c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;

381

382

/* Store 4x16 block */

383

vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));

384

vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));

385

vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));

386

vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));

387

}

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

388

#endif /* WIDTH_MATRIX_B */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

389

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

390

#if defined(WIDTH_MATRIX_B) && defined(ALPHA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

391

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

392

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

393

*

394

* @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA

395

*

396

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

397

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

398

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

399

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

400

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

401

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

402

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

403

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

404

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

405

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

406

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

407

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

408

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

409

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

410

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

411

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

412

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

413

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

414

*/

415

__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),

416

IMAGE_DECLARATION(src1),

417

IMAGE_DECLARATION(dst))

418

{

419

/* src_addr.s0 = address of matrix A */

420

/* src_addr.s1 = address of matrix B */

421

422

/* Compute address for matrix A and B */

423

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

424

(src1_stride_y));

425

426

/* Add offset_first_element_in_bytes */

427

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

428

429

/* Divide by 4 in order to get the src_addr in unit of float */

430

src_addr = src_addr >> 2;

431

432

/* Compute end row address for matrix B */

433

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

434

435

/* Reset accumulators */

float4 c00 = 0.0f;

float4 c10 = 0.0f;

float4 c20 = 0.0f;

float4 c30 = 0.0f;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))

442

{

443

/* Load values from matrix A (interleaved) and matrix B (transposed) */

444

float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);

445

float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);

446

447

c00 += (float4)a0.s0 * b0;

448

c10 += (float4)a0.s1 * b0;

449

c20 += (float4)a0.s2 * b0;

450

c30 += (float4)a0.s3 * b0;

451

452

/* Load values from matrix A (interleaved) and matrix B (transposed) */

453

a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);

454

b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);

455

456

c00 += (float4)a0.s0 * b0;

457

c10 += (float4)a0.s1 * b0;

458

c20 += (float4)a0.s2 * b0;

459

c30 += (float4)a0.s3 * b0;

460

}

461

462

for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))

463

{

464

/* Load values from matrix A (interleaved) and matrix B (transposed) */

465

float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);

466

float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);

467

468

c00 += (float4)a0.s0 * b0;

469

c10 += (float4)a0.s1 * b0;

470

c20 += (float4)a0.s2 * b0;

471

c30 += (float4)a0.s3 * b0;

472

}

473

474

/* Compute destination address */

475

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

476

477

/* Multiply by the weight of matrix product */

478

c00 = c00 * (float4)ALPHA;

479

c10 = c10 * (float4)ALPHA;

480

c20 = c20 * (float4)ALPHA;

481

c30 = c30 * (float4)ALPHA;

482

483

/* Store 4x4 block */

484

vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));

485

vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));

486

vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));

487

vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));

488

}

489

490

/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

491

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_32bit and @ref gemm_transpose1x4 before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

492

*

493

* @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA

494

*

495

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

496

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

497

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

498

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

499

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

500

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

501

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

502

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

503

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

504

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

505

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

506

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

507

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

508

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

509

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

510

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

511

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

512

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

513

*/

514

__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),

515

IMAGE_DECLARATION(src1),

516

IMAGE_DECLARATION(dst))

517

{

518

// src_addr_a = address of matrix A

519

// src_addr_b = address of matrix B

520

__global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);

521

__global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);

522

523

// Compute end row address for matrix B

524

__global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;

525

526

// Reset accumulators

float c00 = 0.0f;

float c01 = 0.0f;

float c02 = 0.0f;

float c03 = 0.0f;

float c10 = 0.0f;

float c11 = 0.0f;

float c12 = 0.0f;

float c13 = 0.0f;

float c20 = 0.0f;

float c21 = 0.0f;

float c22 = 0.0f;

float c23 = 0.0f;

float c30 = 0.0f;

float c31 = 0.0f;

float c32 = 0.0f;

float c33 = 0.0f;

for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)

545

{

546

// Load values from matrix A (interleaved) and matrix B (transposed)

547

float4 a0 = vload4(0, src_addr_a);

548

float4 b0 = vload4(0, src_addr_b);

549

550

c00 = fma(a0.s0, b0.s0, c00);

551

c01 = fma(a0.s0, b0.s1, c01);

552

c02 = fma(a0.s0, b0.s2, c02);

553

c03 = fma(a0.s0, b0.s3, c03);

554

555

c10 = fma(a0.s1, b0.s0, c10);

556

c11 = fma(a0.s1, b0.s1, c11);

557

c12 = fma(a0.s1, b0.s2, c12);

558

c13 = fma(a0.s1, b0.s3, c13);

559

560

c20 = fma(a0.s2, b0.s0, c20);

561

c21 = fma(a0.s2, b0.s1, c21);

562

c22 = fma(a0.s2, b0.s2, c22);

563

c23 = fma(a0.s2, b0.s3, c23);

564

565

c30 = fma(a0.s3, b0.s0, c30);

566

c31 = fma(a0.s3, b0.s1, c31);

567

c32 = fma(a0.s3, b0.s2, c32);

568

c33 = fma(a0.s3, b0.s3, c33);

569

570

// Load values from matrix A (interleaved) and matrix B (transposed)

571

a0 = vload4(0, src_addr_a + 4);

572

b0 = vload4(0, src_addr_b + 4);

573

574

c00 = fma(a0.s0, b0.s0, c00);

575

c01 = fma(a0.s0, b0.s1, c01);

576

c02 = fma(a0.s0, b0.s2, c02);

577

c03 = fma(a0.s0, b0.s3, c03);

578

579

c10 = fma(a0.s1, b0.s0, c10);

580

c11 = fma(a0.s1, b0.s1, c11);

581

c12 = fma(a0.s1, b0.s2, c12);

582

c13 = fma(a0.s1, b0.s3, c13);

583

584

c20 = fma(a0.s2, b0.s0, c20);

585

c21 = fma(a0.s2, b0.s1, c21);

586

c22 = fma(a0.s2, b0.s2, c22);

587

c23 = fma(a0.s2, b0.s3, c23);

588

589

c30 = fma(a0.s3, b0.s0, c30);

590

c31 = fma(a0.s3, b0.s1, c31);

591

c32 = fma(a0.s3, b0.s2, c32);

592

c33 = fma(a0.s3, b0.s3, c33);

593

594

// Load values from matrix A (interleaved) and matrix B (transposed)

595

a0 = vload4(0, src_addr_a + 8);

596

b0 = vload4(0, src_addr_b + 8);

597

598

c00 = fma(a0.s0, b0.s0, c00);

599

c01 = fma(a0.s0, b0.s1, c01);

600

c02 = fma(a0.s0, b0.s2, c02);

601

c03 = fma(a0.s0, b0.s3, c03);

602

603

c10 = fma(a0.s1, b0.s0, c10);

604

c11 = fma(a0.s1, b0.s1, c11);

605

c12 = fma(a0.s1, b0.s2, c12);

606

c13 = fma(a0.s1, b0.s3, c13);

607

608

c20 = fma(a0.s2, b0.s0, c20);

609

c21 = fma(a0.s2, b0.s1, c21);

610

c22 = fma(a0.s2, b0.s2, c22);

611

c23 = fma(a0.s2, b0.s3, c23);

612

613

c30 = fma(a0.s3, b0.s0, c30);

614

c31 = fma(a0.s3, b0.s1, c31);

615

c32 = fma(a0.s3, b0.s2, c32);

616

c33 = fma(a0.s3, b0.s3, c33);

617

618

// Load values from matrix A (interleaved) and matrix B (transposed)

619

a0 = vload4(0, src_addr_a + 12);

620

b0 = vload4(0, src_addr_b + 12);

621

622

c00 = fma(a0.s0, b0.s0, c00);

623

c01 = fma(a0.s0, b0.s1, c01);

624

c02 = fma(a0.s0, b0.s2, c02);

625

c03 = fma(a0.s0, b0.s3, c03);

626

627

c10 = fma(a0.s1, b0.s0, c10);

628

c11 = fma(a0.s1, b0.s1, c11);

629

c12 = fma(a0.s1, b0.s2, c12);

630

c13 = fma(a0.s1, b0.s3, c13);

631

632

c20 = fma(a0.s2, b0.s0, c20);

633

c21 = fma(a0.s2, b0.s1, c21);

634

c22 = fma(a0.s2, b0.s2, c22);

635

c23 = fma(a0.s2, b0.s3, c23);

636

637

c30 = fma(a0.s3, b0.s0, c30);

638

c31 = fma(a0.s3, b0.s1, c31);

639

c32 = fma(a0.s3, b0.s2, c32);

640

c33 = fma(a0.s3, b0.s3, c33);

641

}

642

643

for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)

644

{

645

// Load values from matrix A (interleaved) and matrix B (transposed)

646

float4 a0 = vload4(0, src_addr_a);

647

float4 b0 = vload4(0, src_addr_b);

648

649

c00 = fma(a0.s0, b0.s0, c00);

650

c01 = fma(a0.s0, b0.s1, c01);

651

c02 = fma(a0.s0, b0.s2, c02);

652

c03 = fma(a0.s0, b0.s3, c03);

653

654

c10 = fma(a0.s1, b0.s0, c10);

655

c11 = fma(a0.s1, b0.s1, c11);

656

c12 = fma(a0.s1, b0.s2, c12);

657

c13 = fma(a0.s1, b0.s3, c13);

658

659

c20 = fma(a0.s2, b0.s0, c20);

660

c21 = fma(a0.s2, b0.s1, c21);

661

c22 = fma(a0.s2, b0.s2, c22);

662

c23 = fma(a0.s2, b0.s3, c23);

663

664

c30 = fma(a0.s3, b0.s0, c30);

665

c31 = fma(a0.s3, b0.s1, c31);

666

c32 = fma(a0.s3, b0.s2, c32);

667

c33 = fma(a0.s3, b0.s3, c33);

668

}

669

670

// Compute destination address

671

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

672

673

// Multiply by the weight of matrix product

c00 = c00 * ALPHA;

c01 = c01 * ALPHA;

c02 = c02 * ALPHA;

c03 = c03 * ALPHA;

c10 = c10 * ALPHA;

c11 = c11 * ALPHA;

c12 = c12 * ALPHA;

c13 = c13 * ALPHA;

c20 = c20 * ALPHA;

c21 = c21 * ALPHA;

c22 = c22 * ALPHA;

c23 = c23 * ALPHA;

c30 = c30 * ALPHA;

c31 = c31 * ALPHA;

c32 = c32 * ALPHA;

c33 = c33 * ALPHA;

barrier(CLK_GLOBAL_MEM_FENCE);

692

693

// Store 4x4 block

694

vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));

695

vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));

696

vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));

697

vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));

698

}

699

700

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

701

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

702

*

703

* @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA

704

*

705

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

706

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

707

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

708

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

709

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

710

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

711

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

712

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

713

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

714

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

715

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

716

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

717

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

718

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

719

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

720

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

721

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

722

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

723

*/

724

__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),

725

IMAGE_DECLARATION(src1),

726

IMAGE_DECLARATION(dst))

727

{

728

/* src_addr.s0 = address of matrix A */

729

/* src_addr.s1 = address of matrix B */

730

731

/* Compute address for matrix A and B */

732

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

733

(src1_stride_y));

734

735

/* Add offset_first_element_in_bytes */

736

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

737

738

/* Divide by 2 in order to get the src_addr in unit of half */

739

src_addr = src_addr >> 1;

740

741

/* Compute end row address for matrix B */

742

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

743

744

/* Reset accumulators */

half8 c00 = 0.0f;

half8 c10 = 0.0f;

half8 c20 = 0.0f;

half8 c30 = 0.0f;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))

751

{

752

/* Load values from matrix A (interleaved) and matrix B (transposed) */

753

half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);

754

half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);

755

756

c00 += (half8)a0.s0 * b0;

757

c10 += (half8)a0.s1 * b0;

758

c20 += (half8)a0.s2 * b0;

759

c30 += (half8)a0.s3 * b0;

760

761

/* Load values from matrix A (interleaved) and matrix B (transposed) */

762

a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);

763

b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);

764

765

c00 += (half8)a0.s0 * b0;

766

c10 += (half8)a0.s1 * b0;

767

c20 += (half8)a0.s2 * b0;

768

c30 += (half8)a0.s3 * b0;

769

}

770

771

for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))

772

{

773

/* Load values from matrix A (interleaved) and matrix B (transposed) */

774

half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);

775

half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);

776

777

c00 += (half8)a0.s0 * b0;

778

c10 += (half8)a0.s1 * b0;

779

c20 += (half8)a0.s2 * b0;

780

c30 += (half8)a0.s3 * b0;

781

}

782

783

/* Compute destination address */

784

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

785

786

/* Multiply by the weight of matrix product */

787

c00 = c00 * (half8)ALPHA;

788

c10 = c10 * (half8)ALPHA;

789

c20 = c20 * (half8)ALPHA;

790

c30 = c30 * (half8)ALPHA;

791

792

/* Store 4x8 block */

793

vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));

794

vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));

795

vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));

796

vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));

797

}

798

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

799

#ifdef FIXED_POINT_POSITION

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

800

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 8 bit fixed point precision

801

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_8bit and @ref gemm_transpose1x16 before running the matrix multiplication

802

*

803

* @attention The width of matrix B, the alpha's value and fixed point position need to be passed at compile time using -DWIDTH_MATRIX_B -DALPHA and -DFIXED_POINT_POSITION

804

*

805

* @note: ALPHA must be passed in 8 bit fixed point format

806

*

807

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8

808

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

809

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

810

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

811

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

812

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

813

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

814

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

815

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

816

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

817

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

818

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

819

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

820

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

821

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

822

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

823

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

824

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

825

*/

826

__kernel void gemm_mm_qs8(IMAGE_DECLARATION(src0),

827

IMAGE_DECLARATION(src1),

828

IMAGE_DECLARATION(dst))

829

{

830

/* src_addr.s0 = address of matrix A */

831

/* src_addr.s1 = address of matrix B */

832

833

/* Compute address for matrix A and B */

834

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

835

(src1_stride_y));

836

837

/* Add offset_first_element_in_bytes */

838

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

839

840

/* Compute end row address for matrix B */

841

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

842

843

/* Reset accumulators */

short8 c00 = 0.0f;

short8 c10 = 0.0f;

short8 c20 = 0.0f;

short8 c30 = 0.0f;

short8 c01 = 0.0f;

short8 c11 = 0.0f;

short8 c21 = 0.0f;

short8 c31 = 0.0f;

/* This for loop performs 1 accumulation for each iteration */

854

for(; src_addr.s1 <= (end_row_mtx_b - 16); src_addr += (int2)(4, 16))

855

{

856

/* Load values from matrix A (interleaved) and matrix B (transposed) */

857

char4 a0 = vload4(0, ((__global char *)src0_ptr) + src_addr.s0);

858

char16 b0 = vload16(0, ((__global char *)src1_ptr) + src_addr.s1);

859

860

c00 = mlal_sat_qs8x8(c00, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);

861

c10 = mlal_sat_qs8x8(c10, (char8)a0.s1, b0.s01234567, FIXED_POINT_POSITION);

862

c20 = mlal_sat_qs8x8(c20, (char8)a0.s2, b0.s01234567, FIXED_POINT_POSITION);

863

c30 = mlal_sat_qs8x8(c30, (char8)a0.s3, b0.s01234567, FIXED_POINT_POSITION);

864

865

c01 = mlal_sat_qs8x8(c01, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);

866

c11 = mlal_sat_qs8x8(c11, (char8)a0.s1, b0.s89ABCDEF, FIXED_POINT_POSITION);

867

c21 = mlal_sat_qs8x8(c21, (char8)a0.s2, b0.s89ABCDEF, FIXED_POINT_POSITION);

868

c31 = mlal_sat_qs8x8(c31, (char8)a0.s3, b0.s89ABCDEF, FIXED_POINT_POSITION);

869

}

870

871

/* Compute destination address */

872

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

873

874

/* Multiply by the weight of matrix product */

875

char16 c00_qs8 = convert_char16_sat((short16)(c00, c01));

876

char16 c10_qs8 = convert_char16_sat((short16)(c10, c11));

877

char16 c20_qs8 = convert_char16_sat((short16)(c20, c21));

878

char16 c30_qs8 = convert_char16_sat((short16)(c30, c31));

879

880

c00_qs8 = mul_sat_qs8x16(c00_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

881

c10_qs8 = mul_sat_qs8x16(c10_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

882

c20_qs8 = mul_sat_qs8x16(c20_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

883

c30_qs8 = mul_sat_qs8x16(c30_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

884

885

/* Store 16x4 block */

886

vstore16(c00_qs8, 0, (__global char *)(offset(&dst, 0, 0)));

887

vstore16(c10_qs8, 0, (__global char *)(offset(&dst, 0, 1)));

888

vstore16(c20_qs8, 0, (__global char *)(offset(&dst, 0, 2)));

889

vstore16(c30_qs8, 0, (__global char *)(offset(&dst, 0, 3)));

890

}

Gian Marco Iodice

8a38369

2017-07-03 17:41:47 +0100

[diff] [blame^]

891

892

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1) in 16 bit fixed point precision

893

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_16bit and @ref gemm_transpose1x8 before running the matrix multiplication

894

*

895

* @attention The width of matrix B, the alpha's value and fixed point position need to be passed at compile time using -DWIDTH_MATRIX_B -DALPHA and -DFIXED_POINT_POSITION

896

*

897

* @note: ALPHA must be passed in 16 bit fixed point format

898

*

899

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16

900

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

901

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

902

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

903

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

904

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

905

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

906

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

907

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

908

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

909

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

910

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

911

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

912

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

913

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

914

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

915

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

916

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

917

*/

918

__kernel void gemm_mm_qs16(IMAGE_DECLARATION(src0),

919

IMAGE_DECLARATION(src1),

920

IMAGE_DECLARATION(dst))

921

{

922

/* src_addr.s0 = address of matrix A */

923

/* src_addr.s1 = address of matrix B */

924

925

/* Compute address for matrix A and B */

926

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

927

(src1_stride_y));

928

929

/* Add offset_first_element_in_bytes */

930

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

931

932

/* Divide by 2 in order to get the src_addr in unit of short */

933

src_addr = src_addr >> 1;

934

935

/* Compute end row address for matrix B */

936

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

937

938

/* Reset accumulators */

int8 c00 = 0.0f;

int8 c10 = 0.0f;

int8 c20 = 0.0f;

int8 c30 = 0.0f;

/* This for loop performs 1 accumulation for each iteration */

945

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(4, 8))

946

{

947

/* Load values from matrix A (interleaved) and matrix B (transposed) */

948

short4 a0 = vload4(0, ((__global short *)src0_ptr) + src_addr.s0);

949

short8 b0 = vload8(0, ((__global short *)src1_ptr) + src_addr.s1);

950

951

c00 = mlal_sat_qs16x8(c00, (short8)a0.s0, b0, FIXED_POINT_POSITION);

952

c10 = mlal_sat_qs16x8(c10, (short8)a0.s1, b0, FIXED_POINT_POSITION);

953

c20 = mlal_sat_qs16x8(c20, (short8)a0.s2, b0, FIXED_POINT_POSITION);

954

c30 = mlal_sat_qs16x8(c30, (short8)a0.s3, b0, FIXED_POINT_POSITION);

955

}

956

957

/* Compute destination address */

958

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

959

960

/* Multiply by the weight of matrix product */

961

short8 c00_qs16 = convert_short8_sat(c00);

962

short8 c10_qs16 = convert_short8_sat(c10);

963

short8 c20_qs16 = convert_short8_sat(c20);

964

short8 c30_qs16 = convert_short8_sat(c30);

965

966

c00_qs16 = mul_sat_qs16x8(c00_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

967

c10_qs16 = mul_sat_qs16x8(c10_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

968

c20_qs16 = mul_sat_qs16x8(c20_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

969

c30_qs16 = mul_sat_qs16x8(c30_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

970

971

/* Store 8x4 block */

972

vstore8(c00_qs16, 0, (__global short *)(offset(&dst, 0, 0)));

973

vstore8(c10_qs16, 0, (__global short *)(offset(&dst, 0, 1)));

974

vstore8(c20_qs16, 0, (__global short *)(offset(&dst, 0, 2)));

975

vstore8(c30_qs16, 0, (__global short *)(offset(&dst, 0, 3)));

976

}

977

#endif // defined(FIXED_POINT_POSITION)

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

978

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

979

#ifdef WIDTH_VECTOR_A

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

980

/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)

981

*

982

* @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA

983

*

984

* @attention The input vector A and matrix B must not be reshaped

985

*

986

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

987

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

988

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

989

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

990

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

991

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

992

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

993

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

994

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

995

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

996

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

997

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

998

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

999

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1000

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1001

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1002

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1003

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1004

*/

1005

__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),

1006

IMAGE_DECLARATION(src1),

1007

IMAGE_DECLARATION(dst))

1008

{

1009

int idx = get_global_id(0) * 4;

1010

1011

/* Compute the address for the vector A and matrix B */

1012

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1013

src_addr.s1 += idx * sizeof(float);

1014

1015

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

float4 acc = 0.0f;

for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

1020

{

1021

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));

1022

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1023

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

1024

1025

acc += b0 * (float4)a0.s0;

1026

acc += b1 * (float4)a0.s1;

1027

}

1028

1029

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

1030

{

1031

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

1032

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1033

1034

acc += b0 * (float4)a0;

1035

}

1036

1037

/* Compute destination address */

1038

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1039

1040

/* Multiply by the weight of vector-matrix product */

1041

acc = acc * (float4)ALPHA;

1042

1043

vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));

1044

}

1045

1046

/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)

1047

*

1048

* @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA

1049

*

1050

* @attention The input vector A and matrix B must not be reshaped

1051

*

1052

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

1053

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1054

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1055

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1056

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1057

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1058

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1059

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1060

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1061

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1062

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1063

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1064

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1065

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1066

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1067

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1068

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1069

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1070

*/

1071

__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),

1072

IMAGE_DECLARATION(src1),

1073

IMAGE_DECLARATION(dst))

1074

{

1075

int idx = get_global_id(0) * 8;

1076

1077

/* Compute the address for the vector A and matrix B */

1078

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1079

src_addr.s1 += idx * sizeof(half);

1080

1081

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));

half8 acc = 0.0f;

for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))

1086

{

1087

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));

1088

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));

1089

half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

1090

half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));

1091

half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));

1092

1093

acc += b0 * (half8)a0.s0;

1094

acc += b1 * (half8)a0.s1;

1095

acc += b2 * (half8)a0.s2;

1096

acc += b3 * (half8)a0.s3;

1097

}

1098

1099

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))

1100

{

1101

half a0 = *((__global half *)(src0_ptr + src_addr.s0));

1102

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

1103

1104

acc += b0 * (half8)a0;

1105

}

1106

1107

/* Compute destination address */

1108

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1109

1110

/* Multiply by the weight of vector-matrix product */

1111

acc = acc * (half8)ALPHA;

1112

1113

vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));

1114

}

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1115

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

1116

#ifdef FIXED_POINT_POSITION

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1117

/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1) in 8 bit fixed point

1118

*

1119

* @attention The width of vector A, the width of matrix B, the alpha's value and the fixed point position need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B, -DALPHA and -DFIXED_POINT_POSITION

1120

*

1121

* @attention The input vector A and matrix B must not be reshaped

1122

*

1123

* @note: ALPHA must be passed in 8 bit fixed point format

1124

*

1125

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS8

1126

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1127

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1128

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1129

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1130

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1131

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1132

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1133

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1134

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1135

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1136

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1137

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1138

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1139

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1140

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1141

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1142

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1143

*/

1144

__kernel void gemm_vm_qs8(IMAGE_DECLARATION(src0),

1145

IMAGE_DECLARATION(src1),

1146

IMAGE_DECLARATION(dst))

1147

{

1148

int idx = get_global_id(0) * 16;

1149

1150

/* Compute the address for the vector A and matrix B */

1151

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1152

src_addr.s1 += idx;

1153

1154

int end_row_vec_a = src_addr.s0 + WIDTH_VECTOR_A;

short8 acc0 = 0;

short8 acc1 = 0;

/* This for loop performs 4 accumulations per iteration */

1160

for(; src_addr.s0 <= (end_row_vec_a - 4); src_addr += (int2)(4, 4 * src1_stride_y))

1161

{

1162

char4 a0 = vload4(0, (__global char *)(src0_ptr + src_addr.s0));

1163

char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));

1164

char16 b1 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

1165

char16 b2 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));

1166

char16 b3 = vload16(0, (__global char *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));

1167

1168

acc0 = mlal_sat_qs8x8(acc0, (char8)a0.s0, b0.s01234567, FIXED_POINT_POSITION);

1169

acc0 = mlal_sat_qs8x8(acc0, (char8)a0.s1, b1.s01234567, FIXED_POINT_POSITION);

1170

acc0 = mlal_sat_qs8x8(acc0, (char8)a0.s2, b2.s01234567, FIXED_POINT_POSITION);

1171

acc0 = mlal_sat_qs8x8(acc0, (char8)a0.s3, b3.s01234567, FIXED_POINT_POSITION);

1172

1173

acc1 = mlal_sat_qs8x8(acc1, (char8)a0.s0, b0.s89ABCDEF, FIXED_POINT_POSITION);

1174

acc1 = mlal_sat_qs8x8(acc1, (char8)a0.s1, b1.s89ABCDEF, FIXED_POINT_POSITION);

1175

acc1 = mlal_sat_qs8x8(acc1, (char8)a0.s2, b2.s89ABCDEF, FIXED_POINT_POSITION);

1176

acc1 = mlal_sat_qs8x8(acc1, (char8)a0.s3, b3.s89ABCDEF, FIXED_POINT_POSITION);

1177

}

1178

1179

/* Left-over accumulations */

1180

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(1, src1_stride_y))

1181

{

1182

char a0 = *((__global char *)(src0_ptr + src_addr.s0));

1183

char16 b0 = vload16(0, (__global char *)(src1_ptr + src_addr.s1));

1184

1185

acc0 = mlal_sat_qs8x8(acc0, (char8)a0, b0.s01234567, FIXED_POINT_POSITION);

1186

acc1 = mlal_sat_qs8x8(acc1, (char8)a0, b0.s89ABCDEF, FIXED_POINT_POSITION);

1187

}

1188

1189

/* Compute destination address */

1190

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1191

1192

/* Multiply by the weight of matrix product */

1193

char16 acc_qs8 = convert_char16_sat((short16)(acc0, acc1));

1194

1195

acc_qs8 = mul_sat_qs8x16(acc_qs8, (char16)ALPHA, FIXED_POINT_POSITION);

1196

1197

/* Store 16 values */

1198

vstore16(acc_qs8, 0, (__global char *)(offset(&dst, 0, 0)));

1199

}

Gian Marco Iodice

8a38369

2017-07-03 17:41:47 +0100

[diff] [blame^]

1200

1201

/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1) in 16 bit fixed point

1202

*

1203

* @attention The width of vector A, the width of matrix B, the alpha's value and the fixed point position need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B, -DALPHA and -DFIXED_POINT_POSITION

1204

*

1205

* @attention The input vector A and matrix B must not be reshaped

1206

*

1207

* @note: ALPHA must be passed in 16 bit fixed point format

1208

*

1209

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: QS16

1210

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1211

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1212

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1213

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1214

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1215

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

1216

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1217

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1218

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1219

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1220

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1221

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

1222

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1223

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1224

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1225

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1226

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1227

*/

1228

__kernel void gemm_vm_qs16(IMAGE_DECLARATION(src0),

1229

IMAGE_DECLARATION(src1),

1230

IMAGE_DECLARATION(dst))

1231

{

1232

int idx = get_global_id(0) * 8;

1233

1234

/* Compute the address for the vector A and matrix B */

1235

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

1236

src_addr.s1 += idx * sizeof(short);

1237

1238

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(short));

1239

1240

/* Reset accumulator */

1241

int8 acc0 = 0;

1242

1243

/* This for loop performs 4 accumulations per iteration */

1244

for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(short)); src_addr += (int2)(4 * sizeof(short), 4 * src1_stride_y))

1245

{

1246

short4 a0 = vload4(0, (__global short *)(src0_ptr + src_addr.s0));

1247

short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));

1248

short8 b1 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

1249

short8 b2 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));

1250

short8 b3 = vload8(0, (__global short *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));

1251

1252

acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s0, b0, FIXED_POINT_POSITION);

1253

acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s1, b1, FIXED_POINT_POSITION);

1254

acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s2, b2, FIXED_POINT_POSITION);

1255

acc0 = mlal_sat_qs16x8(acc0, (short8)a0.s3, b3, FIXED_POINT_POSITION);

1256

}

1257

1258

/* Left-over accumulations */

1259

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(short), src1_stride_y))

1260

{

1261

short a0 = *((__global short *)(src0_ptr + src_addr.s0));

1262

short8 b0 = vload8(0, (__global short *)(src1_ptr + src_addr.s1));

1263

1264

acc0 = mlal_sat_qs16x8(acc0, (short8)a0, b0, FIXED_POINT_POSITION);

1265

}

1266

1267

/* Compute destination address */

1268

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1269

1270

/* Multiply by the weight of matrix product */

1271

short8 acc_qs16 = convert_short8_sat(acc0);

1272

1273

acc_qs16 = mul_sat_qs16x8(acc_qs16, (short8)ALPHA, FIXED_POINT_POSITION);

1274

1275

/* Store 8 values */

1276

vstore8(acc_qs16, 0, (__global short *)(offset(&dst, 0, 0)));

1277

}

1278

#endif /* defined(FIXED_POINT_POSITION) */

1279

#endif /* defined(WIDTH_VECTOR_A) */

1280

#endif /* defined(WIDTH_MATRIX_B) && defined(ALPHA) */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1281

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

1282

#ifdef BETA

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1283

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

1284

*

1285

* @attention The beta's value need to be passed at compile time using -DBETA

1286

*

1287

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

1288

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

1289

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1290

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

1291

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1292

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1293

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1294

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1295

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1296

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1297

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1298

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1299

*/

1300

__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),

1301

IMAGE_DECLARATION(dst))

1302

{

1303

/* Compute source and destination addresses */

1304

Image src = CONVERT_TO_IMAGE_STRUCT(src);

1305

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1306

1307

/* Load values from A x B */

1308

float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

1309

1310

/* Load values from Matrix C */

1311

float4 c = vload4(0, (__global float *)src.ptr);

1312

1313

/* Computes alpha * axb + beta * c */

1314

float4 out = alpha_ab + (float4)BETA * c;

1315

1316

/* Store final result in axb matrix */

1317

vstore4(out, 0, (__global float *)dst.ptr);

1318

}

1319

1320

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

1321

*

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1322

* @attention The beta's value need to be passed at compile time using -DBETA

1323

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1324

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

1325

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

1326

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1327

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

1328

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1329

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1330

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1331

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1332

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1333

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1334

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1335

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1336

*/

1337

__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),

1338

IMAGE_DECLARATION(dst))

1339

{

1340

/* Compute source and destination addresses */

1341

Image src = CONVERT_TO_IMAGE_STRUCT(src);

1342

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1343

1344

/* Load values from A x B */

1345

half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

1346

1347

/* Load values from Matrix C */

1348

half8 c = vload8(0, (__global half *)src.ptr);

1349

1350

/* Computes alpha * axb + beta * c */

1351

half8 out = alpha_ab + (half8)BETA * c;

1352

1353

/* Store final result in axb matrix */

1354

vstore8(out, 0, (__global half *)dst.ptr);

1355

}

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1356

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

1357

#ifdef FIXED_POINT_POSITION

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1358

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 8 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:

1359

*

1360

* @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION

1361

*

1362

* @note: BETA must be passed in 8 bit fixed point format

1363

*

1364

* @param[in] src_ptr Pointer to the source matrix. Supported data types: QS8

1365

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

1366

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1367

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

1368

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1369

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

1370

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

1371

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1372

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1373

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1374

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1375

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1376

*/

1377

__kernel void gemm_ma_qs8(IMAGE_DECLARATION(src),

1378

IMAGE_DECLARATION(dst))

1379

{

1380

/* Compute source and destination addresses */

1381

Image src = CONVERT_TO_IMAGE_STRUCT(src);

1382

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1383

1384

/* Load values from A x B */

1385

char16 alpha_ab = vload16(0, (__global char *)dst.ptr);

1386

1387

/* Load values from Matrix C */

1388

char16 c = vload16(0, (__global char *)src.ptr);

1389

1390

/* Computes alpha * axb + beta * c */

1391

char16 out = mla_sat_qs8x16(alpha_ab, (char16)BETA, c, FIXED_POINT_POSITION);

1392

1393

/* Store final result in axb matrix */

1394

vstore16(out, 0, (__global char *)dst.ptr);

1395

}

Gian Marco Iodice

8a38369

2017-07-03 17:41:47 +0100

[diff] [blame^]

1396

1397

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices in 16 bit fixed point taking into account that the second matrix might be weighted by a scalar value beta:

1398

*

1399

* @attention The beta's value and the fixed point position need to be passed at compile time using -DBETA and -DFIXED_POINT_POSITION

1400

*

1401

* @note: BETA must be passed in 16 bit fixed point format

1402

*

1403

* @param[in] src_ptr Pointer to the source matrix. Supported data types: QS16

1404

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

1405

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1406

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

1407

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1408

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

1409

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src_ptr

1410

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1411

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1412

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1413

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1414

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1415

*/

1416

__kernel void gemm_ma_qs16(IMAGE_DECLARATION(src),

1417

IMAGE_DECLARATION(dst))

1418

{

1419

/* Compute source and destination addresses */

1420

Image src = CONVERT_TO_IMAGE_STRUCT(src);

1421

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1422

1423

/* Load values from A x B */

1424

short8 alpha_ab = vload8(0, (__global short *)dst.ptr);

1425

1426

/* Load values from Matrix C */

1427

short8 c = vload8(0, (__global short *)src.ptr);

1428

1429

/* Computes alpha * axb + beta * c */

1430

short8 out = mla_sat_qs16x8(alpha_ab, (short8)BETA, c, FIXED_POINT_POSITION);

1431

1432

/* Store final result in axb matrix */

1433

vstore8(out, 0, (__global short *)dst.ptr);

1434

}

1435

#endif /* defined(FIXED_POINT_POSITION) */

1436

#endif /* defined(BETA) */

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1437

Anthony Barbier

2017-07-03 17:39:37 +0100

[diff] [blame]

1438

#ifdef WIDTH_VECTOR_A

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1439

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

1440

*

1441

* @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A

1442

*

1443

* @attention The input A and matrix B must not be reshaped

1444

*

1445

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

1446

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1447

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1448

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1449

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1450

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1451

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1452

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1453

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1454

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1455

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1456

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1457

* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1458

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

2017-06-23 13:38:14 +0100

[diff] [blame]

1459

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: same as @p src0_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1460

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1461

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1462

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1463

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1464

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1465

*/

1466

__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),

1467

TENSOR3D_DECLARATION(src1),

1468

IMAGE_DECLARATION(dst))

1469

{

1470

int idx = get_global_id(0) * 4;

1471

int idy = get_global_id(1);

1472

1473

/* Compute the address for the vector A and matrix B */

1474

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));

1475

src_addr.s1 += idx * sizeof(float);

1476

1477

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

float4 acc = 0.0f;

for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

1482

{

1483

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));

1484

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1485

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

1486

1487

acc += b0 * (float4)a0.s0;

1488

acc += b1 * (float4)a0.s1;

1489

}

1490

1491

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

1492

{

1493

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

1494

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1495

1496

acc += b0 * (float4)a0;

1497

}

1498

1499

/* Compute destination address */

1500

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1501

1502

vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));

1503

}

Anthony Barbier