Blame - src/core/CL/cl_kernels/gemm.cl - ml/ComputeLibrary

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_u8 and @ref gemm_transpose1x16_u8 before running the matrix multiplication

286

*

287

* @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B

288

*

289

* @param[in] src0_ptr Pointer to the source matrix. Supported formats: U8

290

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

291

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

292

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

293

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

294

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

295

* @param[in] src1_ptr Pointer to the source matrix. Supported formats: U8

296

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

297

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

298

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

299

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

300

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

301

* @param[out] dst_ptr Pointer to the destination matrix Supported formats: U8

302

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

303

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

304

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

305

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

306

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

307

* @param[in] a_offset Offset to be added to each element of the matrix A

308

* @param[in] b_offset Offset to be added to each element of the matrix B.

309

* @param[in] c_offset Offset to be added to each element of the matrix C.

310

* @param[in] c_mult_int Multiplied with each element of the matrix C.

311

* @param[in] shift Number of bits to shift right the result.

312

*/

313

__kernel void gemm_mm_u8(IMAGE_DECLARATION(src0),

314

IMAGE_DECLARATION(src1),

315

IMAGE_DECLARATION(dst),

int a_offset,

int b_offset,

int c_offset,

int c_mult_int,

int shift)

{

/* src_addr.s0 = address of matrix A */

323

/* src_addr.s1 = address of matrix B */

324

325

/* Compute address for matrix A and B */

326

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

327

(src1_stride_y));

328

329

/* Add offset_first_element_in_bytes */

330

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

331

332

/* Compute end row address for matrix B */

333

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

334

335

/* Reset accumulators */

int16 c00 = 0.0f;

int16 c10 = 0.0f;

int16 c20 = 0.0f;

int16 c30 = 0.0f;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 32))

342

{

343

/* Load values from matrix A (interleaved) and matrix B (transposed) */

344

int8 a0 = (int8)a_offset + convert_int8(vload8(0, ((__global uchar *)src0_ptr) + src_addr.s0));

345

int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

346

347

c00 += (int16)a0.s0 * b0;

348

c10 += (int16)a0.s1 * b0;

349

c20 += (int16)a0.s2 * b0;

350

c30 += (int16)a0.s3 * b0;

351

352

int16 b1 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1 + 16));

353

354

c00 += (int16)a0.s4 * b1;

355

c10 += (int16)a0.s5 * b1;

356

c20 += (int16)a0.s6 * b1;

357

c30 += (int16)a0.s7 * b1;

358

}

359

360

for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 16))

361

{

362

/* Load values from matrix A (interleaved) and matrix B (transposed) */

363

int4 a0 = (int4)a_offset + convert_int4(vload4(0, ((__global uchar *)src0_ptr) + src_addr.s0));

364

int16 b0 = (int16)b_offset + convert_int16(vload16(0, ((__global uchar *)src1_ptr) + src_addr.s1));

365

366

c00 += (int16)a0.s0 * b0;

367

c10 += (int16)a0.s1 * b0;

368

c20 += (int16)a0.s2 * b0;

369

c30 += (int16)a0.s3 * b0;

370

}

371

372

/* Compute destination address */

373

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

374

375

/* Multiply by the weight of matrix product */

376

c00 = (((int16)c_offset + c00) * (int16)c_mult_int) >> shift;

377

c10 = (((int16)c_offset + c10) * (int16)c_mult_int) >> shift;

378

c20 = (((int16)c_offset + c20) * (int16)c_mult_int) >> shift;

379

c30 = (((int16)c_offset + c30) * (int16)c_mult_int) >> shift;

380

381

/* Store 4x16 block */

382

vstore16(convert_uchar16_sat(c00), 0, (__global uchar *)(offset(&dst, 0, 0)));

383

vstore16(convert_uchar16_sat(c10), 0, (__global uchar *)(offset(&dst, 0, 1)));

384

vstore16(convert_uchar16_sat(c20), 0, (__global uchar *)(offset(&dst, 0, 2)));

385

vstore16(convert_uchar16_sat(c30), 0, (__global uchar *)(offset(&dst, 0, 3)));

}

#endif

#if(defined WIDTH_MATRIX_B && defined ALPHA)

390

/** This OpenCL kernel is optimised for Midgard. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

391

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication

392

*

393

* @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA

394

*

395

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

396

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

397

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

398

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

399

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

400

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

401

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32

402

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

403

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

404

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

405

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

406

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

407

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32

408

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

409

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

410

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

411

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

412

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

413

*/

414

__kernel void gemm_mm_f32_midgard(IMAGE_DECLARATION(src0),

415

IMAGE_DECLARATION(src1),

416

IMAGE_DECLARATION(dst))

417

{

418

/* src_addr.s0 = address of matrix A */

419

/* src_addr.s1 = address of matrix B */

420

421

/* Compute address for matrix A and B */

422

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

423

(src1_stride_y));

424

425

/* Add offset_first_element_in_bytes */

426

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

427

428

/* Divide by 4 in order to get the src_addr in unit of float */

429

src_addr = src_addr >> 2;

430

431

/* Compute end row address for matrix B */

432

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

433

434

/* Reset accumulators */

float4 c00 = 0.0f;

float4 c10 = 0.0f;

float4 c20 = 0.0f;

float4 c30 = 0.0f;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 8))

441

{

442

/* Load values from matrix A (interleaved) and matrix B (transposed) */

443

float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);

444

float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);

445

446

c00 += (float4)a0.s0 * b0;

447

c10 += (float4)a0.s1 * b0;

448

c20 += (float4)a0.s2 * b0;

449

c30 += (float4)a0.s3 * b0;

450

451

/* Load values from matrix A (interleaved) and matrix B (transposed) */

452

a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0 + 4);

453

b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1 + 4);

454

455

c00 += (float4)a0.s0 * b0;

456

c10 += (float4)a0.s1 * b0;

457

c20 += (float4)a0.s2 * b0;

458

c30 += (float4)a0.s3 * b0;

459

}

460

461

for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 4))

462

{

463

/* Load values from matrix A (interleaved) and matrix B (transposed) */

464

float4 a0 = vload4(0, ((__global float *)src0_ptr) + src_addr.s0);

465

float4 b0 = vload4(0, ((__global float *)src1_ptr) + src_addr.s1);

466

467

c00 += (float4)a0.s0 * b0;

468

c10 += (float4)a0.s1 * b0;

469

c20 += (float4)a0.s2 * b0;

470

c30 += (float4)a0.s3 * b0;

471

}

472

473

/* Compute destination address */

474

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

475

476

/* Multiply by the weight of matrix product */

477

c00 = c00 * (float4)ALPHA;

478

c10 = c10 * (float4)ALPHA;

479

c20 = c20 * (float4)ALPHA;

480

c30 = c30 * (float4)ALPHA;

481

482

/* Store 4x4 block */

483

vstore4(c00, 0, (__global float *)(offset(&dst, 0, 0)));

484

vstore4(c10, 0, (__global float *)(offset(&dst, 0, 1)));

485

vstore4(c20, 0, (__global float *)(offset(&dst, 0, 2)));

486

vstore4(c30, 0, (__global float *)(offset(&dst, 0, 3)));

487

}

488

489

/** This OpenCL kernel is optimised for Bifrost. It computes the matrix multiplication between matrix A (src0) and matrix B (src1)

490

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f32 and @ref gemm_transpose1x4_f32 before running the matrix multiplication

491

*

492

* @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA

493

*

494

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

495

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

496

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

497

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

498

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

499

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

500

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32

501

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

502

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

503

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

504

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

505

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

506

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32

507

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

508

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

509

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

510

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

511

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

512

*/

513

__kernel void gemm_mm_f32_bifrost(IMAGE_DECLARATION(src0),

514

IMAGE_DECLARATION(src1),

515

IMAGE_DECLARATION(dst))

516

{

517

// src_addr_a = address of matrix A

518

// src_addr_b = address of matrix B

519

__global float *src_addr_a = (__global float *)(src0_ptr + get_global_id(1) * src0_stride_y + src0_offset_first_element_in_bytes);

520

__global float *src_addr_b = (__global float *)(src1_ptr + get_global_id(0) * src1_stride_y + src1_offset_first_element_in_bytes);

521

522

// Compute end row address for matrix B

523

__global float *src_end_addr_b = src_addr_b + WIDTH_MATRIX_B;

524

525

// Reset accumulators

float c00 = 0.0f;

float c01 = 0.0f;

float c02 = 0.0f;

float c03 = 0.0f;

float c10 = 0.0f;

float c11 = 0.0f;

float c12 = 0.0f;

float c13 = 0.0f;

float c20 = 0.0f;

float c21 = 0.0f;

float c22 = 0.0f;

float c23 = 0.0f;

float c30 = 0.0f;

float c31 = 0.0f;

float c32 = 0.0f;

float c33 = 0.0f;

for(; src_addr_b <= (src_end_addr_b - 16); src_addr_a += 16, src_addr_b += 16)

544

{

545

// Load values from matrix A (interleaved) and matrix B (transposed)

546

float4 a0 = vload4(0, src_addr_a);

547

float4 b0 = vload4(0, src_addr_b);

548

549

c00 = fma(a0.s0, b0.s0, c00);

550

c01 = fma(a0.s0, b0.s1, c01);

551

c02 = fma(a0.s0, b0.s2, c02);

552

c03 = fma(a0.s0, b0.s3, c03);

553

554

c10 = fma(a0.s1, b0.s0, c10);

555

c11 = fma(a0.s1, b0.s1, c11);

556

c12 = fma(a0.s1, b0.s2, c12);

557

c13 = fma(a0.s1, b0.s3, c13);

558

559

c20 = fma(a0.s2, b0.s0, c20);

560

c21 = fma(a0.s2, b0.s1, c21);

561

c22 = fma(a0.s2, b0.s2, c22);

562

c23 = fma(a0.s2, b0.s3, c23);

563

564

c30 = fma(a0.s3, b0.s0, c30);

565

c31 = fma(a0.s3, b0.s1, c31);

566

c32 = fma(a0.s3, b0.s2, c32);

567

c33 = fma(a0.s3, b0.s3, c33);

568

569

// Load values from matrix A (interleaved) and matrix B (transposed)

570

a0 = vload4(0, src_addr_a + 4);

571

b0 = vload4(0, src_addr_b + 4);

572

573

c00 = fma(a0.s0, b0.s0, c00);

574

c01 = fma(a0.s0, b0.s1, c01);

575

c02 = fma(a0.s0, b0.s2, c02);

576

c03 = fma(a0.s0, b0.s3, c03);

577

578

c10 = fma(a0.s1, b0.s0, c10);

579

c11 = fma(a0.s1, b0.s1, c11);

580

c12 = fma(a0.s1, b0.s2, c12);

581

c13 = fma(a0.s1, b0.s3, c13);

582

583

c20 = fma(a0.s2, b0.s0, c20);

584

c21 = fma(a0.s2, b0.s1, c21);

585

c22 = fma(a0.s2, b0.s2, c22);

586

c23 = fma(a0.s2, b0.s3, c23);

587

588

c30 = fma(a0.s3, b0.s0, c30);

589

c31 = fma(a0.s3, b0.s1, c31);

590

c32 = fma(a0.s3, b0.s2, c32);

591

c33 = fma(a0.s3, b0.s3, c33);

592

593

// Load values from matrix A (interleaved) and matrix B (transposed)

594

a0 = vload4(0, src_addr_a + 8);

595

b0 = vload4(0, src_addr_b + 8);

596

597

c00 = fma(a0.s0, b0.s0, c00);

598

c01 = fma(a0.s0, b0.s1, c01);

599

c02 = fma(a0.s0, b0.s2, c02);

600

c03 = fma(a0.s0, b0.s3, c03);

601

602

c10 = fma(a0.s1, b0.s0, c10);

603

c11 = fma(a0.s1, b0.s1, c11);

604

c12 = fma(a0.s1, b0.s2, c12);

605

c13 = fma(a0.s1, b0.s3, c13);

606

607

c20 = fma(a0.s2, b0.s0, c20);

608

c21 = fma(a0.s2, b0.s1, c21);

609

c22 = fma(a0.s2, b0.s2, c22);

610

c23 = fma(a0.s2, b0.s3, c23);

611

612

c30 = fma(a0.s3, b0.s0, c30);

613

c31 = fma(a0.s3, b0.s1, c31);

614

c32 = fma(a0.s3, b0.s2, c32);

615

c33 = fma(a0.s3, b0.s3, c33);

616

617

// Load values from matrix A (interleaved) and matrix B (transposed)

618

a0 = vload4(0, src_addr_a + 12);

619

b0 = vload4(0, src_addr_b + 12);

620

621

c00 = fma(a0.s0, b0.s0, c00);

622

c01 = fma(a0.s0, b0.s1, c01);

623

c02 = fma(a0.s0, b0.s2, c02);

624

c03 = fma(a0.s0, b0.s3, c03);

625

626

c10 = fma(a0.s1, b0.s0, c10);

627

c11 = fma(a0.s1, b0.s1, c11);

628

c12 = fma(a0.s1, b0.s2, c12);

629

c13 = fma(a0.s1, b0.s3, c13);

630

631

c20 = fma(a0.s2, b0.s0, c20);

632

c21 = fma(a0.s2, b0.s1, c21);

633

c22 = fma(a0.s2, b0.s2, c22);

634

c23 = fma(a0.s2, b0.s3, c23);

635

636

c30 = fma(a0.s3, b0.s0, c30);

637

c31 = fma(a0.s3, b0.s1, c31);

638

c32 = fma(a0.s3, b0.s2, c32);

639

c33 = fma(a0.s3, b0.s3, c33);

640

}

641

642

for(; src_addr_b < src_end_addr_b; src_addr_a += 4, src_addr_b += 4)

643

{

644

// Load values from matrix A (interleaved) and matrix B (transposed)

645

float4 a0 = vload4(0, src_addr_a);

646

float4 b0 = vload4(0, src_addr_b);

647

648

c00 = fma(a0.s0, b0.s0, c00);

649

c01 = fma(a0.s0, b0.s1, c01);

650

c02 = fma(a0.s0, b0.s2, c02);

651

c03 = fma(a0.s0, b0.s3, c03);

652

653

c10 = fma(a0.s1, b0.s0, c10);

654

c11 = fma(a0.s1, b0.s1, c11);

655

c12 = fma(a0.s1, b0.s2, c12);

656

c13 = fma(a0.s1, b0.s3, c13);

657

658

c20 = fma(a0.s2, b0.s0, c20);

659

c21 = fma(a0.s2, b0.s1, c21);

660

c22 = fma(a0.s2, b0.s2, c22);

661

c23 = fma(a0.s2, b0.s3, c23);

662

663

c30 = fma(a0.s3, b0.s0, c30);

664

c31 = fma(a0.s3, b0.s1, c31);

665

c32 = fma(a0.s3, b0.s2, c32);

666

c33 = fma(a0.s3, b0.s3, c33);

667

}

668

669

// Compute destination address

670

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

671

672

// Multiply by the weight of matrix product

c00 = c00 * ALPHA;

c01 = c01 * ALPHA;

c02 = c02 * ALPHA;

c03 = c03 * ALPHA;

c10 = c10 * ALPHA;

c11 = c11 * ALPHA;

c12 = c12 * ALPHA;

c13 = c13 * ALPHA;

c20 = c20 * ALPHA;

c21 = c21 * ALPHA;

c22 = c22 * ALPHA;

c23 = c23 * ALPHA;

c30 = c30 * ALPHA;

c31 = c31 * ALPHA;

c32 = c32 * ALPHA;

c33 = c33 * ALPHA;

barrier(CLK_GLOBAL_MEM_FENCE);

691

692

// Store 4x4 block

693

vstore4((float4)(c00, c01, c02, c03), 0, (__global float *)(offset(&dst, 0, 0)));

694

vstore4((float4)(c10, c11, c12, c13), 0, (__global float *)(offset(&dst, 0, 1)));

695

vstore4((float4)(c20, c21, c22, c23), 0, (__global float *)(offset(&dst, 0, 2)));

696

vstore4((float4)(c30, c31, c32, c33), 0, (__global float *)(offset(&dst, 0, 3)));

697

}

698

699

/** This OpenCL kernel computes the matrix multiplication between matrix A (src0) and matrix B (src1)

700

* Matrix A and matrix B must be reshaped respectively with @ref gemm_interleave4x4_f16 and @ref gemm_transpose1x8_f16 before running the matrix multiplication

701

*

702

* @attention The width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_MATRIX_B and -DALPHA

703

*

704

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

705

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

706

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

707

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

708

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

709

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

710

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16

711

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

712

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

713

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

714

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

715

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

716

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16

717

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

718

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

719

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

720

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

721

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

722

*/

723

__kernel void gemm_mm_f16(IMAGE_DECLARATION(src0),

724

IMAGE_DECLARATION(src1),

725

IMAGE_DECLARATION(dst))

726

{

727

/* src_addr.s0 = address of matrix A */

728

/* src_addr.s1 = address of matrix B */

729

730

/* Compute address for matrix A and B */

731

int2 src_addr = (int2)(get_global_id(1), get_global_id(0)) * (int2)((src0_stride_y),

732

(src1_stride_y));

733

734

/* Add offset_first_element_in_bytes */

735

src_addr = src_addr + ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

736

737

/* Divide by 2 in order to get the src_addr in unit of half */

738

src_addr = src_addr >> 1;

739

740

/* Compute end row address for matrix B */

741

int end_row_mtx_b = src_addr.s1 + WIDTH_MATRIX_B;

742

743

/* Reset accumulators */

half8 c00 = 0.0f;

half8 c10 = 0.0f;

half8 c20 = 0.0f;

half8 c30 = 0.0f;

for(; src_addr.s1 <= (end_row_mtx_b - 8); src_addr += (int2)(8, 16))

750

{

751

/* Load values from matrix A (interleaved) and matrix B (transposed) */

752

half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);

753

half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);

754

755

c00 += (half8)a0.s0 * b0;

756

c10 += (half8)a0.s1 * b0;

757

c20 += (half8)a0.s2 * b0;

758

c30 += (half8)a0.s3 * b0;

759

760

/* Load values from matrix A (interleaved) and matrix B (transposed) */

761

a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0 + 4);

762

b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1 + 8);

763

764

c00 += (half8)a0.s0 * b0;

765

c10 += (half8)a0.s1 * b0;

766

c20 += (half8)a0.s2 * b0;

767

c30 += (half8)a0.s3 * b0;

768

}

769

770

for(; src_addr.s1 < end_row_mtx_b; src_addr += (int2)(4, 8))

771

{

772

/* Load values from matrix A (interleaved) and matrix B (transposed) */

773

half4 a0 = vload4(0, ((__global half *)src0_ptr) + src_addr.s0);

774

half8 b0 = vload8(0, ((__global half *)src1_ptr) + src_addr.s1);

775

776

c00 += (half8)a0.s0 * b0;

777

c10 += (half8)a0.s1 * b0;

778

c20 += (half8)a0.s2 * b0;

779

c30 += (half8)a0.s3 * b0;

780

}

781

782

/* Compute destination address */

783

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

784

785

/* Multiply by the weight of matrix product */

786

c00 = c00 * (half8)ALPHA;

787

c10 = c10 * (half8)ALPHA;

788

c20 = c20 * (half8)ALPHA;

789

c30 = c30 * (half8)ALPHA;

790

791

/* Store 4x8 block */

792

vstore8(c00, 0, (__global half *)(offset(&dst, 0, 0)));

793

vstore8(c10, 0, (__global half *)(offset(&dst, 0, 1)));

794

vstore8(c20, 0, (__global half *)(offset(&dst, 0, 2)));

795

vstore8(c30, 0, (__global half *)(offset(&dst, 0, 3)));

796

}

797

798

#if(defined WIDTH_VECTOR_A)

799

/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)

800

*

801

* @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA

802

*

803

* @attention The input vector A and matrix B must not be reshaped

804

*

805

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

806

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

807

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

808

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

809

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

810

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

811

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32

812

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

813

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

814

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

815

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

816

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

817

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32

818

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

819

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

820

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

821

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

822

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

823

*/

824

__kernel void gemm_vm_f32(IMAGE_DECLARATION(src0),

825

IMAGE_DECLARATION(src1),

826

IMAGE_DECLARATION(dst))

827

{

828

int idx = get_global_id(0) * 4;

829

830

/* Compute the address for the vector A and matrix B */

831

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

832

src_addr.s1 += idx * sizeof(float);

833

834

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

float4 acc = 0.0f;

for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

839

{

840

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));

841

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

842

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

843

844

acc += b0 * (float4)a0.s0;

845

acc += b1 * (float4)a0.s1;

846

}

847

848

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

849

{

850

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

851

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

852

853

acc += b0 * (float4)a0;

854

}

855

856

/* Compute destination address */

857

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

858

859

/* Multiply by the weight of vector-matrix product */

860

acc = acc * (float4)ALPHA;

861

862

vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));

863

}

864

865

/** This OpenCL kernel computes the vector by matrix multiplication between the vector A (src0) and matrix B (src1)

866

*

867

* @attention The width of vector A, the width of matrix B and the alpha's value need to be passed at compile time using -DWIDTH_VECTOR_A -DWIDTH_MATRIX_B and -DALPHA

868

*

869

* @attention The input vector A and matrix B must not be reshaped

870

*

871

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F16

872

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

873

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

874

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

875

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

876

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

877

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: F16

878

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

879

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

880

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

881

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

882

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

883

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16

884

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

885

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

886

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

887

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

888

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

889

*/

890

__kernel void gemm_vm_f16(IMAGE_DECLARATION(src0),

891

IMAGE_DECLARATION(src1),

892

IMAGE_DECLARATION(dst))

893

{

894

int idx = get_global_id(0) * 8;

895

896

/* Compute the address for the vector A and matrix B */

897

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes, src1_offset_first_element_in_bytes));

898

src_addr.s1 += idx * sizeof(half);

899

900

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(half));

half8 acc = 0.0f;

for(; src_addr.s0 <= (end_row_vec_a - 4 * sizeof(half)); src_addr += (int2)(4 * sizeof(half), 4 * src1_stride_y))

905

{

906

half4 a0 = vload4(0, (__global half *)(src0_ptr + src_addr.s0));

907

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 0 * src1_stride_y));

908

half8 b1 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 1 * src1_stride_y));

909

half8 b2 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 2 * src1_stride_y));

910

half8 b3 = vload8(0, (__global half *)(src1_ptr + src_addr.s1 + 3 * src1_stride_y));

911

912

acc += b0 * (half8)a0.s0;

913

acc += b1 * (half8)a0.s1;

914

acc += b2 * (half8)a0.s2;

915

acc += b3 * (half8)a0.s3;

916

}

917

918

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(half), src1_stride_y))

919

{

920

half a0 = *((__global half *)(src0_ptr + src_addr.s0));

921

half8 b0 = vload8(0, (__global half *)(src1_ptr + src_addr.s1));

922

923

acc += b0 * (half8)a0;

924

}

925

926

/* Compute destination address */

927

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

928

929

/* Multiply by the weight of vector-matrix product */

930

acc = acc * (half8)ALPHA;

931

932

vstore8(acc, 0, (__global half *)(offset(&dst, 0, 0)));

933

}

934

#endif /* (defined WIDTH_VECTOR_A) */

935

#endif /* (defined WIDTH_MATRIX_B && defined ALPHA) */

936

937

#if(defined BETA)

938

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

939

*

940

* @attention The beta's value need to be passed at compile time using -DBETA

941

*

942

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F32

943

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

944

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

945

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

946

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

947

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

948

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32

949

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

950

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

951

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

952

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

953

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

954

*/

955

__kernel void gemm_ma_f32(IMAGE_DECLARATION(src),

956

IMAGE_DECLARATION(dst))

957

{

958

/* Compute source and destination addresses */

959

Image src = CONVERT_TO_IMAGE_STRUCT(src);

960

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

961

962

/* Load values from A x B */

963

float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

964

965

/* Load values from Matrix C */

966

float4 c = vload4(0, (__global float *)src.ptr);

967

968

/* Computes alpha * axb + beta * c */

969

float4 out = alpha_ab + (float4)BETA * c;

970

971

/* Store final result in axb matrix */

972

vstore4(out, 0, (__global float *)dst.ptr);

973

}

974

975

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

976

*

977

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

978

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

979

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

980

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

981

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

982

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

983

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: F16

984

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

985

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

986

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

987

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

988

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

989

*/

990

__kernel void gemm_ma_f16(IMAGE_DECLARATION(src),

991

IMAGE_DECLARATION(dst))

992

{

993

/* Compute source and destination addresses */

994

Image src = CONVERT_TO_IMAGE_STRUCT(src);

995

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

996

997

/* Load values from A x B */

998

half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

999

1000

/* Load values from Matrix C */

1001

half8 c = vload8(0, (__global half *)src.ptr);

1002

1003

/* Computes alpha * axb + beta * c */

1004

half8 out = alpha_ab + (half8)BETA * c;

1005

1006

/* Store final result in axb matrix */

1007

vstore8(out, 0, (__global half *)dst.ptr);

1008

}

1009

#endif /* (defined BETA) */

1010

1011

#if(defined WIDTH_VECTOR_A)

1012

/** This OpenCL kernel computes the vector by matrix multiplication between each row of A (src0) and matrix B (src1) used for locally connected layer

1013

*

1014

* @attention The width of A need to be passed at compile time using -DWIDTH_VECTOR_A

1015

*

1016

* @attention The input A and matrix B must not be reshaped

1017

*

1018

* @param[in] src0_ptr Pointer to the source matrix. Supported data types: F32

1019

* @param[in] src0_stride_x Stride of the source matrix in X dimension (in bytes)

1020

* @param[in] src0_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1021

* @param[in] src0_stride_y Stride of the source matrix in Y dimension (in bytes)

1022

* @param[in] src0_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1023

* @param[in] src0_offset_first_element_in_bytes The offset of the first element in the source matrix

1024

* @param[in] src1_ptr Pointer to the source matrix. Supported data types: F32

1025

* @param[in] src1_stride_x Stride of the source matrix in X dimension (in bytes)

1026

* @param[in] src1_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1027

* @param[in] src1_stride_y Stride of the source matrix in Y dimension (in bytes)

1028

* @param[in] src1_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1029

* @param[in] src1_stride_z Stride of the source matrix in Z dimension (in bytes)

1030

* @param[in] src1_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1031

* @param[in] src1_offset_first_element_in_bytes The offset of the first element in the source matrix

1032

* @param[out] dst_ptr Pointer to the destination matrix Supported data types: F32

1033

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1034

* @param[in] dst_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)

1035

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1036

* @param[in] dst_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)

1037

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

1038

*/

1039

__kernel void gemm_lc_vm_f32(IMAGE_DECLARATION(src0),

1040

TENSOR3D_DECLARATION(src1),

1041

IMAGE_DECLARATION(dst))

1042

{

1043

int idx = get_global_id(0) * 4;

1044

int idy = get_global_id(1);

1045

1046

/* Compute the address for the vector A and matrix B */

1047

int2 src_addr = ((int2)(src0_offset_first_element_in_bytes + src0_stride_y * idy, src1_offset_first_element_in_bytes + src1_stride_z * idy));

1048

src_addr.s1 += idx * sizeof(float);

1049

1050

int end_row_vec_a = src_addr.s0 + (WIDTH_VECTOR_A * sizeof(float));

float4 acc = 0.0f;

for(; src_addr.s0 <= (end_row_vec_a - 2 * sizeof(float)); src_addr += (int2)(2 * sizeof(float), 2 * src1_stride_y))

1055

{

1056

float2 a0 = vload2(0, (__global float *)(src0_ptr + src_addr.s0));

1057

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1058

float4 b1 = vload4(0, (__global float *)(src1_ptr + src_addr.s1 + src1_stride_y));

1059

1060

acc += b0 * (float4)a0.s0;

1061

acc += b1 * (float4)a0.s1;

1062

}

1063

1064

for(; src_addr.s0 < end_row_vec_a; src_addr += (int2)(sizeof(float), src1_stride_y))

1065

{

1066

float a0 = *((__global float *)(src0_ptr + src_addr.s0));

1067

float4 b0 = vload4(0, (__global float *)(src1_ptr + src_addr.s1));

1068

1069

acc += b0 * (float4)a0;

1070

}

1071

1072

/* Compute destination address */

1073

Image dst = CONVERT_TO_IMAGE_STRUCT(dst);

1074

1075

vstore4(acc, 0, (__global float *)(offset(&dst, 0, 0)));

1076

}

1077

#endif /* (defined WIDTH_VECTOR_A) */