Blame - src/core/CL/cl_kernels/common/gemm.cl - ml/ComputeLibrary

2022-11-03 09:30:56 +0000

[diff] [blame]

438

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);

Ramy Elgammal

2022-02-01 23:01:27 +0000

[diff] [blame]

439

#endif // defined(ACTIVATION_TYPE)

440

441

// Store output block

442

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

443

444

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef RHS_STEP_LOOP

}

#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_T)

450

451

#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)

452

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in an OpenCL image object.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes).
 *                                                Not referenced in the kernel body: the RHS batch is addressed through RHS_HEIGHT instead.
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
 * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
 * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 */

512

__kernel void gemm_mm_reshaped_only_rhs_t_texture(IMAGE_DECLARATION(lhs),

513

__read_only image2d_t rhs_img,

514

#if defined(BETA)

515

IMAGE_DECLARATION(bias),

516

#endif // defined(BETA)

517

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

523

uint dst_stride_z

524

#if defined(REINTERPRET_INPUT_AS_3D)

525

,

526

uint lhs_cross_plane_pad

527

#endif // REINTERPRET_INPUT_AS_3D

528

#if defined(REINTERPRET_OUTPUT_AS_3D)

529

,

530

uint dst_cross_plane_pad

531

#endif // REINTERPRET_OUTPUT_AS_3D

,

const int M,

const int N,

const int K)

{

// Pixel unit
// NOTE(review): CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT is defined outside this view; presumably it yields
// the number of image pixels spanned by one K0-wide vector - confirm against the helper definition.

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

539

540

// Number of trailing elements along K that do not fill a whole K0 block

const uint LEFTOVER_K = K % K0;

541

542

// Block size

543

#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

544

545

// RHS offset and step X

546

#if defined(RHS_INTERLEAVE)

547

#define RHS_OFFSET_X (PIXEL_UNIT)

548

#define RHS_STEP_X (PIXEL_UNIT * (H0))

549

#define RHS_STEP_LOOP (1)

550

#else // defined(RHS_INTERLEAVE)

551

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

552

#define RHS_STEP_X PIXEL_UNIT

553

#define RHS_STEP_LOOP (H0)

554

#endif // defined(RHS_INTERLEAVE)

555

556

uint x = get_global_id(0);

557

uint y = get_global_id(1);

558

uint z = get_global_id(2);

559

560

// cond_y holds only for the first row block (y == 0); cond_x holds when the N0 columns of this
// work-item reach or pass column N. Both are forwarded to the *_BOUNDARY_AWARE load/store macros below.

const bool cond_y = y == 0;

561

const bool cond_x = ((x + 1) * N0 >= N);

562

563

#if defined(DUMMY_WORK_ITEMS)

564

// Work-items whose block starts at or beyond column N or row M have nothing to compute

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

569

570

// Compute LHS matrix address

571

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

572

573

#if defined(MATRIX_B_DEPTH)

574

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

575

const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);

576

#else // defined(MATRIX_B_DEPTH)

577

const uint z_rhs = get_global_id(2);

578

#endif // defined(MATRIX_B_DEPTH)

579

580

// Compute RHS matrix coordinates
// (x_rhs, y_rhs) are the coordinates handed to LOAD_TEXTURE2D; the batch index is folded into
// the row coordinate through RHS_HEIGHT.

581

uint x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;

582

const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

583

584

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);

585

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

586

587

#if defined(REINTERPRET_INPUT_AS_3D)

588

// The plane (zlhs) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

589

CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

590

591

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

592

// multiply lhs_stride_z by DEPTH_GEMM3D

593

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

594

595

#else // defined(REINTERPRET_INPUT_AS_3D)

596

597

// Add offset for batched GEMM

598

lhs_offset += z * lhs_stride_z;

599

600

#endif // defined(REINTERPRET_INPUT_AS_3D)

601

602

// Initialize the accumulators

603

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

604

605

// Main loop: consume K in whole steps of K0. The K % K0 remainder is handled after the loop.

int i = 0;

606

for(; i <= (K - K0); i += K0)

607

{

608

// Load values from LHS matrix

609

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

610

611

// Load values from RHS matrix stored in a cl_image

612

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

613

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

614

615

// Accumulate

616

ARM_DOT_K0XN0(K0, a0, b, c0);

617

#if M0 > 1

618

ARM_DOT_K0XN0(K0, a1, b, c1);

619

#endif // M0 > 1

620

#if M0 > 2

621

ARM_DOT_K0XN0(K0, a2, b, c2);

622

#endif // M0 > 2

623

#if M0 > 3

624

ARM_DOT_K0XN0(K0, a3, b, c3);

625

#endif // M0 > 3

626

#if M0 > 4

627

ARM_DOT_K0XN0(K0, a4, b, c4);

628

#endif // M0 > 4

629

#if M0 > 5

630

ARM_DOT_K0XN0(K0, a5, b, c5);

631

#endif // M0 > 5

632

#if M0 > 6

633

ARM_DOT_K0XN0(K0, a6, b, c6);

634

#endif // M0 > 6

635

#if M0 > 7

636

ARM_DOT_K0XN0(K0, a7, b, c7);

637

#endif // M0 > 7

638

639

lhs_offset += K0 * sizeof(DATA_TYPE);

640

x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;

}

if(LEFTOVER_K != 0)

{

// Note: We cannot read out-of-bound elements from the RHS matrix because

646

// the RHS width is always multiple of K0. This is not true for the LHS matrix

647

// Left-over accumulations for LHS matrix
// The union exposes each LHS row both as an array (s), filled one element at a time below, and as
// a vector (v), consumed by ARM_DOT_K0XN0. Elements at index >= LEFTOVER_K keep the zero initializer.

union UNION_VEC_TYPE

{

DATA_TYPE s[K0];

VEC_DATA_TYPE(DATA_TYPE, K0)

v;

};

union UNION_VEC_TYPE a0 = {.v = 0 };

657

#if M0 > 1

658

union UNION_VEC_TYPE a1 = {.v = 0 };

659

#endif // M0 > 1

660

#if M0 > 2

661

union UNION_VEC_TYPE a2 = {.v = 0 };

662

#endif // M0 > 2

663

#if M0 > 3

664

union UNION_VEC_TYPE a3 = {.v = 0 };

665

#endif // M0 > 3

666

#if M0 > 4

667

union UNION_VEC_TYPE a4 = {.v = 0 };

668

#endif // M0 > 4

669

#if M0 > 5

670

union UNION_VEC_TYPE a5 = {.v = 0 };

671

#endif // M0 > 5

672

#if M0 > 6

673

union UNION_VEC_TYPE a6 = {.v = 0 };

674

#endif // M0 > 6

675

#if M0 > 7

676

union UNION_VEC_TYPE a7 = {.v = 0 };

677

#endif // M0 > 7

678

679

REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);

680

681

// Load from RHS matrix

682

LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

683

684

// Load from LHS matrix

685

for(int k = 0; k < LEFTOVER_K; ++k)

686

{

687

a0.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0);

688

#if M0 > 1

689

a1.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1);

690

#endif // M0 > 1

691

#if M0 > 2

692

a2.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2);

693

#endif // M0 > 2

694

#if M0 > 3

695

a3.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3);

696

#endif // M0 > 3

697

#if M0 > 4

698

a4.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4);

699

#endif // M0 > 4

700

#if M0 > 5

701

a5.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5);

702

#endif // M0 > 5

703

#if M0 > 6

704

a6.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6);

705

#endif // M0 > 6

706

#if M0 > 7

707

a7.s[k] = *(__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7);

708

#endif // M0 > 7

709

710

lhs_offset += sizeof(DATA_TYPE);

}

// Accumulate

ARM_DOT_K0XN0(K0, a0.v, b, c0);

715

#if M0 > 1

716

ARM_DOT_K0XN0(K0, a1.v, b, c1);

717

#endif // M0 > 1

718

#if M0 > 2

719

ARM_DOT_K0XN0(K0, a2.v, b, c2);

720

#endif // M0 > 2

721

#if M0 > 3

722

ARM_DOT_K0XN0(K0, a3.v, b, c3);

723

#endif // M0 > 3

724

#if M0 > 4

725

ARM_DOT_K0XN0(K0, a4.v, b, c4);

726

#endif // M0 > 4

727

#if M0 > 5

728

ARM_DOT_K0XN0(K0, a5.v, b, c5);

729

#endif // M0 > 5

730

#if M0 > 6

731

ARM_DOT_K0XN0(K0, a6.v, b, c6);

732

#endif // M0 > 6

733

#if M0 > 7

734

ARM_DOT_K0XN0(K0, a7.v, b, c7);

#endif // M0 > 7

}

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

739

740

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout(M0-1)=0;

741

742

#if defined(REINTERPRET_OUTPUT_AS_3D)

743

744

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

745

CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

746

747

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

748

// multiply dst_stride_z by DEPTH_GEMM3D

749

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

750

751

#else // defined(REINTERPRET_OUTPUT_AS_3D)

752

753

// Add offset for batched GEMM

754

dst_addr += z * dst_stride_z;

755

756

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

757

758

// Multiply by the weight of matrix-matrix product and store the result

759

#if defined(ALPHA)

760

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

761

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

766

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

767

768

LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

769

770

#ifndef UNIT_BETA

771

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

772

#endif // UNIT_BETA

773

774

// c = c + bias[broadcasted]

775

ADD_BLOCK_BROADCAST(M0, c, bias0);

776

777

#else // defined(BROADCAST_BIAS)

778

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

779

780

LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

781

782

#ifndef UNIT_BETA

783

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

788

789

#endif // defined(BROADCAST_BIAS)

790

#endif // defined(BETA)

791

792

#if defined(ACTIVATION_TYPE)

Gian Marco Iodice

2022-11-03 09:30:56 +0000

[diff] [blame]

793

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);

Ramy Elgammal

2022-02-01 23:01:27 +0000

[diff] [blame]

794

#endif // defined(ACTIVATION_TYPE)

795

796

// Store output block

797

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

798

799

// Drop the kernel-local helper macros so they do not leak into the code that follows

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef RHS_STEP_LOOP

#undef PIXEL_UNIT

}

#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_T_TEXTURE)

806

807

/** Fused multiply-add accumulation: updates @p c in place with fma(a, b, c).
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator; it is assigned the result, so it must be a modifiable lvalue
 */
#define VFMA(a, b, c) \

({ \

c = fma(a, b, c); \

})

/** Applies VFMA to each of the M0 rows of an accumulator block.
 *
 * For every row r in [0, M0), component s<i> of the variable a<r> is converted to a
 * VEC_DATA_TYPE(DATA_TYPE, N0) value and accumulated with @p b into c<r>:
 * c<r> = fma((a<r>).s<i>, b, c<r>).
 *
 * @note One expansion is provided per supported M0 (1 to 8); any other value stops compilation with #error.
 * @note @p i, @p a and @p c are used only as token-pasting operands (s##i, a##r, c##r), so callers pass
 *       the component suffix (0-9, A-F) and the base names of the row variables, not expressions.
 *
 * @param[in]      i Component suffix selecting which element of each a<r> is used
 * @param[in]      a Base name of the row variables a0 ... a(M0-1)
 * @param[in]      b Vector multiplied with the selected element of every row
 * @param[in, out] c Base name of the accumulators c0 ... c(M0-1)
 */
#if M0 == 1

#define VFMA_M0xN0(i, a, b, c) \

814

({ \

815

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

816

})

817

#elif M0 == 2 // M0 == 2

818

#define VFMA_M0xN0(i, a, b, c) \

819

({ \

820

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

821

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

822

})

823

#elif M0 == 3 // M0 == 3

824

#define VFMA_M0xN0(i, a, b, c) \

825

({ \

826

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

827

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

828

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

829

})

830

#elif M0 == 4 // M0 == 4

831

#define VFMA_M0xN0(i, a, b, c) \

832

({ \

833

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

834

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

835

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

836

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

837

})

838

#elif M0 == 5 // M0 == 5

839

#define VFMA_M0xN0(i, a, b, c) \

840

({ \

841

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

842

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

843

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

844

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

845

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

846

})

847

#elif M0 == 6 // M0 == 6

848

#define VFMA_M0xN0(i, a, b, c) \

849

({ \

850

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

851

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

852

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

853

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

854

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

855

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \

856

})

857

#elif M0 == 7 // M0 == 7

858

#define VFMA_M0xN0(i, a, b, c) \

859

({ \

860

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

861

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

862

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

863

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

864

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

865

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \

866

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \

867

})

868

#elif M0 == 8 // M0 == 8

869

#define VFMA_M0xN0(i, a, b, c) \

870

({ \

871

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \

872

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \

873

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \

874

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \

875

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \

876

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \

877

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \

878

VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \

879

})

880

#else // M0 not supported

881

#error "M0 not supported"

882

#endif // M0 not supported

883

884

#if defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)

885

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

886

* The LHS matrix is NOT reshaped

887

* The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed

Ramy Elgammal

2022-02-01 23:01:27 +0000

[diff] [blame]

888

*

889

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

890

* @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.

891

* @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).

892

* @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)

893

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

894

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

895

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

896

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

897

* @note Only the following configurations of M0, N0 and K0 are currently supported:

898

* - M0 = 1, 2, 3, 4, 5, 6, 7, 8

899

* - N0 = 2, 3, 4, 8, 16

900

* - K0 = 2, 3, 4, 8, 16

901

* - H0 >= 1

902

*

903

* @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), the A and B values required by some activation functions should be passed at compile time as well, using -DA_VAL= and -DB_VAL= respectively.

904

* The activation function is performed after the bias addition

905

* @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:

906

* -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D

907

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

908

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

909

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

910

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix

911

*

912

* @param[in] lhs_ptr Pointer to the LHS matrix. Supported data type: F16/F32

913

* @param[in] lhs_stride_x Stride of the LHS matrix in X dimension (in bytes)

914

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

915

* @param[in] lhs_stride_y Stride of the LHS matrix in Y dimension (in bytes)

916

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

917

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS matrix

918

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

919

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

920

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

921

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

922

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

923

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

924

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

925

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

926

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

927

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

928

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

929

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

930

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

931

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

932

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

933

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

934

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

935

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

936

* @param[in] lhs_stride_z Stride of the LHS matrix in Z dimension (in bytes)

937

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

938

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

939

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

940

* @param[in] lhs_cross_plane_pad (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)

941

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

942

* @param[in] M Number of rows in LHS matrix not reshaped.

943

* @param[in] N Number of columns in RHS matrix not reshaped.

944

* @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.

945

*/

946

__kernel void gemm_mm_reshaped_only_rhs_nt(IMAGE_DECLARATION(lhs),

947

IMAGE_DECLARATION(rhs),

948

#if defined(BETA)

949

IMAGE_DECLARATION(bias),

950

#endif // defined(BETA)

951

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

957

uint dst_stride_z

958

#if defined(REINTERPRET_INPUT_AS_3D)

959

,

960

uint lhs_cross_plane_pad

961

#endif // REINTERPRET_INPUT_AS_3D

962

#if defined(REINTERPRET_OUTPUT_AS_3D)

963

,

964

uint dst_cross_plane_pad

965

#endif // REINTERPRET_OUTPUT_AS_3D

,

const int M,

const int N,

const int K)

{

// Block size

#define RHS_BLOCK_SIZE ((K0) * (N0))

973

974

// RHS offset and step X

975

#if defined(RHS_INTERLEAVE)

976

#define RHS_OFFSET_X (N0)

977

#define RHS_STEP_X ((N0) * (H0))

978

#define RHS_STEP_LOOP (1)

979

#else // defined(RHS_INTERLEAVE)

980

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

981

#define RHS_STEP_X (N0)

982

#define RHS_STEP_LOOP (H0)

983

#endif // defined(RHS_INTERLEAVE)

984

985

uint x = get_global_id(0);

986

uint y = get_global_id(1);

987

uint z = get_global_id(2);

988

989

const bool cond_y = y == 0;

990

const bool cond_x = ((x + 1) * N0 >= N);

991

992

#if defined(DUMMY_WORK_ITEMS)

993

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

998

999

// Compute LHS matrix address

1000

uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

1001

1002

// Compute RHS reshaped matrix address

1003

uint rhs_offset = rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

1004

1005

#if defined(MATRIX_B_DEPTH)

1006

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1007

rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;

1008

#else // defined(MATRIX_B_DEPTH)

1009

rhs_offset += z * rhs_stride_z;

1010

#endif // defined(MATRIX_B_DEPTH)

1011

1012

REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0); //uint zin0=0,zin1=0,zin2=0,... zin7=0;

1013

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0); //uint zero0=0,zero1=0,zero2=0,... zero7=0;

1014

1015

#if defined(REINTERPRET_INPUT_AS_3D)

1016

1017

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1018

CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

1019

1020

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1021

// multiply lhs_stride_z by DEPTH_GEMM3D

1022

lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

1023

1024

#else // defined(REINTERPRET_INPUT_AS_3D)

1025

1026

// Add offset for batched GEMM

1027

lhs_offset += z * lhs_stride_z;

1028

1029

#endif // defined(REINTERPRET_INPUT_AS_3D)

1030

1031

// Initialize the accumulators

1032

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0) c0=0,c1=0,c2=0,... c(N0-1)=0;

1033

1034

int i = 0;

1035

for(; i <= (K - K0); i += K0)

1036

{

1037

// Supported cases (M0, K0):

1038

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1039

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1040

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1041

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1042

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1043

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1044

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1045

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

1046

// Load values from LHS matrix

1047

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

1048

1049

VEC_DATA_TYPE(DATA_TYPE, N0)

1050

b0;

1051

1052

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1053

VFMA_M0xN0(0, a, b0, c);

1054

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 1 * RHS_STEP_X * sizeof(DATA_TYPE)));

1055

VFMA_M0xN0(1, a, b0, c);

1056

#if K0 > 2

1057

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 2 * RHS_STEP_X * sizeof(DATA_TYPE)));

1058

VFMA_M0xN0(2, a, b0, c);

1059

#endif // K0 > 2

1060

#if K0 > 3

1061

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 3 * RHS_STEP_X * sizeof(DATA_TYPE)));

1062

VFMA_M0xN0(3, a, b0, c);

1063

#endif // K0 > 3

1064

#if K0 > 4

1065

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 4 * RHS_STEP_X * sizeof(DATA_TYPE)));

1066

VFMA_M0xN0(4, a, b0, c);

1067

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 5 * RHS_STEP_X * sizeof(DATA_TYPE)));

1068

VFMA_M0xN0(5, a, b0, c);

1069

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 6 * RHS_STEP_X * sizeof(DATA_TYPE)));

1070

VFMA_M0xN0(6, a, b0, c);

1071

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 7 * RHS_STEP_X * sizeof(DATA_TYPE)));

1072

VFMA_M0xN0(7, a, b0, c);

1073

#endif // K0 > 4

1074

#if K0 > 8

1075

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 8 * RHS_STEP_X * sizeof(DATA_TYPE)));

1076

VFMA_M0xN0(8, a, b0, c);

1077

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 9 * RHS_STEP_X * sizeof(DATA_TYPE)));

1078

VFMA_M0xN0(9, a, b0, c);

1079

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 10 * RHS_STEP_X * sizeof(DATA_TYPE)));

1080

VFMA_M0xN0(A, a, b0, c);

1081

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 11 * RHS_STEP_X * sizeof(DATA_TYPE)));

1082

VFMA_M0xN0(B, a, b0, c);

1083

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 12 * RHS_STEP_X * sizeof(DATA_TYPE)));

1084

VFMA_M0xN0(C, a, b0, c);

1085

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 13 * RHS_STEP_X * sizeof(DATA_TYPE)));

1086

VFMA_M0xN0(D, a, b0, c);

1087

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 14 * RHS_STEP_X * sizeof(DATA_TYPE)));

1088

VFMA_M0xN0(E, a, b0, c);

1089

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 15 * RHS_STEP_X * sizeof(DATA_TYPE)));

1090

VFMA_M0xN0(F, a, b0, c);

1091

#endif // K0 > 8

1092

1093

lhs_offset += K0 * sizeof(DATA_TYPE);

1094

rhs_offset += K0 * RHS_STEP_X * RHS_STEP_LOOP * sizeof(DATA_TYPE);

1095

}

1096

1097

// Left-over accumulations

1098

for(; i < K; ++i)

1099

{

1100

// Load values from LHS matrix

1101

VEC_DATA_TYPE(DATA_TYPE, 2)

1102

a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));

1103

#if M0 > 1

1104

VEC_DATA_TYPE(DATA_TYPE, 2)

1105

a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));

1106

#endif // M0 > 1

1107

#if M0 > 2

1108

VEC_DATA_TYPE(DATA_TYPE, 2)

1109

a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));

1110

#endif // M0 > 2

1111

#if M0 > 3

1112

VEC_DATA_TYPE(DATA_TYPE, 2)

1113

a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));

1114

#endif // M0 > 3

1115

#if M0 > 4

1116

VEC_DATA_TYPE(DATA_TYPE, 2)

1117

a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));

1118

#endif // M0 > 4

1119

#if M0 > 5

1120

VEC_DATA_TYPE(DATA_TYPE, 2)

1121

a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));

1122

#endif // M0 > 5

1123

#if M0 > 6

1124

VEC_DATA_TYPE(DATA_TYPE, 2)

1125

a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));

1126

#endif // M0 > 6

1127

#if M0 > 7

1128

VEC_DATA_TYPE(DATA_TYPE, 2)

1129

a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));

1130

#endif // M0 > 7

1131

1132

VEC_DATA_TYPE(DATA_TYPE, N0)

1133

b0;

1134

1135

b0 = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * RHS_STEP_X * sizeof(DATA_TYPE)));

1136

VFMA_M0xN0(0, a, b0, c);

1137

1138

lhs_offset += sizeof(DATA_TYPE);

1139

rhs_offset += RHS_STEP_X * sizeof(DATA_TYPE);

1140

}

1141

1142

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

1143

1144

REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

1145

1146

#if defined(REINTERPRET_OUTPUT_AS_3D)

1147

// The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

1148

CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

1149

1150

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1151

// multiply dst_stride_z by DEPTH_GEMM3D

1152

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

1153

1154

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1155

1156

// Add offset for batched GEMM

1157

dst_addr += z * dst_stride_z;

1158

1159

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1160

1161

// Multiply by the weight of matrix-matrix product and store the result

1162

#if defined(ALPHA)

1163

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

1164

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

1169

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1170

1171

LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

1172

1173

#ifndef UNIT_BETA

1174

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1175

#endif // UNIT_BETA

1176

1177

// c = c + bias[broadcasted]

1178

ADD_BLOCK_BROADCAST(M0, c, bias0);

1179

1180

#else // defined(BROADCAST_BIAS)

1181

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

1182

1183

LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

1184

1185

#ifndef UNIT_BETA

1186

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

#endif // UNIT_BETA

// c = c + bias

ADD_BLOCK(M0, c, bias);

1191

1192

#endif // defined(BROADCAST_BIAS)

1193

#endif // defined(BETA)

1194

1195

#if defined(ACTIVATION_TYPE)

Gian Marco Iodice

2022-11-03 09:30:56 +0000

[diff] [blame]

1196

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);

Ramy Elgammal

2022-02-01 23:01:27 +0000

[diff] [blame]

1197

#endif // defined(ACTIVATION_TYPE)

1198

1199

// Store output block

1200

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

1201

1202

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef RHS_STEP_LOOP

}

#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_NT)

1208

1209

#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS is reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the block K0xN0 is NOT transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the RHS matrix (N0 and K0) must be passed at compile time using -DN0 and -DK0 (e.g. -DN0=8, -DK0=4).
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - H0 >= 1
 *
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 * @note The helper macros PIXEL_UNIT, RHS_BLOCK_SIZE, RHS_OFFSET_X, RHS_STEP_X and RHS_STEP_LOOP are defined inside the kernel body
 *       and are all undefined again before the kernel's closing brace.
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
 * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
 * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 */
__kernel void gemm_mm_reshaped_only_rhs_nt_texture(IMAGE_DECLARATION(lhs),
                                                   __read_only image2d_t rhs_img,
#if defined(BETA)
                                                   IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                   IMAGE_DECLARATION(dst),
                                                   uint lhs_stride_z,
                                                   uint rhs_stride_z,
#if defined(BETA)
                                                   uint bias_stride_z,
#endif //defined(BETA)
                                                   uint dst_stride_z
#if defined(REINTERPRET_INPUT_AS_3D)
                                                   ,
                                                   uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                   ,
                                                   uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                   ,
                                                   const int M,
                                                   const int N,
                                                   const int K)
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

    // Block size
#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X ((PIXEL_UNIT) * (H0))
#define RHS_STEP_LOOP 1
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (PIXEL_UNIT)
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

    // Flags forwarded, together with PARTIAL_STORE_M0/PARTIAL_STORE_N0, to the *_BOUNDARY_AWARE load/store macros below:
    // cond_y is set for the first work-item along y, cond_x when this work-item's N0-wide block reaches or passes column N
    const bool cond_y = y == 0;
    const bool cond_x = ((x + 1) * N0 >= N);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the MxN output have nothing to compute
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (z % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = z;
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates
    // The batch (z_rhs) is folded into the image row coordinate through RHS_HEIGHT
    uint       x_rhs = (x % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

    REPEAT_VAR_INIT_TO_CONST(8, uint, zin, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zin, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0);

    // Main loop: consume K0 values of the K dimension per iteration
    int i = 0;
    for(; i <= (K - K0); i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zin);

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(0, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(1, a, b0, c);
#if K0 > 2
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(2, a, b0, c);
#endif // K0 > 2
#if K0 > 3
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(3, a, b0, c);
#endif // K0 > 3
#if K0 > 4
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(4, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(5, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(6, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(7, a, b0, c);
#endif // K0 > 4
#if K0 > 8
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(8, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(9, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(A, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(B, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(C, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(D, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(E, a, b0, c);
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));
        VFMA_M0xN0(F, a, b0, c);
#endif // K0 > 8

        lhs_offset += K0 * sizeof(DATA_TYPE);
        x_rhs += K0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    // Left-over accumulations: the values of K not covered by the K0-stepped loop above are processed one at a time
    for(; i < K; ++i)
    {
        // Load values from LHS matrix
        // NOTE(review): each a<i> is declared as a 2-wide vector but initialised from a single scalar load;
        // presumably this is so that VFMA_M0xN0(0, ...) can address a vector component - confirm against the macro definition
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zin0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zin1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zin2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zin3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zin4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zin5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zin6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zin7));
#endif // M0 > 7

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;
        b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

        VFMA_M0xN0(0, a, b0, c);

        lhs_offset += sizeof(DATA_TYPE);
        x_rhs += RHS_STEP_X;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(8, uint, zout, 0); //uint zout0=0,zout1=0,zout2=0,... zout7=0;

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

    // Undefine every helper macro introduced at the top of this kernel so that none of them leaks into the rest of the file
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef RHS_STEP_LOOP
#undef PIXEL_UNIT
}
#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_ONLY_RHS_NT_TEXTURE)

1527

#endif // defined(M0) && defined(N0) && defined(K0) && defined(H0) && defined(DATA_TYPE)

1528

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

1529

#if defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1530

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1531

/* ARM_DOT_K0(a, b, c): accumulates into c the dot product of the first K0 components (.s0, .s1, ...) of a and b.
 *
 * The accumulation of a single component is selected once by MIXED_PRECISION:
 *  - MIXED_PRECISION defined:     c += a.s<i> * b.s<i>
 *  - MIXED_PRECISION not defined: c = fma(a.s<i>, b.s<i>, c)
 * Components are always accumulated in ascending order, starting from .s0.
 * Supported values of K0: 2, 3, 4, 8, 16. Any other value stops the compilation.
 */
#if defined(MIXED_PRECISION)
#define ARM_DOT_K0_STEP(a, b, c, i) c += a.s##i * b.s##i
#else // defined(MIXED_PRECISION)
#define ARM_DOT_K0_STEP(a, b, c, i) c = fma(a.s##i, b.s##i, c)
#endif // defined(MIXED_PRECISION)

#if K0 == 2
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_STEP(a, b, c, 0); \
        ARM_DOT_K0_STEP(a, b, c, 1); \
    })
#elif K0 == 3 // K0 == 3
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_STEP(a, b, c, 0); \
        ARM_DOT_K0_STEP(a, b, c, 1); \
        ARM_DOT_K0_STEP(a, b, c, 2); \
    })
#elif K0 == 4 // K0 == 4
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_STEP(a, b, c, 0); \
        ARM_DOT_K0_STEP(a, b, c, 1); \
        ARM_DOT_K0_STEP(a, b, c, 2); \
        ARM_DOT_K0_STEP(a, b, c, 3); \
    })
#elif K0 == 8 // K0 == 8
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_STEP(a, b, c, 0); \
        ARM_DOT_K0_STEP(a, b, c, 1); \
        ARM_DOT_K0_STEP(a, b, c, 2); \
        ARM_DOT_K0_STEP(a, b, c, 3); \
        ARM_DOT_K0_STEP(a, b, c, 4); \
        ARM_DOT_K0_STEP(a, b, c, 5); \
        ARM_DOT_K0_STEP(a, b, c, 6); \
        ARM_DOT_K0_STEP(a, b, c, 7); \
    })
#elif K0 == 16 // K0 == 16
#define ARM_DOT_K0(a, b, c)          \
    ({                               \
        ARM_DOT_K0_STEP(a, b, c, 0); \
        ARM_DOT_K0_STEP(a, b, c, 1); \
        ARM_DOT_K0_STEP(a, b, c, 2); \
        ARM_DOT_K0_STEP(a, b, c, 3); \
        ARM_DOT_K0_STEP(a, b, c, 4); \
        ARM_DOT_K0_STEP(a, b, c, 5); \
        ARM_DOT_K0_STEP(a, b, c, 6); \
        ARM_DOT_K0_STEP(a, b, c, 7); \
        ARM_DOT_K0_STEP(a, b, c, 8); \
        ARM_DOT_K0_STEP(a, b, c, 9); \
        ARM_DOT_K0_STEP(a, b, c, A); \
        ARM_DOT_K0_STEP(a, b, c, B); \
        ARM_DOT_K0_STEP(a, b, c, C); \
        ARM_DOT_K0_STEP(a, b, c, D); \
        ARM_DOT_K0_STEP(a, b, c, E); \
        ARM_DOT_K0_STEP(a, b, c, F); \
    })
#else // K0 not supported
#error "K0 value not supported"
#endif // K0 conditions

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1646

Giorgio Arena

7d2f69f

2021-05-11 16:39:33 +0100

[diff] [blame]

1647

// Drop any ARM_DOT_K0XN0 definition that is still active at this point before defining it again below
#if defined(ARM_DOT_K0XN0)
#undef ARM_DOT_K0XN0
#endif // defined(ARM_DOT_K0XN0)

/* ARM_DOT_K0XN0(a, b, c): issues one ARM_DOT_K0 per output component, for N0 components.
 *
 *  - a is forwarded unchanged to every ARM_DOT_K0
 *  - b is a name prefix: component i uses the variable b<i> (b0, b1, ..., bF), built by token pasting
 *  - c is the accumulator vector: component i is accumulated into c.s<i>
 * Supported values of N0: 2, 3, 4, 8, 16. Any other value stops the compilation.
 */
#if N0 == 2
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
    })
#elif N0 == 3 // N0 == 3
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
    })
#elif N0 == 4 // N0 == 4
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
    })
#elif N0 == 8 // N0 == 8
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
        ARM_DOT_K0((a), (b##4), (c.s4)); \
        ARM_DOT_K0((a), (b##5), (c.s5)); \
        ARM_DOT_K0((a), (b##6), (c.s6)); \
        ARM_DOT_K0((a), (b##7), (c.s7)); \
    })
#elif N0 == 16 // N0 == 16
#define ARM_DOT_K0XN0(a, b, c)           \
    ({                                   \
        ARM_DOT_K0((a), (b##0), (c.s0)); \
        ARM_DOT_K0((a), (b##1), (c.s1)); \
        ARM_DOT_K0((a), (b##2), (c.s2)); \
        ARM_DOT_K0((a), (b##3), (c.s3)); \
        ARM_DOT_K0((a), (b##4), (c.s4)); \
        ARM_DOT_K0((a), (b##5), (c.s5)); \
        ARM_DOT_K0((a), (b##6), (c.s6)); \
        ARM_DOT_K0((a), (b##7), (c.s7)); \
        ARM_DOT_K0((a), (b##8), (c.s8)); \
        ARM_DOT_K0((a), (b##9), (c.s9)); \
        ARM_DOT_K0((a), (b##A), (c.sA)); \
        ARM_DOT_K0((a), (b##B), (c.sB)); \
        ARM_DOT_K0((a), (b##C), (c.sC)); \
        ARM_DOT_K0((a), (b##D), (c.sD)); \
        ARM_DOT_K0((a), (b##E), (c.sE)); \
        ARM_DOT_K0((a), (b##F), (c.sF)); \
    })
#else // N0 not supported
#error "N0 value not supported"
#endif // N0 conditions

1707

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

1708

#if defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1709

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.

1710

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed

1711

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed

1712

*

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1713

* @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)

1714

* @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)

1715

* @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float

Gian Marco Iodice

b0c5037

2019-03-15 10:13:05 +0000

[diff] [blame]

1716

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

1717

* @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1718

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

1719

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

1720

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1721

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

1722

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

1723

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1724

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1725

* @note Only the following configurations of M0, N0 and K0 are currently supported:

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

1726

* - M0 = 2, 3, 4, 5, 6, 7, 8

Gian Marco Iodice

2019-01-11 11:30:55 +0000

[diff] [blame]

1727

* - N0 = 2, 3, 4, 8, 16

1728

* - K0 = 2, 3, 4, 8, 16

Gian Marco Iodice

62251f7

2019-03-11 16:07:12 +0000

[diff] [blame]

1729

* - V0 >= 1

1730

* - H0 >= 1

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1731

*

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1732

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1733

* The activation function is performed after the bias addition

Gian Marco Iodice

2019-07-19 09:54:47 +0100

[diff] [blame]

1734

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1735

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

1736

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

1737

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

1738

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

1739

*

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1740

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F16/F32

1741

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

1742

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1743

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

1744

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1745

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

1746

* @param[in] rhs_ptr Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr

1747

* @param[in] rhs_stride_x Stride of the RHS reshaped matrix in X dimension (in bytes)

1748

* @param[in] rhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1749

* @param[in] rhs_stride_y Stride of the RHS reshaped matrix in Y dimension (in bytes)

1750

* @param[in] rhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1751

* @param[in] rhs_offset_first_element_in_bytes The offset of the first element in the RHS reshaped matrix

1752

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

1753

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

1754

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

1755

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

1756

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

1757

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

1758

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

1759

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

1760

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1761

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

1762

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1763

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1764

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

1765

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

1766

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

1767

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1768

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

1769

* @param[in] M Number of rows in LHS matrix not reshaped.

1770

* @param[in] N Number of columns in RHS matrix not reshaped.

1771

* @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1772

*/

1773

__kernel void gemm_mm_reshaped_lhs_nt_rhs_t(IMAGE_DECLARATION(lhs),

1774

IMAGE_DECLARATION(rhs),

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1775

#if defined(BETA)

1776

IMAGE_DECLARATION(bias),

1777

#endif // defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1778

IMAGE_DECLARATION(dst),

1779

uint lhs_stride_z,

1780

uint rhs_stride_z,

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1781

#if defined(BETA)

1782

uint bias_stride_z,

1783

#endif //defined(BETA)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1784

uint dst_stride_z

1785

#if defined(REINTERPRET_OUTPUT_AS_3D)

1786

,

1787

uint dst_cross_plane_pad

1788

#endif // REINTERPRET_OUTPUT_AS_3D

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

,

const int M,

const int N,

const int K)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1793

{

1794

// Block size

1795

#define LHS_BLOCK_SIZE ((K0) * (M0))

1796

1797

#if defined(LHS_INTERLEAVE)

1798

#define LHS_OFFSET_X (K0)

1799

#define LHS_STEP_X ((K0) * (V0))

1800

#define LHS_STEP_LOOP (1)

1801

#else // defined(INTERLEAVE)

1802

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

1803

#define LHS_STEP_X (K0)

1804

#define LHS_STEP_LOOP (V0)

1805

#endif // defined(INTERLEAVE)

1806

1807

// Block size

1808

#define RHS_BLOCK_SIZE ((K0) * (N0))

1809

1810

// RHS offset and step X

1811

#if defined(RHS_INTERLEAVE)

1812

#define RHS_OFFSET_X (K0)

1813

#define RHS_STEP_X ((K0) * (H0))

1814

#define RHS_STEP_LOOP (1)

1815

#else // defined(RHS_INTERLEAVE)

1816

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

1817

#define RHS_STEP_X (K0)

1818

#define RHS_STEP_LOOP (H0)

1819

#endif // defined(RHS_INTERLEAVE)

1820

Gian Marco Iodice

b0c5037

2019-03-15 10:13:05 +0000

[diff] [blame]

1821

#if defined(DUMMY_WORK_ITEMS)

1822

if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

1827

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1828

// Compute LHS matrix address

1829

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +

1830

(get_global_id(2) * lhs_stride_z);

1831

1832

// Compute RHS matrix address

1833

__global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (get_global_id(0) % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(0) / (uint)H0) * rhs_stride_y;

1834

1835

#if defined(MATRIX_B_DEPTH)

1836

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

1837

rhs_addr += (get_global_id(2) % MATRIX_B_DEPTH) * rhs_stride_z;

1838

#else // defined(MATRIX_B_DEPTH)

1839

rhs_addr += get_global_id(2) * rhs_stride_z;

1840

#endif // defined(MATRIX_B_DEPTH)

1841

1842

// Initialize the accumulators

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1843

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1844

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1845

REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;

1846

REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

Usama Arif

0681e3b

2019-04-25 14:28:07 +0100

[diff] [blame]

1847

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

1848

for(int i = 0; i < K; i += K0)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1849

{

1850

// Supported cases (M0, K0):

Gian Marco Iodice

adc5395

2019-02-15 11:10:31 +0000

[diff] [blame]

1851

// 1,2 - 1,3 - 1,4 - 1,8 - 1,16

1852

// 2,2 - 2,3 - 2,4 - 2,8 - 2,16

1853

// 3,2 - 3,3 - 3,4 - 3,8 - 3,16

1854

// 4,2 - 4,3 - 4,4 - 4,8 - 4,16

1855

// 5,2 - 5,3 - 5,4 - 5,8 - 5,16

1856

// 6,2 - 6,3 - 6,4 - 6,8 - 6,16

1857

// 7,2 - 7,3 - 7,4 - 7,8 - 7,16

1858

// 8,2 - 8,3 - 8,4 - 8,8 - 8,16

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1859

// Load values from LHS matrix

Usama Arif

0681e3b

2019-04-25 14:28:07 +0100

[diff] [blame]

1860

LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1861

1862

// Load values from RHS matrix

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1863

LOAD_BLOCK(N0, K0, DATA_TYPE, b, rhs_addr, 0, RHS_STEP_X * sizeof(DATA_TYPE), zero);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1864

1865

// Accumulate

1866

ARM_DOT_K0XN0(a0, b, c0);

1867

#if M0 > 1

1868

ARM_DOT_K0XN0(a1, b, c1);

1869

#endif // M0 > 1

1870

#if M0 > 2

1871

ARM_DOT_K0XN0(a2, b, c2);

1872

#endif // M0 > 2

1873

#if M0 > 3

1874

ARM_DOT_K0XN0(a3, b, c3);

1875

#endif // M0 > 3

1876

#if M0 > 4

1877

ARM_DOT_K0XN0(a4, b, c4);

1878

#endif // M0 > 4

1879

#if M0 > 5

1880

ARM_DOT_K0XN0(a5, b, c5);

1881

#endif // M0 > 5

1882

#if M0 > 6

1883

ARM_DOT_K0XN0(a6, b, c6);

1884

#endif // M0 > 6

1885

#if M0 > 7

1886

ARM_DOT_K0XN0(a7, b, c7);

1887

#endif // M0 > 7

1888

1889

lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

1890

rhs_addr += (N0 * RHS_STEP_X * RHS_STEP_LOOP) * sizeof(DATA_TYPE);

1891

}

1892

1893

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

1894

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1895

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1896

Giorgio Arena

2021-09-07 14:15:28 +0100

[diff] [blame]

1897

const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);

1898

const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

1899

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1900

#if defined(REINTERPRET_OUTPUT_AS_3D)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1901

1902

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Michele Di Giorgio

2020-11-23 15:05:12 +0000

[diff] [blame]

1903

CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1904

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

1905

// multiply dst_stride_z by DEPTH_GEMM3D

1906

dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

1907

1908

#else // defined(REINTERPRET_OUTPUT_AS_3D)

1909

1910

// Add offset for batched GEMM

1911

dst_addr += get_global_id(2) * dst_stride_z;

1912

1913

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

1914

1915

// Multiply by the weight of matrix-matrix product and store the result

1916

#if defined(ALPHA)

Usama Arif

0681e3b

2019-04-25 14:28:07 +0100

[diff] [blame]

1917

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1918

#endif // defined(ALPHA)

1919

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1920

// Add beta*bias

1921

#if defined(BETA)

1922

#if defined(BROADCAST_BIAS)

1923

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

1924

Giorgio Arena

2021-09-07 14:15:28 +0100

[diff] [blame]

1925

LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1926

1927

#ifndef UNIT_BETA

1928

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

1929

#endif // UNIT_BIAS

1930

1931

// c = c + bias[broadcasted]

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1932

#if defined(MIXED_PRECISION)

1933

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

1934

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

1935

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1936

ADD_BLOCK_BROADCAST(M0, c, bias0);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1937

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1938

1939

#else // defined(BROADCAST_BIAS)

1940

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(

1941

2) * bias_stride_z;

1942

Giorgio Arena

2021-09-07 14:15:28 +0100

[diff] [blame]

1943

LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1944

1945

#ifndef UNIT_BETA

1946

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

1947

#endif // UNIT_BIAS

1948

1949

// c = c + bias

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1950

#if defined(MIXED_PRECISION)

1951

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

1952

ADD_BLOCK(M0, c, bias_hp);

1953

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1954

ADD_BLOCK(M0, c, bias);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1955

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1956

1957

#endif // defined(BROADCAST_BIAS)

1958

#endif // defined(BETA)

1959

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1960

#if defined(ACTIVATION_TYPE)

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

1961

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2022-11-03 09:30:56 +0000

[diff] [blame]

1962

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

1963

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2022-11-03 09:30:56 +0000

[diff] [blame]

1964

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);

Georgios Pinitas

2019-10-11 17:38:50 +0100

[diff] [blame]

1965

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-07-16 15:46:48 +0100

[diff] [blame]

1966

#endif // defined(ACTIVATION_TYPE)

1967

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1968

// Store output block

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1969

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

1970

CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

1971

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1972

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

1973

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2019-09-27 09:23:15 +0100

[diff] [blame]

1974

#endif // defined(MIXED_PRECISION)

Gian Marco Iodice

2019-06-14 16:11:10 +0100

[diff] [blame]

1975

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1976

#undef LHS_BLOCK_SIZE

1977

#undef LHS_OFFSET_X

1978

#undef LHS_STEP_X

1979

#undef RHS_BLOCK_SIZE

1980

#undef RHS_OFFSET_X

1981

#undef RHS_STEP_X

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

1982

#undef LHS_STEP_LOOP

1983

#undef RHS_STEP_LOOP

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

1984

}

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

1985

#endif // defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

1986

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

1987

#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be NOT transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be transposed
 *
 * @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=float)
 * @note The data type used for the accumulators must be passed at compile time using -DDATA_TYPE_ACCUMULATOR (e.g. -DDATA_TYPE_ACCUMULATOR=float)
 * @note The F16 computation also supports mixed precision through the option -DMIXED_PRECISION passed at compile time. If enabled, DATA_TYPE_ACCUMULATOR should be set to float
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at runtime as kernel parameters.
 * @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)
 *       Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT
 *       could be different from the value returned by get_image_height(rhs_img).
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 4, 8, 16
 *  - K0 = 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F32
 * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_img                            The RHS reshaped matrix as OpenCL image object. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem (in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem (in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
 * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
 * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 */
__kernel void gemm_mm_reshaped_lhs_nt_rhs_t_texture(IMAGE_DECLARATION(lhs),
                                                    __read_only image2d_t rhs_img,
#if defined(BETA)
                                                    IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                                    IMAGE_DECLARATION(dst),
                                                    uint lhs_stride_z,
                                                    uint rhs_stride_z,
#if defined(BETA)
                                                    uint bias_stride_z,
#endif // defined(BETA)
                                                    uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                                    ,
                                                    uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                                    ,
                                                    const int M,
                                                    const int N,
                                                    const int K)
{
    // Pixel unit
#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(K0)

    // LHS block size (in elements)
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (K0)
#define LHS_STEP_X ((K0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (K0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // RHS block size (in pixel units)
#define RHS_BLOCK_SIZE (PIXEL_UNIT * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (PIXEL_UNIT)
#define RHS_STEP_X (PIXEL_UNIT * (H0))
#define RHS_STEP_LOOP (1)
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X PIXEL_UNIT
#define RHS_STEP_LOOP (H0)
#endif // defined(RHS_INTERLEAVE)

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose first output column/row is at or beyond N/M have nothing to compute
    if((get_global_id(0) * N0 >= N) || (get_global_id(1) * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (get_global_id(1) % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (get_global_id(1) / V0) * (uint)lhs_stride_y +
                               (get_global_id(2) * lhs_stride_z);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    const uint z_rhs = (get_global_id(2) % MATRIX_B_DEPTH);
#else  // defined(MATRIX_B_DEPTH)
    const uint z_rhs = get_global_id(2);
#endif // defined(MATRIX_B_DEPTH)

    // Compute RHS matrix coordinates. The batch index is folded into the image row through RHS_HEIGHT
    uint       x_rhs = (get_global_id(0) % H0) * (uint)RHS_OFFSET_X;
    const uint y_rhs = (get_global_id(0) / (uint)H0) + z_rhs * RHS_HEIGHT;

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0); //uint zlhs0=0,zlhs1=0,zlhs2=0,... zlhs7=0;
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

    // Walk the K dimension in steps of K0
    for(int i = 0; i < K; i += K0)
    {
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_addr, 0, LHS_STEP_X * sizeof(DATA_TYPE), zlhs);

        // Load values from RHS matrix stored in a cl_image
        REPEAT_VAR_INIT_TO_CONST(N0, VEC_DATA_TYPE(DATA_TYPE, K0), b, 0);
        LOAD_TEXTURE2D(N0, PIXEL_UNIT, DATA_TYPE, b, rhs_img, x_rhs, y_rhs, RHS_STEP_X, 0);

        // Accumulate: one dot-product macro per LHS row (a0..a7) into the matching accumulator (c0..c7)
        ARM_DOT_K0XN0(a0, b, c0);
#if M0 > 1
        ARM_DOT_K0XN0(a1, b, c1);
#endif // M0 > 1
#if M0 > 2
        ARM_DOT_K0XN0(a2, b, c2);
#endif // M0 > 2
#if M0 > 3
        ARM_DOT_K0XN0(a3, b, c3);
#endif // M0 > 3
#if M0 > 4
        ARM_DOT_K0XN0(a4, b, c4);
#endif // M0 > 4
#if M0 > 5
        ARM_DOT_K0XN0(a5, b, c5);
#endif // M0 > 5
#if M0 > 6
        ARM_DOT_K0XN0(a6, b, c6);
#endif // M0 > 6
#if M0 > 7
        ARM_DOT_K0XN0(a7, b, c7);
#endif // M0 > 7

        // Advance the LHS pointer (bytes) and the RHS image x coordinate to the next K0 step
        lhs_addr += (M0 * LHS_STEP_X * LHS_STEP_LOOP) * sizeof(DATA_TYPE);

        x_rhs += N0 * RHS_STEP_X * RHS_STEP_LOOP;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

    // True when this work-item's M0xN0 block reaches or crosses the last row (cond_y) / column (cond_x);
    // forwarded to the boundary-aware load/store macros together with PARTIAL_STORE_M0/PARTIAL_STORE_N0
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zout) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, get_global_id(1) * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += get_global_id(2) * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += get_global_id(2) * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

    // Activation is applied after the (optional) scaling and bias addition above
#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // Store output block
#if defined(MIXED_PRECISION)
    // Accumulators are converted to DATA_TYPE before being written out
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif // defined(MIXED_PRECISION)

    // Release the kernel-local helper macros
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
#undef PIXEL_UNIT
#undef LHS_STEP_LOOP
#undef RHS_STEP_LOOP
}
#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_NT_RHS_T_TEXTURE)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2262

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2263

#if defined(LHS_TRANSPOSE)

2264

2265

// Shorthand for the vector type of SIZE elements of TYPE
#define VTYPE(TYPE, SIZE) VEC_DATA_TYPE(TYPE, SIZE)

// ARM_VFMA(N0, a, b, c): c = c + a * b.
// - With MIXED_PRECISION, a and b are first converted to an N0-wide vector of DATA_TYPE_ACCUMULATOR.
// - When GPU_ARCH equals GPU_ARCH_MIDGARD the multiply-add is written as "+=" and "*", otherwise fma() is used.
// Note: the expansion already ends with ';'.
#if defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))) * (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0)));
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((CONVERT(a, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (CONVERT(b, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0))), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#else // defined(MIXED_PRECISION)

#if(GPU_ARCH == GPU_ARCH_MIDGARD)
#define ARM_VFMA(N0, a, b, c) c += (a) * (b);
#else // GPU_ARCH == GPU_ARCH_MIDGARD
#define ARM_VFMA(N0, a, b, c) c = fma((a), (b), (c));
#endif // GPU_ARCH == GPU_ARCH_MIDGARD

#endif // defined(MIXED_PRECISION)

2284

2285

// Column-vector (a, transposed) by row-vector (b, not transposed) products with K0 = 1, for M0 = 1, 2, 3, 4, 8.
// Component i of a (a.s<i>) is replicated into an N0-wide vector of TYPE and multiply-accumulated
// with b into output row C##i. For M0 = 1, a is used directly as a scalar.
#define ARM_VVM_T_NT_1xN0x1(N0, TYPE, a, b, C)         \
    ({                                                 \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a), b, (C##0)); \
    })
#define ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s0), b, (C##0)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s1), b, (C##1)); \
    })
#define ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_2xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s2), b, (C##2)); \
    })
#define ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_3xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s3), b, (C##3)); \
    })
#define ARM_VVM_T_NT_8xN0x1(N0, TYPE, a, b, C)            \
    ({                                                    \
        ARM_VVM_T_NT_4xN0x1(N0, TYPE, a, b, C);           \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s4), b, (C##4)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s5), b, (C##5)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s6), b, (C##6)); \
        ARM_VFMA(N0, (VTYPE(TYPE, N0))(a.s7), b, (C##7)); \
    })

2312

2313

// Factory macro for the column-vector (transposed) by row-vector (not transposed) multiplication. K0 = 1
// It dispatches, by token pasting on M0, to the matching ARM_VVM_T_NT_<M0>xN0x1 macro.
// a is the column-vector (transposed)
// b is the row-vector (not transposed)
// C is the output matrix
// Lower case is a vector (a, b)
// Upper case is a matrix (C)
#define ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, a, b, C) ARM_VVM_T_NT_##M0##xN0x1(N0, TYPE, a, b, C)

2320

2321

#define ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C) \

2322

({ \

2323

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##0), (B##0), C); \

2324

})

2325

#define ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C) \

2326

({ \

2327

ARM_MM_T_NT_M0xN0x1(M0, N0, TYPE, A, B, C); \

2328

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##1), (B##1), C); \

2329

})

2330

#define ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C) \

2331

({ \

2332

ARM_MM_T_NT_M0xN0x2(M0, N0, TYPE, A, B, C); \

2333

ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##2), (B##2), C); \

2334

})

2335

/** Matrix (transposed) by matrix (not transposed) multiply-accumulate. K0 = 4
 *
 * Runs ARM_MM_T_NT_M0xN0x3 and then one more ARM_VVM_T_NT_M0xN0x1 step on A##3 and B##3.
 *
 * @param[in]      M0   Forwarded to the nested macros
 * @param[in]      N0   Forwarded to the nested macros
 * @param[in]      TYPE Forwarded to the nested macros
 * @param[in]      A    Basename of the LHS vectors; must be a bare identifier because it is token-pasted
 * @param[in]      B    Basename of the RHS vectors; must be a bare identifier because it is token-pasted
 * @param[in, out] C    Basename of the accumulators
 */
#define ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x3(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##3), (B##3), C); \
    })

2340

/** Matrix (transposed) by matrix (not transposed) multiply-accumulate. K0 = 8
 *
 * Runs ARM_MM_T_NT_M0xN0x4 and then four more ARM_VVM_T_NT_M0xN0x1 steps on the
 * vector pairs (A##4, B##4) .. (A##7, B##7).
 *
 * @param[in]      M0   Forwarded to the nested macros
 * @param[in]      N0   Forwarded to the nested macros
 * @param[in]      TYPE Forwarded to the nested macros
 * @param[in]      A    Basename of the LHS vectors; must be a bare identifier because it is token-pasted
 * @param[in]      B    Basename of the RHS vectors; must be a bare identifier because it is token-pasted
 * @param[in, out] C    Basename of the accumulators
 */
#define ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, A, B, C)             \
    ({                                                         \
        ARM_MM_T_NT_M0xN0x4(M0, N0, TYPE, A, B, C);            \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##4), (B##4), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##5), (B##5), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##6), (B##6), C); \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (A##7), (B##7), C); \
    })

2348

/** Matrix (transposed) by matrix (not transposed) multiply-accumulate. K0 = 16
 *
 * Runs ARM_MM_T_NT_M0xN0x8 and then eight more ARM_VVM_T_NT_M0xN0x1 steps on the
 * vector pairs with suffixes 8, 9, A, B, C, D, E and F.
 *
 * Steps 8..F invoke the vector-by-vector macro directly, like the K0 = 8 variant does.
 * Going through ARM_MM_T_NT_M0xN0x1 would paste "0" onto the already parenthesised
 * argument (e.g. "(a8)" ## "0"), which is not a valid token.
 *
 * The operand parameters are named LHS, RHS and DST rather than A, B and C on purpose:
 * a parameter named A, B or C would itself be substituted where it appears as the
 * hexadecimal suffix in LHS##A, RHS##B, LHS##C, ... and produce the wrong identifier.
 *
 * @param[in]      M0   Forwarded to the nested macros
 * @param[in]      N0   Forwarded to the nested macros
 * @param[in]      TYPE Forwarded to the nested macros
 * @param[in]      LHS  Basename of the LHS vectors; must be a bare identifier because it is token-pasted
 * @param[in]      RHS  Basename of the RHS vectors; must be a bare identifier because it is token-pasted
 * @param[in, out] DST  Basename of the accumulators
 */
#define ARM_MM_T_NT_M0xN0x16(M0, N0, TYPE, LHS, RHS, DST)             \
    ({                                                                \
        ARM_MM_T_NT_M0xN0x8(M0, N0, TYPE, LHS, RHS, DST);             \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##8), (RHS##8), DST);  \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##9), (RHS##9), DST);  \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##A), (RHS##A), DST);  \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##B), (RHS##B), DST);  \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##C), (RHS##C), DST);  \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##D), (RHS##D), DST);  \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##E), (RHS##E), DST);  \
        ARM_VVM_T_NT_M0xN0x1(M0, N0, TYPE, (LHS##F), (RHS##F), DST);  \
    })

2360

2361

// Factory macro for the matrix (transposed) by matrix (not transposed) multiplication.
// The dimensions for this matrix multiplications are defined through M0, N0 and K0
// The dimensions supported are:
// M0: 1, 2, 3, 4, 8
// N0: 1, 2, 3, 4, 8, 16
// K0: 1, 2, 3, 4, 8, 16
// This macro calls the vector-by-matrix macro K0 times
// A, B and C are matrices
// K0 goes through CONCAT so that it is macro-expanded before being appended to
// ARM_MM_T_NT_M0xN0x; the resulting macro is then invoked with the remaining arguments.
#define ARM_MM_T_NT(M0, N0, K0, TYPE, A, B, C) \
    CONCAT(ARM_MM_T_NT_M0xN0x, K0)             \
    (M0, N0, TYPE, A, B, C)

2372

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

2373

#if defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)
/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed
 *  The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed
 *
 * @note This kernel is only compiled when -DGEMM_MM_RESHAPED_LHS_T_RHS_NT is passed at compile time.
 * @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M, N and K) must be passed at runtime as kernel parameters.
 * @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).
 * @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)
 * @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)
 * @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.
 * @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 2, 3, 4, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *  - V0 >= 1
 *  - H0 >= 1
 *
 * @note The accumulators are declared with DATA_TYPE_ACCUMULATOR. If -DMIXED_PRECISION is passed at compile time, they are converted to DATA_TYPE before being stored.
 * @note If -DALPHA is passed at compile time, the accumulators are scaled by ALPHA before the bias addition.
 * @note If -DBETA is passed at compile time, the bias is added to the accumulators. The bias is scaled by BETA unless -DUNIT_BETA is passed.
 *       If -DBROADCAST_BIAS is passed, a single bias row is loaded and added to every row of the output block.
 * @note If -DMATRIX_B_DEPTH is passed at compile time, the RHS batch index is computed as (z % MATRIX_B_DEPTH).
 * @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS reshaped matrix. Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS reshaped matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS reshaped matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS reshaped matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS reshaped matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS reshaped matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS reshaped matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS reshaped matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
 * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
 * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 */
__kernel void gemm_mm_reshaped_lhs_t_rhs_nt(IMAGE_DECLARATION(lhs),
                                            IMAGE_DECLARATION(rhs),
#if defined(BETA)
                                            IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                                            IMAGE_DECLARATION(dst),
                                            uint lhs_stride_z,
                                            uint rhs_stride_z,
#if defined(BETA)
                                            uint bias_stride_z,
#endif //defined(BETA)
                                            uint dst_stride_z
#if defined(REINTERPRET_OUTPUT_AS_3D)
                                            ,
                                            uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                                            ,
                                            const int M,
                                            const int N,
                                            const int K)
{
    // LHS block size (in elements)
#define LHS_BLOCK_SIZE ((K0) * (M0))

    // LHS offset and step X
#if defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (M0)
#define LHS_STEP_X ((M0) * (V0))
#define LHS_STEP_LOOP (1)
#else // defined(LHS_INTERLEAVE)
#define LHS_OFFSET_X (LHS_BLOCK_SIZE)
#define LHS_STEP_X (M0)
#define LHS_STEP_LOOP (V0)
#endif // defined(LHS_INTERLEAVE)

    // RHS block size (in elements)
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X
#if defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (N0)
#define RHS_STEP_X ((N0) * (H0))
#else // defined(RHS_INTERLEAVE)
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)
#define RHS_STEP_X (N0)
#endif // defined(RHS_INTERLEAVE)

    const uint x = get_global_id(0);
    const uint y = get_global_id(1);
    const uint z = get_global_id(2);

    // True when this work-item handles the last block along y / x; forwarded to the boundary-aware load/store macros
    const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);
    const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the MxN output have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    __global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

    // Compute RHS matrix address
    __global uchar *rhs_addr = rhs_ptr + rhs_offset_first_element_in_bytes + (x % H0) * (uint)RHS_OFFSET_X * sizeof(DATA_TYPE) + (x / (uint)H0) * rhs_stride_y;

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_addr += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_addr += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

    // Zero offsets passed to LOAD_BLOCK_BOUNDARY_AWARE when loading the bias
    REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

    __global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);
    __global DATA_TYPE *rhs = (__global DATA_TYPE *)(rhs_addr);

    for(int i = 0; i < K; i += K0)
    {
        VEC_DATA_TYPE(DATA_TYPE, M0)
        a0;
        VEC_DATA_TYPE(DATA_TYPE, N0)
        b0;

        // Each step below loads M0 LHS values and N0 RHS values, accumulates them into c
        // with a K0 = 1 multiplication, and advances both pointers. The steps are unrolled K0 times.
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

#if K0 > 1
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;
#endif // K0 > 1

#if K0 > 2
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;
#endif // K0 > 2

#if K0 > 3
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;
#endif // K0 > 3

#if K0 > 4
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;
#endif // K0 > 4

#if K0 > 8
        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;

        a0 = VLOAD(M0)(0, lhs);
        b0 = VLOAD(N0)(0, rhs);

        ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

        lhs += LHS_STEP_X;
        rhs += RHS_STEP_X;
#endif // K0 > 8

#ifndef LHS_INTERLEAVE
        // Not interleaved: advance past the remaining (V0 - 1) blocks of M0 * K0 elements
        lhs += (M0 * K0 * (V0 - 1));
#endif // LHS_INTERLEAVE

#ifndef RHS_INTERLEAVE
        // Not interleaved: advance past the remaining (H0 - 1) blocks of N0 * K0 elements
        rhs += (N0 * K0 * (H0 - 1));
#endif // RHS_INTERLEAVE
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)

    // The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);
    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK_BROADCAST(M0, c, bias_hp0);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK_BROADCAST(M0, c, bias0);
#endif // defined(MIXED_PRECISION)

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (get_global_id(0) * (uint)N0 * sizeof(DATA_TYPE)) + (get_global_id(1) * (uint)M0 * bias_stride_y) + get_global_id(2) * bias_stride_z;

    LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);
    ADD_BLOCK(M0, c, bias_hp);
#else  // defined(MIXED_PRECISION)
    ADD_BLOCK(M0, c, bias);
#endif // defined(MIXED_PRECISION)

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
#if defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);
#else  // defined(MIXED_PRECISION)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(MIXED_PRECISION)
#endif // defined(ACTIVATION_TYPE)

    // Store output block
#if defined(MIXED_PRECISION)
    CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#else  // defined(MIXED_PRECISION)
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
#endif // defined(MIXED_PRECISION)

    // Release every helper macro defined at the top of this kernel so none leaks into later code
#undef LHS_BLOCK_SIZE
#undef LHS_OFFSET_X
#undef LHS_STEP_X
#undef LHS_STEP_LOOP
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
#undef RHS_STEP_X
}
#endif // defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT)

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

2758

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

2759

#if defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2760

/** This OpenCL kernel computes the matrix multiplication between 2 matrices. The RHS matrix is stored in OpenCL image object.

2761

* The LHS matrix must be reshaped with @ref CLGEMMReshapeLHSMatrixKernel and the M0xK0 must be transposed

2762

* The RHS matrix must be reshaped with @ref CLGEMMReshapeRHSMatrixKernel and the K0xN0 must be NOT transposed

2763

*

2764

* @note -DOPENCL_IMAGE_SUPPORT must be passed at compile time in order to compile this OpenCL kernel

2765

* @note LHS_TRANSPOSE should be passed at compile time in order to compile this OpenCL kernel (e.g. -DLHS_TRANSPOSE).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2766

* @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

2767

* @note The GEMM's dimensions M, N and K must be passed at runtime.

Gian Marco Iodice

781cba7

2020-06-19 16:56:57 +0100

[diff] [blame]

2768

* @note The height of the RHS matrix, defined before creating the OpenCL image object from the OpenCL buffer, should be passed at compile time using -DRHS_HEIGHT=<value> (e.g. -DRHS_HEIGHT=32)

2769

* Since we cannot create a 3d image from a buffer, the third dimension could be collapsed with the second dimension so RHS_HEIGHT

2770

* could be different from the value returned by get_image_height(rhs_img).

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2771

* @note The block's dimensions used for reshaping the LHS matrix and the RHS matrix (M0, N0 and K0) must be passed at compile time using -DM0, -DN0 and -DK0 (e.g. -DM0=4, -DN0=8, -DK0=4).

2772

* @note The number of M0xK0 vertical blocks stored on the same output row of the reshaped LHS matrix must be passed at compile time using -DV0 (e.g. -DV0=2)

2773

* @note The number of K0xN0 horizontal blocks stored on the same output row of the reshaped RHS matrix must be passed at compile time using -DH0 (e.g. -DH0=2)

2774

* @note If the M0xK0 blocks in the reshaped LHS matrix have been interleaved, the option -DLHS_INTERLEAVE must be passed at compile time.

2775

* @note If the K0xN0 blocks in the reshaped RHS matrix have been interleaved, the option -DRHS_INTERLEAVE must be passed at compile time.

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

2776

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

2777

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2778

* @note Only the following configurations of M0, N0 and K0 are currently supported:

* - M0 = 2, 3, 4, 8

* - N0 = 4, 8, 16

* - K0 = 4, 8, 16

* - V0 >= 1

* - H0 >= 1

*

* @note If the activation type were passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.

2786

* The activation function is performed after the bias addition

2787

* @note In case the output has to be reinterpreted as a 3D tensor (e.g. output of convolution layer), the following information must be passed at compile time:

2788

* -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D

2789

* -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.

2790

* -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor

2791

* (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix NOT reshaped

2792

*

2793

* @param[in] lhs_ptr Pointer to the LHS reshaped matrix. Supported data type: F32

2794

* @param[in] lhs_stride_x Stride of the LHS reshaped matrix in X dimension (in bytes)

2795

* @param[in] lhs_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

2796

* @param[in] lhs_stride_y Stride of the LHS reshaped matrix in Y dimension (in bytes)

2797

* @param[in] lhs_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

2798

* @param[in] lhs_offset_first_element_in_bytes The offset of the first element in the LHS reshaped matrix

2799

* @param[in] rhs_img The RHS reshaped matrix as cl_image 2d. Supported data type: same as @p lhs_ptr

2800

* @param[in] bias_ptr (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr

2801

* @param[in] bias_stride_x (Optional) Stride of the bias matrix in X dimension (in bytes)

2802

* @param[in] bias_step_x (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)

2803

* @param[in] bias_stride_y (Optional) Stride of the bias matrix in Y dimension (in bytes)

2804

* @param[in] bias_step_y (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)

2805

* @param[in] bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix

2806

* @param[out] dst_ptr Pointer to the destination matrix Supported data type: same as @p lhs_ptr

2807

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

2808

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

2809

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

2810

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

2811

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

2812

* @param[in] lhs_stride_z Stride of the LHS reshaped matrix in Z dimension (in bytes)

2813

* @param[in] rhs_stride_z Stride of the RHS reshaped matrix in Z dimension (in bytes)

2814

* @param[in] bias_stride_z (Optional) Stride of the bias matrix in Z dimension (in bytes)

2815

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

2816

* @param[in] dst_cross_plane_pad (Optional) Bottom paddings in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

2817

* @param[in] M Number of rows in LHS matrix not reshaped.

2818

* @param[in] N Number of columns in RHS matrix not reshaped.

2819

* @param[in] K Number of columns in LHS matrix and rows in RHS matrix not reshaped.

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2820

*/

2821

__kernel void gemm_mm_reshaped_lhs_t_rhs_nt_texture(IMAGE_DECLARATION(lhs),

2822

__read_only image2d_t rhs_img,

2823

#if defined(BETA)

2824

IMAGE_DECLARATION(bias),

2825

#endif // defined(BETA)

2826

IMAGE_DECLARATION(dst),

uint lhs_stride_z,

uint rhs_stride_z,

#if defined(BETA)

uint bias_stride_z,

#endif //defined(BETA)

2832

uint dst_stride_z

2833

#if defined(REINTERPRET_OUTPUT_AS_3D)

2834

,

2835

uint dst_cross_plane_pad

2836

#endif // REINTERPRET_OUTPUT_AS_3D

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

,

const int M,

const int N,

const int K)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

2841

{

2842

// Pixel unit

2843

#define PIXEL_UNIT CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0)

2844

2845

// Block size

2846

#define LHS_BLOCK_SIZE ((K0) * (M0))

2847

2848

#if defined(LHS_INTERLEAVE)

2849

#define LHS_OFFSET_X (M0)

2850

#define LHS_STEP_X ((M0) * (V0))

2851

#define LHS_STEP_LOOP (1)

2852

#else // defined(INTERLEAVE)

2853

#define LHS_OFFSET_X (LHS_BLOCK_SIZE)

2854

#define LHS_STEP_X (M0)

2855

#define LHS_STEP_LOOP (V0)

2856

#endif // defined(INTERLEAVE)

2857

2858

// Block size

2859

#define RHS_BLOCK_SIZE ((K0) * (PIXEL_UNIT))

2860

2861

// RHS offset and step X

2862

#if defined(RHS_INTERLEAVE)

2863

#define RHS_OFFSET_X (PIXEL_UNIT)

2864

#define RHS_STEP_X ((PIXEL_UNIT) * (H0))

2865

#else // defined(RHS_INTERLEAVE)

2866

#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

2867

#define RHS_STEP_X (PIXEL_UNIT)

2868

#endif // defined(RHS_INTERLEAVE)

2869

2870

const uint x = get_global_id(0);

2871

const uint y = get_global_id(1);

2872

const uint z = get_global_id(2);

2873

2874

#if defined(DUMMY_WORK_ITEMS)

2875

if((x * N0 >= N) || (y * M0 >= M))

{

return;

}

#endif // defined(DUMMY_WORK_ITEMS)

2880

2881

// Compute LHS matrix address

2882

__global uchar *lhs_addr = lhs_ptr + lhs_offset_first_element_in_bytes + (y % V0) * (uint)LHS_OFFSET_X * sizeof(DATA_TYPE) + (y / V0) * (uint)lhs_stride_y + (z * lhs_stride_z);

2883

2884

#if defined(MATRIX_B_DEPTH)

2885

// Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3

2886

const uint z_rhs = (z % MATRIX_B_DEPTH);

2887

#else // defined(MATRIX_B_DEPTH)

2888

const uint z_rhs = z;

2889

#endif // defined(MATRIX_B_DEPTH)

2890

2891

// Compute RHS matrix coordinates

2892

uint x_rhs = (x % H0) * (uint)RHS_OFFSET_X;

2893

const uint y_rhs = (x / (uint)H0) + z_rhs * RHS_HEIGHT;

2894

2895

// Initialize the accumulators

2896

REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE_ACCUMULATOR, N0), c, 0);

2897

2898

REPEAT_VAR_INIT_TO_CONST(M0, uint, zero, 0);

2899

2900

__global DATA_TYPE *lhs = (__global DATA_TYPE *)(lhs_addr);

2901

2902

for(int i = 0; i < K; i += K0)

2903

{

2904

VEC_DATA_TYPE(DATA_TYPE, M0)

2905

a0;

2906

VEC_DATA_TYPE(DATA_TYPE, N0)

2907

b0;

2908

2909

a0 = VLOAD(M0)(0, lhs);

2910

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 0 * RHS_STEP_X), (y_rhs));

2911

2912

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#if K0 > 1

a0 = VLOAD(M0)(0, lhs);

2918

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 1 * RHS_STEP_X), (y_rhs));

2919

2920

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 1

#if K0 > 2

a0 = VLOAD(M0)(0, lhs);

2927

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 2 * RHS_STEP_X), (y_rhs));

2928

2929

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 2

#if K0 > 3

a0 = VLOAD(M0)(0, lhs);

2936

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 3 * RHS_STEP_X), (y_rhs));

2937

2938

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 3

#if K0 > 4

a0 = VLOAD(M0)(0, lhs);

2945

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 4 * RHS_STEP_X), (y_rhs));

2946

2947

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2952

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 5 * RHS_STEP_X), (y_rhs));

2953

2954

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2959

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 6 * RHS_STEP_X), (y_rhs));

2960

2961

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2966

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 7 * RHS_STEP_X), (y_rhs));

2967

2968

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 4

#if K0 > 8

a0 = VLOAD(M0)(0, lhs);

2975

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 8 * RHS_STEP_X), (y_rhs));

2976

2977

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2982

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 9 * RHS_STEP_X), (y_rhs));

2983

2984

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2989

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 10 * RHS_STEP_X), (y_rhs));

2990

2991

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

2996

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 11 * RHS_STEP_X), (y_rhs));

2997

2998

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3003

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 12 * RHS_STEP_X), (y_rhs));

3004

3005

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3010

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 13 * RHS_STEP_X), (y_rhs));

3011

3012

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3017

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 14 * RHS_STEP_X), (y_rhs));

3018

3019

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

a0 = VLOAD(M0)(0, lhs);

3024

b0 = READ_IMAGE2D(DATA_TYPE, PIXEL_UNIT, rhs_img, (x_rhs + 15 * RHS_STEP_X), (y_rhs));

3025

3026

ARM_MM_T_NT(M0, N0, 1, DATA_TYPE, a, b, c);

lhs += LHS_STEP_X;

#endif // K0 > 8

#ifndef LHS_INTERLEAVE

3032

lhs += (M0 * K0 * (V0 - 1));

3033

#endif // LHS_INTERLEAVE

3034

3035

x_rhs += K0 * RHS_STEP_X;

3036

#ifndef RHS_INTERLEAVE

3037

x_rhs += (PIXEL_UNIT * K0 * (H0 - 1));

3038

#endif // RHS_INTERLEAVE

3039

}

3040

3041

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * dst_stride_y);

3042

3043

REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

3044

Giorgio Arena

2021-09-07 14:15:28 +0100

[diff] [blame]

3045

const bool cond_y = ((get_global_id(1) + 1) * M0 >= M);

3046

const bool cond_x = ((get_global_id(0) + 1) * N0 >= N);

3047

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3048

#if defined(REINTERPRET_OUTPUT_AS_3D)

3049

3050

// The plane (zin) is calculated dividing M (y * M0) by HEIGHT_GEMM3D

Michele Di Giorgio

2020-11-23 15:05:12 +0000

[diff] [blame]

3051

CALCULATE_Z_OFFSET(M0, uint, zout, y * (uint)M0, HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3052

// Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we

3053

// multiply dst_stride_z by DEPTH_GEMM3D

3054

dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

3055

3056

#else // defined(REINTERPRET_OUTPUT_AS_3D)

3057

3058

// Add offset for batched GEMM

3059

dst_addr += z * dst_stride_z;

3060

3061

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

3062

3063

// Multiply by the weight of matrix-matrix product and store the result

3064

#if defined(ALPHA)

3065

SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);

3066

#endif // defined(ALPHA)

// Add beta*bias

#if defined(BETA)

#if defined(BROADCAST_BIAS)

3071

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

3072

Giorgio Arena

2021-09-07 14:15:28 +0100

[diff] [blame]

3073

LOAD_BLOCK_BOUNDARY_AWARE(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, 1, PARTIAL_STORE_N0, false, cond_x);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3074

3075

#ifndef UNIT_BETA

3076

SCALE_BLOCK(1, DATA_TYPE, bias, BETA);

3077

#endif // UNIT_BETA

3078

3079

// c = c + bias[broadcasted]

3080

#if defined(MIXED_PRECISION)

3081

CONVERT_BLOCK(1, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3082

ADD_BLOCK_BROADCAST(M0, c, bias_hp0);

3083

#else // defined(MIXED_PRECISION)

3084

ADD_BLOCK_BROADCAST(M0, c, bias0);

3085

#endif // defined(MIXED_PRECISION)

3086

3087

#else // defined(BROADCAST_BIAS)

3088

__global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (y * (uint)M0 * bias_stride_y) + z * bias_stride_z;

3089

Giorgio Arena

2021-09-07 14:15:28 +0100

[diff] [blame]

3090

LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3091

3092

#ifndef UNIT_BETA

3093

SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);

3094

#endif // UNIT_BETA

3095

3096

#if defined(MIXED_PRECISION)

3097

CONVERT_BLOCK(M0, N0, DATA_TYPE_ACCUMULATOR, bias, bias_hp);

3098

ADD_BLOCK(M0, c, bias_hp);

3099

#else // defined(MIXED_PRECISION)

3100

ADD_BLOCK(M0, c, bias);

3101

#endif // defined(MIXED_PRECISION)

3102

3103

#endif // defined(BROADCAST_BIAS)

3104

#endif // defined(BETA)

3105

3106

#if defined(ACTIVATION_TYPE)

3107

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2022-11-03 09:30:56 +0000

[diff] [blame]

3108

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE_ACCUMULATOR, N0, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3109

#else // defined(MIXED_PRECISION)

Gian Marco Iodice

2022-11-03 09:30:56 +0000

[diff] [blame]

3110

ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3111

#endif // defined(MIXED_PRECISION)

3112

#endif // defined(ACTIVATION_TYPE)

3113

3114

// Store output block

3115

#if defined(MIXED_PRECISION)

Gian Marco Iodice

2020-08-11 14:14:06 +0100

[diff] [blame]

3116

CONVERT_BLOCK(M0, N0, DATA_TYPE, c, c_lp);

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

3117

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c_lp, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3118

#else // defined(MIXED_PRECISION)

Giorgio Arena

2020-10-15 17:39:41 +0100

[diff] [blame]

3119

STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3120

#endif // defined(MIXED_PRECISION)

3121

3122

#undef LHS_BLOCK_SIZE

3123

#undef LHS_OFFSET_X

3124

#undef LHS_STEP_X

3125

#undef RHS_BLOCK_SIZE

#undef RHS_OFFSET_X

#undef RHS_STEP_X

#undef PIXEL_UNIT

#undef LHS_STEP_LOOP

#undef RHS_STEP_LOOP

}

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

3132

#endif // defined(OPENCL_IMAGE_SUPPORT) && defined(GEMM_MM_RESHAPED_LHS_T_RHS_NT_TEXTURE)

Gian Marco Iodice

2020-06-10 17:59:30 +0100

[diff] [blame]

3133

Giorgio Arena

2019-08-01 14:22:12 +0100

[diff] [blame]

3134

#endif // defined(LHS_TRANSPOSE)

3135

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

3136

#endif // defined(M0) && defined(N0) && defined(K0) && defined(V0) && defined(H0) && defined(DATA_TYPE) && defined(DATA_TYPE_ACCUMULATOR)

Gian Marco Iodice

2018-12-12 10:18:04 +0000

[diff] [blame]

3137

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

3138

#if defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3139

3140

/** Fused multiply-add accumulation: c = fma(a, b, c).
 *
 * @param[in]      a First multiplicand
 * @param[in]      b Second multiplicand
 * @param[in, out] c Accumulator. It is read as the addend and overwritten with the result, so it must be an lvalue
 */
#define VFMA(a, b, c)     \
    ({                    \
        c = fma(a, b, c); \
    })

/** Accumulate one K-step of the product into the M0 accumulator rows.
 *
 * For every row r in [0, M0) this expands to:
 *   c##r = fma((VEC_DATA_TYPE(DATA_TYPE, N0))(a##r.s##i), b, c##r)
 * i.e. component i of the LHS row vector a##r is broadcast to an N0-wide vector,
 * multiplied by the RHS row b and accumulated into c##r.
 *
 * @param[in]      i Component selector appended to ".s" (0-9, A-F)
 * @param[in]      a Basename of the LHS row vectors (a0, a1, ...)
 * @param[in]      b RHS row vector with N0 elements
 * @param[in, out] c Basename of the accumulator row vectors (c0, c1, ...)
 *
 * One variant is provided per supported M0 (1 to 8); any other M0 is a compile-time error.
 */
#if M0 == 1
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
    })
#elif M0 == 2 // M0 == 2
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
    })
#elif M0 == 3 // M0 == 3
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
    })
#elif M0 == 4 // M0 == 4
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
    })
#elif M0 == 5 // M0 == 5
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
    })
#elif M0 == 6 // M0 == 6
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
    })
#elif M0 == 7 // M0 == 7
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
    })
#elif M0 == 8 // M0 == 8
#define RHS_VFMA_M0xN0(i, a, b, c)                                    \
    ({                                                                \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##0).s##i), b, (c##0)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##1).s##i), b, (c##1)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##2).s##i), b, (c##2)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##3).s##i), b, (c##3)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##4).s##i), b, (c##4)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##5).s##i), b, (c##5)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##6).s##i), b, (c##6)); \
        VFMA((VEC_DATA_TYPE(DATA_TYPE, N0))((a##7).s##i), b, (c##7)); \
    })
#else // M0 not supported
#error "M0 not supported"
#endif // M0 not supported

3216

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

3217

#if defined(GEMM_MM_NATIVE)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3218

/** This OpenCL kernel computes the matrix multiplication between 2 matrices.
 *  The LHS matrix is NOT reshaped
 *  The RHS matrix is NOT reshaped
 *
 * @note If the first two dimensions of NDRange have been dispatched with "dummy_work_items" support, the option -DDUMMY_WORK_ITEMS must be passed at compile time.
 * @note The GEMM's dimensions (M,N and K) must be passed at runtime as kernel parameters.
 * @note The number of M0 rows to process must be passed at compile time using -DM0 (e.g. -DM0=2)
 * @note The number of K0 partial accumulations must be passed at compile time using -DK0 (e.g., -DK0=2)
 * @note The number of N0 columns to process must be passed at compile time using -DN0 (e.g. -DN0=2)
 * @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)
 * @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)
 * @note Only the following configurations of M0, N0 and K0 are currently supported:
 *  - M0 = 1, 2, 3, 4, 5, 6, 7, 8
 *  - N0 = 2, 3, 4, 8, 16
 *  - K0 = 2, 3, 4, 8, 16
 *
 * @note If the activation type is passed at compile time through -DACTIVATION_TYPE (e.g. -DACTIVATION_TYPE=RELU), A, B variables, required by some activation functions, should be passed at compile time as well using -DA_VAL= and -DB_VAL= respectively.
 *       The activation function is performed after the bias addition
 * @note In case the input or output have to be reinterpreted as a 3D tensor, the following information must be passed at compile time:
 *       -# REINTERPRET_INPUT_AS_3D: To reinterpret the input as 3D
 *       -# REINTERPRET_OUTPUT_AS_3D: To reinterpret the output as 3D
 *       -# HEIGHT_GEMM3D: The height of the output in case it has to be reinterpreted as a 3D tensor.
 *       -# DEPTH_GEMM3D: The depth of the output in case it has to be reinterpreted as a 3D tensor
 *          (HEIGHT_GEMM3D * DEPTH_GEMM3D) = columns LHS matrix
 *
 * @param[in]  lhs_ptr                            Pointer to the LHS matrix. Supported data type: F16/F32
 * @param[in]  lhs_stride_x                       Stride of the LHS matrix in X dimension (in bytes)
 * @param[in]  lhs_step_x                         lhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  lhs_stride_y                       Stride of the LHS matrix in Y dimension (in bytes)
 * @param[in]  lhs_step_y                         lhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  lhs_offset_first_element_in_bytes  The offset of the first element in the LHS matrix
 * @param[in]  rhs_ptr                            Pointer to the RHS matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  rhs_stride_x                       Stride of the RHS matrix in X dimension (in bytes)
 * @param[in]  rhs_step_x                         rhs_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  rhs_stride_y                       Stride of the RHS matrix in Y dimension (in bytes)
 * @param[in]  rhs_step_y                         rhs_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  rhs_offset_first_element_in_bytes  The offset of the first element in the RHS matrix
 * @param[in]  bias_ptr                           (Optional) Pointer to the bias matrix. Supported data type: same as @p lhs_ptr
 * @param[in]  bias_stride_x                      (Optional) Stride of the bias matrix in X dimension (in bytes)
 * @param[in]  bias_step_x                        (Optional) bias_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  bias_stride_y                      (Optional) Stride of the bias matrix in Y dimension (in bytes)
 * @param[in]  bias_step_y                        (Optional) bias_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  bias_offset_first_element_in_bytes (Optional) The offset of the first element in the bias matrix
 * @param[out] dst_ptr                            Pointer to the destination matrix Supported data type: same as @p lhs_ptr
 * @param[in]  dst_stride_x                       Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                         dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                       Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                         dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes  The offset of the first element in the destination matrix
 * @param[in]  lhs_stride_z                       Stride of the LHS matrix in Z dimension (in bytes)
 * @param[in]  rhs_stride_z                       Stride of the RHS matrix in Z dimension (in bytes)
 * @param[in]  bias_stride_z                      (Optional) Stride of the bias matrix in Z dimension (in bytes)
 * @param[in]  dst_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  M                                  Number of rows in LHS matrix not reshaped.
 * @param[in]  N                                  Number of columns in RHS matrix not reshaped.
 * @param[in]  K                                  Number of columns in LHS matrix and rows in RHS matrix not reshaped.
 * @param[in]  lhs_cross_plane_pad                (Optional) Bottom paddings for LHS matrix in unit of elements (only if defined REINTERPRET_INPUT_AS_3D)
 * @param[in]  dst_cross_plane_pad                (Optional) Bottom paddings for the output matrix in unit of elements (only if defined REINTERPRET_OUTPUT_AS_3D)
 */
__kernel void gemm_mm_native(IMAGE_DECLARATION(lhs),
                             IMAGE_DECLARATION(rhs),
#if defined(BETA)
                             IMAGE_DECLARATION(bias),
#endif // defined(BETA)
                             IMAGE_DECLARATION(dst),
                             uint lhs_stride_z,
                             uint rhs_stride_z,
#if defined(BETA)
                             uint bias_stride_z,
#endif //defined(BETA)
                             uint dst_stride_z,
                             const int M,
                             const int N,
                             const int K
#if defined(REINTERPRET_INPUT_AS_3D)
                             ,
                             uint lhs_cross_plane_pad
#endif // REINTERPRET_INPUT_AS_3D
#if defined(REINTERPRET_OUTPUT_AS_3D)
                             ,
                             uint dst_cross_plane_pad
#endif // REINTERPRET_OUTPUT_AS_3D
                            )
{
    // Block size
#define RHS_BLOCK_SIZE ((K0) * (N0))

    // RHS offset and step X
#define RHS_OFFSET_X (RHS_BLOCK_SIZE)

    uint x = get_global_id(0);
    uint y = get_global_id(1);
    uint z = get_global_id(2);

#if defined(DUMMY_WORK_ITEMS)
    // Work-items whose block starts outside the N x M output have nothing to do
    if((x * N0 >= N) || (y * M0 >= M))
    {
        return;
    }
#endif // defined(DUMMY_WORK_ITEMS)

    // Compute LHS matrix address
    uint lhs_offset = lhs_offset_first_element_in_bytes + COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * (uint)lhs_stride_y;

    // Compute RHS matrix address
    uint rhs_offset = rhs_offset_first_element_in_bytes + x * N0 * sizeof(DATA_TYPE);

#if defined(MATRIX_B_DEPTH)
    // Do not slide matrix B if the matrix B has 3 dimensions and matrix A more than 3
    rhs_offset += (z % MATRIX_B_DEPTH) * rhs_stride_z;
#else  // defined(MATRIX_B_DEPTH)
    rhs_offset += z * rhs_stride_z;
#endif // defined(MATRIX_B_DEPTH)

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zlhs, 0);
    REPEAT_VAR_INIT_TO_CONST(16, uint, zero, 0);

#if defined(REINTERPRET_INPUT_AS_3D)
    // The plane (zlhs) is calculated dividing the start row of this block (COMPUTE_M0_START_ROW) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zlhs, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, lhs_cross_plane_pad, lhs_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply lhs_stride_z by DEPTH_GEMM3D
    lhs_offset += z * lhs_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_INPUT_AS_3D)

    // Add offset for batched GEMM
    lhs_offset += z * lhs_stride_z;

#endif // defined(REINTERPRET_INPUT_AS_3D)

    // Initialize the accumulators
    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), c, 0); //VEC_DATA_TYPE(DATA_TYPE, N0)    c0=0,c1=0,c2=0,... c(M0-1)=0;

    int i = 0;
    // When K0 == 1 the blocked loop is compiled out and the left-over loop below performs all K accumulations
#if K0 > 1
    for(; i <= (K - K0); i += K0)
    {
        // Supported cases (M0, K0):
        // 1,2 - 1,3 - 1,4 - 1,8 - 1,16
        // 2,2 - 2,3 - 2,4 - 2,8 - 2,16
        // 3,2 - 3,3 - 3,4 - 3,8 - 3,16
        // 4,2 - 4,3 - 4,4 - 4,8 - 4,16
        // 5,2 - 5,3 - 5,4 - 5,8 - 5,16
        // 6,2 - 6,3 - 6,4 - 6,8 - 6,16
        // 7,2 - 7,3 - 7,4 - 7,8 - 7,16
        // 8,2 - 8,3 - 8,4 - 8,8 - 8,16
        // Load values from LHS matrix
        LOAD_BLOCK(M0, K0, DATA_TYPE, a, lhs_ptr, lhs_offset, lhs_stride_y, zlhs);

        // Load values from RHS matrix
        LOAD_BLOCK(K0, N0, DATA_TYPE, b, rhs_ptr, rhs_offset, rhs_stride_y, zero);

        RHS_VFMA_M0xN0(0, a, b0, c);
        RHS_VFMA_M0xN0(1, a, b1, c);
#if K0 > 2
        RHS_VFMA_M0xN0(2, a, b2, c);
#endif // K0 > 2
#if K0 > 3
        RHS_VFMA_M0xN0(3, a, b3, c);
#endif // K0 > 3
#if K0 > 4
        RHS_VFMA_M0xN0(4, a, b4, c);
        RHS_VFMA_M0xN0(5, a, b5, c);
        RHS_VFMA_M0xN0(6, a, b6, c);
        RHS_VFMA_M0xN0(7, a, b7, c);
#endif // K0 > 4
#if K0 > 8
        RHS_VFMA_M0xN0(8, a, b8, c);
        RHS_VFMA_M0xN0(9, a, b9, c);
        RHS_VFMA_M0xN0(A, a, bA, c);
        RHS_VFMA_M0xN0(B, a, bB, c);
        RHS_VFMA_M0xN0(C, a, bC, c);
        RHS_VFMA_M0xN0(D, a, bD, c);
        RHS_VFMA_M0xN0(E, a, bE, c);
        RHS_VFMA_M0xN0(F, a, bF, c);
#endif // K0 > 8

        // Advance K0 columns in the LHS and K0 rows in the RHS
        lhs_offset += K0 * sizeof(DATA_TYPE);
        rhs_offset += K0 * rhs_stride_y;
    }
#endif // K0 > 1
    // Left-over accumulations
    for(; i < K; ++i)
    {
        // Load values from LHS matrix
        // Each scalar is stored in a 2-element vector because RHS_VFMA_M0xN0 reads it through the ".s0" component
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a0 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 0 * lhs_stride_y + zlhs0));
#if M0 > 1
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a1 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 1 * lhs_stride_y + zlhs1));
#endif // M0 > 1
#if M0 > 2
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a2 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 2 * lhs_stride_y + zlhs2));
#endif // M0 > 2
#if M0 > 3
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a3 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 3 * lhs_stride_y + zlhs3));
#endif // M0 > 3
#if M0 > 4
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a4 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 4 * lhs_stride_y + zlhs4));
#endif // M0 > 4
#if M0 > 5
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a5 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 5 * lhs_stride_y + zlhs5));
#endif // M0 > 5
#if M0 > 6
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a6 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 6 * lhs_stride_y + zlhs6));
#endif // M0 > 6
#if M0 > 7
        VEC_DATA_TYPE(DATA_TYPE, 2)
        a7 = *((__global DATA_TYPE *)(lhs_ptr + lhs_offset + 7 * lhs_stride_y + zlhs7));
#endif // M0 > 7

        VEC_DATA_TYPE(DATA_TYPE, N0)
        b = VLOAD(N0)(0, (__global DATA_TYPE *)(rhs_ptr + rhs_offset + 0 * rhs_stride_y));
        RHS_VFMA_M0xN0(0, a, b, c);

        // Advance one column in the LHS and one row in the RHS
        lhs_offset += sizeof(DATA_TYPE);
        rhs_offset += rhs_stride_y;
    }

    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * dst_stride_y);

    REPEAT_VAR_INIT_TO_CONST(M0, uint, zout, 0);

#if defined(REINTERPRET_OUTPUT_AS_3D)
    // The plane (zout) is calculated dividing the start row of this block (COMPUTE_M0_START_ROW) by HEIGHT_GEMM3D
    CALCULATE_Z_OFFSET(M0, uint, zout, COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0), HEIGHT_GEMM3D, DEPTH_GEMM3D, dst_cross_plane_pad, dst_stride_y);

    // Add offset for batched GEMM. The batches will be in the fourth dimension and for this reason we
    // multiply dst_stride_z by DEPTH_GEMM3D
    dst_addr += z * dst_stride_z * DEPTH_GEMM3D;

#else // defined(REINTERPRET_OUTPUT_AS_3D)

    // Add offset for batched GEMM
    dst_addr += z * dst_stride_z;

#endif // defined(REINTERPRET_OUTPUT_AS_3D)

    // Multiply by the weight of matrix-matrix product and store the result
#if defined(ALPHA)
    SCALE_BLOCK(M0, DATA_TYPE, c, ALPHA);
#endif // defined(ALPHA)

    // Add beta*bias
#if defined(BETA)
#if defined(BROADCAST_BIAS)
    // Same column offset as dst_addr: a single bias row is shared by all M0 rows
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE));

    LOAD_BLOCK(1, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(1, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias[broadcasted]
    ADD_BLOCK_BROADCAST(M0, c, bias0);

#else // defined(BROADCAST_BIAS)
    __global uchar *bias_addr = bias_ptr + bias_offset_first_element_in_bytes + (x * (uint)N0 * sizeof(DATA_TYPE)) + (COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) * bias_stride_y) + z * bias_stride_z;

    LOAD_BLOCK(M0, N0, DATA_TYPE, bias, bias_addr, 0, bias_stride_y, zero);

#ifndef UNIT_BETA
    SCALE_BLOCK(M0, DATA_TYPE, bias, BETA);
#endif // UNIT_BETA

    // c = c + bias
    ADD_BLOCK(M0, c, bias);

#endif // defined(BROADCAST_BIAS)
#endif // defined(BETA)

#if defined(ACTIVATION_TYPE)
    ACTIVATION_BLOCK(M0, ACTIVATION_TYPE, DATA_TYPE, N0, c, A_VAL, B_VAL);
#endif // defined(ACTIVATION_TYPE)

    // NOTE(review): presumably COMPUTE_M0_START_ROW arranges for the partial block in y to be handled by y == 0 - confirm against its definition
    const bool cond_y = y == 0;
    // True for the last block along x, which may hold fewer than N0 valid columns
    const bool cond_x = ((x + 1) * N0 >= N);

    // Store output block
    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);

    // Do not leak the kernel-local helper macros into the rest of the file
#undef RHS_BLOCK_SIZE
#undef RHS_OFFSET_X
}

ramelg01

2021-11-11 10:05:00 +0000

[diff] [blame]

3507

#endif // defined(GEMM_MM_NATIVE)

3508

#endif // defined(M0) && defined(N0) && defined(K0) && defined(DATA_TYPE)

giuros01

2019-04-01 13:50:22 +0100

[diff] [blame]

3509

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3510

#if defined(BETA)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3511

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:
 *
 * dst = dst + BETA * src, where each work-item processes 4 consecutive float values.
 *
 * @note The beta's value need to be passed at compile time using -DBETA
 *
 * @param[in]  src_ptr                           Pointer to the source matrix. Supported data types: F32
 * @param[in]  src_stride_x                      Stride of the source matrix in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source matrix in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source matrix
 * @param[out] dst_ptr                           Pointer to the destination matrix Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination matrix in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination matrix in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination matrix
 */
__kernel void gemm_ma_f32(TENSOR3D_DECLARATION(src),
                          TENSOR3D_DECLARATION(dst))
{
    // Compute source and destination addresses
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Load values from A x B (the destination already holds the matrix product)
    float4 alpha_ab = vload4(0, (__global float *)dst.ptr);

    // Load values from Matrix C
    float4 c = vload4(0, (__global float *)src.ptr);

    // Computes alpha * axb + beta * c
    float4 out = alpha_ab + (float4)BETA * c;

    // Store final result in axb matrix
    vstore4(out, 0, (__global float *)dst.ptr);
}

3551

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

3552

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3553

/** This OpenCL kernel performs the in-place matrix addition between 2 matrices taking into account that the second matrix might be weighted by a scalar value beta:

3554

*

Gian Marco

19835e5

2018-01-30 13:35:54 +0000

[diff] [blame]

3555

* @note The value of beta needs to be passed at compile time using -DBETA

Gian Marco Iodice

3a3066b

2017-06-23 13:38:14 +0100

[diff] [blame]

3556

*

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3557

* @param[in] src_ptr Pointer to the source matrix. Supported data types: F16

3558

* @param[in] src_stride_x Stride of the source matrix in X dimension (in bytes)

3559

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

3560

* @param[in] src_stride_y Stride of the source matrix in Y dimension (in bytes)

3561

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3562

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

3563

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3564

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source matrix

Gian Marco Iodice

3a3066b

2017-06-23 13:38:14 +0100

[diff] [blame]

3565

* @param[in,out] dst_ptr Pointer to the destination matrix, which is read and then overwritten in place. Supported data types: same as @p src_ptr

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3566

* @param[in] dst_stride_x Stride of the destination matrix in X dimension (in bytes)

3567

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

3568

* @param[in] dst_stride_y Stride of the destination matrix in Y dimension (in bytes)

3569

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3570

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

3571

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3572

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination matrix

3573

*/

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3574

__kernel void gemm_ma_f16(TENSOR3D_DECLARATION(src),

3575

TENSOR3D_DECLARATION(dst))

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3576

{

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3577

// Build Tensor3D views of src and dst from the kernel arguments (CONVERT_TO_TENSOR3D_STRUCT is defined elsewhere; presumably it offsets .ptr for the current work-item - confirm)

Isabella Gottardi

2018-03-01 16:42:00 +0000

[diff] [blame]

3578

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

3579

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3580

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3581

// Load 8 consecutive halfs of the current result (the A x B product) from dst, which is both read and written by this kernel

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3582

half8 alpha_ab = vload8(0, (__global half *)dst.ptr);

3583

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3584

// Load the 8 halfs of matrix C from src that will be added to the values loaded from dst

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3585

half8 c = vload8(0, (__global half *)src.ptr);

3586

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3587

// Compute alpha_ab + BETA * c, with the scalar BETA broadcast to all 8 lanes. NOTE(review): alpha is presumably already applied to the values in dst - confirm

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3588

half8 out = alpha_ab + (half8)BETA * c;

3589

Anton Lokhmotov

2017-11-20 11:02:10 +0000

[diff] [blame]

3590

// Write the result back in place, overwriting the 8 halfs previously read from dst

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

3591

vstore8(out, 0, (__global half *)dst.ptr);

3592

}

Vidhya Sudhan Loganathan

76c8564

2018-05-25 13:53:02 +0100

[diff] [blame]

3593

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

ramelg01