blob: d86ea064de1d9b884652df518c4bb0a176c6d618 [file] [log] [blame]
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001/*
SiCongLib88272e2021-02-24 15:40:57 +00002 * Copyright (c) 2017-2021 Arm Limited.
Anthony Barbier6ff3b192017-09-04 18:44:23 +01003 *
4 * SPDX-License-Identifier: MIT
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to
8 * deal in the Software without restriction, including without limitation the
9 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10 * sell copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in all
14 * copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 * SOFTWARE.
23 */
Michele Di Giorgio53832b22021-06-21 14:45:44 +010024#include "src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010025
Anthony Barbier6ff3b192017-09-04 18:44:23 +010026#include "arm_compute/core/Helpers.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010027#include "arm_compute/core/TensorInfo.h"
28#include "arm_compute/core/Types.h"
Anthony Barbier6ff3b192017-09-04 18:44:23 +010029#include "arm_compute/core/Validate.h"
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +000030#include "arm_compute/core/utils/misc/ShapeCalculator.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010031#include "src/core/CPP/Validate.h"
Sang-Hoon Park68dd25f2020-10-19 16:00:11 +010032#include "src/core/helpers/AutoConfiguration.h"
33#include "src/core/helpers/WindowHelpers.h"
34#include "src/core/utils/helpers/float_ops.h"
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +000035
Anthony Barbier6ff3b192017-09-04 18:44:23 +010036#include <arm_neon.h>
Anthony Barbier6ff3b192017-09-04 18:44:23 +010037
38namespace arm_compute
39{
Michele Di Giorgio53832b22021-06-21 14:45:44 +010040namespace cpu
41{
42namespace kernels
43{
Anthony Barbier6ff3b192017-09-04 18:44:23 +010044namespace
45{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Compute dst = alpha * (vector A x matrix B) for FP16 data.
 *
 * A is the 1-D lhs vector (length = lhs->dimension(0)), B is the non-reshaped rhs
 * matrix and dst is a 1-D row of width dst->dimension(0). The output x-range is
 * split across threads via @p info: each thread owns strided chunks of 32 output
 * elements (thread t starts at 32*t and advances by 32*num_threads).
 *
 * @param[in]  lhs    Input vector A (FP16).
 * @param[in]  rhs    Input matrix B (FP16, not reshaped).
 * @param[out] dst    Output row (FP16).
 * @param[in]  window Region over which to run the loop (collapsed to a single x/y point internally).
 * @param[in]  info   Thread info used to partition the x-range.
 * @param[in]  alpha  Scalar weight of the matrix product.
 */
void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
    const auto width_matrix_b  = static_cast<int>(dst->info()->dimension(0));
    // Stride between consecutive rows of B, expressed in elements (bytes / element size).
    const auto in_b_stride     = static_cast<int>(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size());
    const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0));

    // The implementation computes 32 elements per iteration
    const int window_start_x = 32 * info.thread_id;
    const int window_step_x  = 32 * info.num_threads;
    // Rounded UP so the span is an exact multiple of window_step_x; this means x can
    // run past width_matrix_b, which the in-loop guards below must catch.
    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
    ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be multiple of window_step_x");

    // Collapse the x/y dims of the output window: the whole row is produced in one lambda call.
    Window win_out(window);
    win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
    win_out.set(Window::DimY, Window::Dimension(0, 1, 1));

    Window win_a(window);
    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_a.set(Window::DimY, Window::Dimension(0, 0, 0));

    Window win_b;
    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
    if(rhs->info()->num_dimensions() >= 3)
    {
        win_b = window;
    }
    win_b.set(Window::DimX, Window::Dimension(0, 1, 1));
    win_b.set(Window::DimY, Window::Dimension(0, 1, 1));

    Iterator ina(lhs, win_a);
    Iterator inb(rhs, win_b);
    Iterator out(dst, win_out);

    // Skip the final scaling pass entirely when alpha == 1.
    const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));

    const float16x8_t alpha_f16 = vdupq_n_f16(alpha);

    execute_window_loop(win_out, [&](const Coordinates &)
    {
        int x = window_start_x;
        // Here we don't check for x lower equal than (window_end_x - window_step_x) because of
        // window_end_x is computed above which may cause out-of-bound writes to the dst.
        // Main loop: 32 outputs per iteration held in four 8-lane FP16 accumulators.
        for(; x < (window_end_x - window_step_x); x += window_step_x)
        {
            // window_end_x was rounded up, so bail out once x passes the real row width.
            if(x > width_matrix_b)
            {
                return;
            }

            auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;

            float16x8_t acc0 = vdupq_n_f16(0.f);
            float16x8_t acc1 = vdupq_n_f16(0.f);
            float16x8_t acc2 = vdupq_n_f16(0.f);
            float16x8_t acc3 = vdupq_n_f16(0.f);

            auto             vec_a          = reinterpret_cast<const float16_t *>(ina.ptr());
            const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
            // Unrolled over K: each iteration consumes 4 elements of A (2 + 2) and
            // the matching 4 rows of B, 32 columns wide.
            for(; vec_a <= (vec_a_end_addr - 4);)
            {
                const float16x4_t a0l = vld1_f16(vec_a);

                float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
                float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
                float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
                float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
                float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
                float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
                float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
                float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);

                acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
                acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
                acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
                acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
                acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
                acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
                acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
                acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));

                matrix_b += 2 * in_b_stride;

                b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
                b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
                b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
                b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
                b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
                b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
                b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
                b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);

                acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
                acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
                acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
                acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
                acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
                acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
                acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
                acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));

                vec_a += 4;
                matrix_b += 2 * in_b_stride;
            }

            // Scalar tail over the remaining (num_elems_vec_a % 4) elements of A.
            for(; vec_a < vec_a_end_addr; ++vec_a)
            {
                const float16_t   a0  = *vec_a;
                const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
                const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
                const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
                const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);

                acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
                acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
                acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
                acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));

                matrix_b += in_b_stride;
            }

            // Multiply by the weight of matrix product (alpha)
            if(multiply_alpha)
            {
                acc0 = vmulq_f16(acc0, alpha_f16);
                acc1 = vmulq_f16(acc1, alpha_f16);
                acc2 = vmulq_f16(acc2, alpha_f16);
                acc3 = vmulq_f16(acc3, alpha_f16);
            }

            auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;

            vst1q_f16(vec_out + 0, acc0);
            vst1q_f16(vec_out + 8, acc1);
            vst1q_f16(vec_out + 16, acc2);
            vst1q_f16(vec_out + 24, acc3);
        }

        // Leftover loop: one output element at a time for the tail of the row.
        for(; x < window_end_x; ++x)
        {
            if(x > width_matrix_b)
            {
                return;
            }

            auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;

            float16x4_t vacc = vdup_n_f16(0.f);

            auto             vec_a          = reinterpret_cast<const float16_t *>(ina.ptr());
            const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
            // Dot-product 4 elements of A against 4 rows of the current B column at a time.
            for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
            {
                const float16x4_t a0l = vld1_f16(vec_a);

                const float16x4_t b_col =
                {
                    *(matrix_b + 0 * in_b_stride),
                    *(matrix_b + 1 * in_b_stride),
                    *(matrix_b + 2 * in_b_stride),
                    *(matrix_b + 3 * in_b_stride),
                };

                vacc = vadd_f16(vacc, vmul_f16(a0l, b_col));

                matrix_b += 4 * in_b_stride;
            }

            // Horizontal reduction of the 4-lane accumulator.
            float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3);

            for(; vec_a < vec_a_end_addr; ++vec_a)
            {
                const float16_t a0  = *vec_a;
                const float16_t b00 = *matrix_b;

                acc += b00 * a0;

                matrix_b += in_b_stride;
            }

            // Multiply by the weight of matrix product (alpha)
            if(multiply_alpha)
            {
                acc *= static_cast<float16_t>(alpha);
            }

            auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;

            *(vec_out) = acc;
        }
    },
    ina, inb, out);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Pablo Tello221f3812017-06-28 17:27:56 +0100241
/** Compute dst = alpha * (vector A x matrix B) for FP32 data.
 *
 * A is the 1-D lhs vector (length = lhs->dimension(0)), B is the non-reshaped rhs
 * matrix and dst is a 1-D row of width dst->dimension(0). The output x-range is
 * split across threads via @p info: each thread owns strided chunks of 16 output
 * elements (thread t starts at 16*t and advances by 16*num_threads).
 *
 * @param[in]  lhs    Input vector A (FP32).
 * @param[in]  rhs    Input matrix B (FP32, not reshaped).
 * @param[out] dst    Output row (FP32).
 * @param[in]  window Region over which to run the loop (collapsed to a single x/y point internally).
 * @param[in]  info   Thread info used to partition the x-range.
 * @param[in]  alpha  Scalar weight of the matrix product.
 */
void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
    const auto width_matrix_b  = static_cast<int>(dst->info()->dimension(0));
    // Stride between consecutive rows of B, expressed in elements (bytes / element size).
    const auto in_b_stride     = static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()));
    const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0));

    // The implementation computes 16 elements per iteration
    const int window_start_x = 16 * info.thread_id;
    const int window_step_x  = 16 * info.num_threads;
    // Make sure (window_end_x - window_start_x) is a multiple of window_step_x
    // (rounded UP, so x can run past width_matrix_b; the in-loop guards catch that).
    const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

    // Collapse the x/y dims of the output window: the whole row is produced in one lambda call.
    Window win_out(window);
    win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
    win_out.set(Window::DimY, Window::Dimension(0, 1, 1));

    Window win_a(window);
    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_a.set(Window::DimY, Window::Dimension(0, 0, 0));

    Window win_b;
    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
    if(rhs->info()->num_dimensions() >= 3)
    {
        win_b = window;
    }
    win_b.set(Window::DimX, Window::Dimension(0, 1, 1));
    win_b.set(Window::DimY, Window::Dimension(0, 1, 1));

    Iterator ina(lhs, win_a);
    Iterator inb(rhs, win_b);
    Iterator out(dst, win_out);

    // Skip the final scaling pass entirely when alpha == 1.
    const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));

    const float32x4_t alpha_f32 = vdupq_n_f32(alpha);

    execute_window_loop(win_out, [&](const Coordinates &)
    {
        int x = window_start_x;
        // Here we don't check for x lower equal than (window_end_x - window_step_x) because of
        // window_end_x is computed above which may cause out-of-bound writes to the dst.
        // Main loop: 16 outputs per iteration held in four 4-lane FP32 accumulators.
        for(; x < (window_end_x - window_step_x); x += window_step_x)
        {
            // window_end_x was rounded up, so bail out once x passes the real row width.
            if(x > width_matrix_b)
            {
                return;
            }

            float32x4_t acc0 = vdupq_n_f32(0.f);
            float32x4_t acc1 = vdupq_n_f32(0.f);
            float32x4_t acc2 = vdupq_n_f32(0.f);
            float32x4_t acc3 = vdupq_n_f32(0.f);

            auto vec_a    = reinterpret_cast<const float *>(ina.ptr());
            auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;

#if __arm__
            // Software prefetch of A and the first two rows of B (AArch32 only).
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
#endif /* __arm__ */

            auto vec_a_end_addr = vec_a + num_elems_vec_a;
            // Unrolled over K: each iteration consumes 4 elements of A (2 + 2) and
            // the matching 4 rows of B, 16 columns wide.
            for(; vec_a <= (vec_a_end_addr - 4);)
            {
                float32x2_t a0l = vld1_f32(vec_a);

                float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
                float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
                float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
                float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);

                float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
                float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
                float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
                float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);

#if __arm__
                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
#endif /* __arm__ */

                acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
                acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
                acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
                acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);

                acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
                acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
                acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
                acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);

                vec_a += 2;
                matrix_b += 2 * in_b_stride;

                a0l = vld1_f32(vec_a);

                b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
                b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
                b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
                b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);

                b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
                b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
                b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
                b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);

                acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
                acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
                acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
                acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);

                acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
                acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
                acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
                acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);

                vec_a += 2;
                matrix_b += 2 * in_b_stride;
            }

            // Scalar tail over the remaining (num_elems_vec_a % 4) elements of A.
            for(; vec_a < vec_a_end_addr; ++vec_a)
            {
                const float a0 = *vec_a;

                const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
                const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
                const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
                const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);

                acc0 = vmlaq_n_f32(acc0, b00, a0);
                acc1 = vmlaq_n_f32(acc1, b01, a0);
                acc2 = vmlaq_n_f32(acc2, b02, a0);
                acc3 = vmlaq_n_f32(acc3, b03, a0);

                matrix_b += in_b_stride;
            }

            // Multiply by the weight of matrix product (alpha)
            if(multiply_alpha)
            {
                acc0 = vmulq_f32(acc0, alpha_f32);
                acc1 = vmulq_f32(acc1, alpha_f32);
                acc2 = vmulq_f32(acc2, alpha_f32);
                acc3 = vmulq_f32(acc3, alpha_f32);
            }

            const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;

            vst1q_f32(vec_out + 0, acc0);
            vst1q_f32(vec_out + 4, acc1);
            vst1q_f32(vec_out + 8, acc2);
            vst1q_f32(vec_out + 12, acc3);
        }

        // Left-over loop
        for(; x < window_end_x; ++x)
        {
            if(x > width_matrix_b)
            {
                return;
            }

            float32x4_t vacc = vdupq_n_f32(0.f);

            auto vec_a    = reinterpret_cast<const float *>(ina.ptr());
            auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;

#if __arm__
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
#endif /* __arm__ */

            auto vec_a_end_addr = vec_a + num_elems_vec_a;
            // Dot-product 4 elements of A against 4 rows of the current B column at a time.
            for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
            {
                const float32x4_t a0l = vld1q_f32(vec_a);

                const float32x4_t b_col =
                {
                    *(matrix_b + 0 * in_b_stride),
                    *(matrix_b + 1 * in_b_stride),
                    *(matrix_b + 2 * in_b_stride),
                    *(matrix_b + 3 * in_b_stride),
                };

#if __arm__
                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
#endif /* __arm__ */

                vacc = vmlaq_f32(vacc, b_col, a0l);

                matrix_b += 4 * in_b_stride;
            }

            // Horizontal reduction of the 4-lane accumulator.
            float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3);

            for(; vec_a < vec_a_end_addr; ++vec_a)
            {
                const float a0 = *vec_a;

                const float b00 = *matrix_b;

                acc += b00 * a0;

                matrix_b += in_b_stride;
            }

            // Multiply by the weight of matrix product (alpha)
            if(multiply_alpha)
            {
                acc *= alpha;
            }

            const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;

            *vec_out = acc;
        }
    },
    ina, inb, out);
}
473
Michele Di Giorgio53832b22021-06-21 14:45:44 +0100474void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100475{
Michele Di Giorgio53832b22021-06-21 14:45:44 +0100476 ARM_COMPUTE_UNUSED(info);
477 const int out_width = static_cast<int>(dst->info()->dimension(0));
478 const int out_height = static_cast<int>(dst->info()->dimension(1));
479 const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
480 const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100481 const size_t out_stride2 = out_stride1 * 2;
482 const size_t out_stride3 = out_stride1 * 3;
Michele Di Giorgio53832b22021-06-21 14:45:44 +0100483 const int num_elems_matrix_b_x = rhs->info()->dimension(0);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100484
Michele Di Giorgio53832b22021-06-21 14:45:44 +0100485 // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100486 Window win_a(window);
487 win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
488 win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));
489
490 Window win_b;
491 // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
492 // This scenario can happen when the the matrix multiplication is used to perform a convolution operation
Michele Di Giorgio53832b22021-06-21 14:45:44 +0100493 if(rhs->info()->num_dimensions() >= 3)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100494 {
495 win_b = window;
496 }
Michele Di Giorgio53832b22021-06-21 14:45:44 +0100497 // Set step_x and step_y for matrix B. Scale by a factor of 4 the X range as the input transposed matrix A has 4 times less the cols of the dst matrix
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100498 // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 4x4
499 win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride));
500 win_b.set(Window::DimY, Window::Dimension(0, 0, 0));
501
Michele Di Giorgio53832b22021-06-21 14:45:44 +0100502 Iterator ina(lhs, win_a);
503 Iterator inb(rhs, win_b);
504 Iterator out(dst, window);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100505
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +0100506 const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));
507
508 const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
509
Michele Di Giorgio53832b22021-06-21 14:45:44 +0100510 // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with CpuGemmInterleave4x4 and CpuGemmTranspose1xW
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100511 // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration
512 // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +0100513 execute_window_loop(window, [&](const Coordinates & id)
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100514 {
515 auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
516 auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
517 auto mtx_b1 = mtx_b0 + in_b_stride;
518
519 float32x4_t acc00 = vdupq_n_f32(0.f);
520 float32x4_t acc10 = vdupq_n_f32(0.f);
521 float32x4_t acc20 = vdupq_n_f32(0.f);
522 float32x4_t acc30 = vdupq_n_f32(0.f);
523
524 float32x4_t acc01 = vdupq_n_f32(0.f);
525 float32x4_t acc11 = vdupq_n_f32(0.f);
526 float32x4_t acc21 = vdupq_n_f32(0.f);
527 float32x4_t acc31 = vdupq_n_f32(0.f);
528
529#if __arm__
530 asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
531 asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
532 asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
Anthony Barbierac69aa12017-07-03 17:39:37 +0100533#endif /* __arm__ */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100534
535 auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
536 for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
537 {
538 float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
539 float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
540 float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
541 float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
542
543 float32x4_t b00 = vld1q_f32(mtx_b0);
544 float32x4_t b10 = vld1q_f32(mtx_b1);
545 float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
546 float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
547
548#if __arm__
549 asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
550 asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
551 asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
Anthony Barbierac69aa12017-07-03 17:39:37 +0100552#endif /* __arm__ */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100553
554 // 4x4 block 0
555 acc00 = vmlaq_f32(acc00, b00, a0);
556 acc10 = vmlaq_f32(acc10, b00, a1);
557 acc20 = vmlaq_f32(acc20, b00, a2);
558 acc30 = vmlaq_f32(acc30, b00, a3);
559
560 float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
561 float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
562 float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
563 float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
564
565 // 4x4 block 1
566 acc01 = vmlaq_f32(acc01, b10, a0);
567 acc11 = vmlaq_f32(acc11, b10, a1);
568 acc21 = vmlaq_f32(acc21, b10, a2);
569 acc31 = vmlaq_f32(acc31, b10, a3);
570
571 // 4x4 block 0
572 acc00 = vmlaq_f32(acc00, b01, a4);
573 acc10 = vmlaq_f32(acc10, b01, a5);
574 acc20 = vmlaq_f32(acc20, b01, a6);
575 acc30 = vmlaq_f32(acc30, b01, a7);
576
577 // 4x4 block 1
578 acc01 = vmlaq_f32(acc01, b11, a4);
579 acc11 = vmlaq_f32(acc11, b11, a5);
580 acc21 = vmlaq_f32(acc21, b11, a6);
581 acc31 = vmlaq_f32(acc31, b11, a7);
582
583 mtx_a0 += 8;
584 mtx_b0 += 8;
585 mtx_b1 += 8;
586
587 a0 = vld1q_dup_f32(mtx_a0 + 0);
588 a1 = vld1q_dup_f32(mtx_a0 + 1);
589 a2 = vld1q_dup_f32(mtx_a0 + 2);
590 a3 = vld1q_dup_f32(mtx_a0 + 3);
591
592 b00 = vld1q_f32(mtx_b0);
593 b10 = vld1q_f32(mtx_b1);
594 b01 = vld1q_f32(mtx_b0 + 4);
595 b11 = vld1q_f32(mtx_b1 + 4);
596
597 // 4x4 block 0
598 acc00 = vmlaq_f32(acc00, b00, a0);
599 acc10 = vmlaq_f32(acc10, b00, a1);
600 acc20 = vmlaq_f32(acc20, b00, a2);
601 acc30 = vmlaq_f32(acc30, b00, a3);
602
603 a4 = vld1q_dup_f32(mtx_a0 + 4);
604 a5 = vld1q_dup_f32(mtx_a0 + 5);
605 a6 = vld1q_dup_f32(mtx_a0 + 6);
606 a7 = vld1q_dup_f32(mtx_a0 + 7);
607
608 // 4x4 block 1
609 acc01 = vmlaq_f32(acc01, b10, a0);
610 acc11 = vmlaq_f32(acc11, b10, a1);
611 acc21 = vmlaq_f32(acc21, b10, a2);
612 acc31 = vmlaq_f32(acc31, b10, a3);
613
614 // 4x4 block 0
615 acc00 = vmlaq_f32(acc00, b01, a4);
616 acc10 = vmlaq_f32(acc10, b01, a5);
617 acc20 = vmlaq_f32(acc20, b01, a6);
618 acc30 = vmlaq_f32(acc30, b01, a7);
619
620 // 4x4 block 1
621 acc01 = vmlaq_f32(acc01, b11, a4);
622 acc11 = vmlaq_f32(acc11, b11, a5);
623 acc21 = vmlaq_f32(acc21, b11, a6);
624 acc31 = vmlaq_f32(acc31, b11, a7);
625
626 mtx_a0 += 8;
627 mtx_b0 += 8;
628 mtx_b1 += 8;
629
630 a0 = vld1q_dup_f32(mtx_a0 + 0);
631 a1 = vld1q_dup_f32(mtx_a0 + 1);
632 a2 = vld1q_dup_f32(mtx_a0 + 2);
633 a3 = vld1q_dup_f32(mtx_a0 + 3);
634 b00 = vld1q_f32(mtx_b0);
635 b10 = vld1q_f32(mtx_b1);
636 b01 = vld1q_f32(mtx_b0 + 4);
637 b11 = vld1q_f32(mtx_b1 + 4);
638
639#if __arm__
640 asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
641 asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
642 asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
Anthony Barbierac69aa12017-07-03 17:39:37 +0100643#endif /* __arm__ */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100644
645 // 4x4 block 0
646 acc00 = vmlaq_f32(acc00, b00, a0);
647 acc10 = vmlaq_f32(acc10, b00, a1);
648 acc20 = vmlaq_f32(acc20, b00, a2);
649 acc30 = vmlaq_f32(acc30, b00, a3);
650
651 a4 = vld1q_dup_f32(mtx_a0 + 4);
652 a5 = vld1q_dup_f32(mtx_a0 + 5);
653 a6 = vld1q_dup_f32(mtx_a0 + 6);
654 a7 = vld1q_dup_f32(mtx_a0 + 7);
655
656 // 4x4 block 1
657 acc01 = vmlaq_f32(acc01, b10, a0);
658 acc11 = vmlaq_f32(acc11, b10, a1);
659 acc21 = vmlaq_f32(acc21, b10, a2);
660 acc31 = vmlaq_f32(acc31, b10, a3);
661
662 // 4x4 block 0
663 acc00 = vmlaq_f32(acc00, b01, a4);
664 acc10 = vmlaq_f32(acc10, b01, a5);
665 acc20 = vmlaq_f32(acc20, b01, a6);
666 acc30 = vmlaq_f32(acc30, b01, a7);
667
668 // 4x4 block 1
669 acc01 = vmlaq_f32(acc01, b11, a4);
670 acc11 = vmlaq_f32(acc11, b11, a5);
671 acc21 = vmlaq_f32(acc21, b11, a6);
672 acc31 = vmlaq_f32(acc31, b11, a7);
673
674 mtx_a0 += 8;
675 mtx_b0 += 8;
676 mtx_b1 += 8;
677
678 a0 = vld1q_dup_f32(mtx_a0 + 0);
679 a1 = vld1q_dup_f32(mtx_a0 + 1);
680 a2 = vld1q_dup_f32(mtx_a0 + 2);
681 a3 = vld1q_dup_f32(mtx_a0 + 3);
682 b00 = vld1q_f32(mtx_b0);
683 b10 = vld1q_f32(mtx_b1);
684 b01 = vld1q_f32(mtx_b0 + 4);
685 b11 = vld1q_f32(mtx_b1 + 4);
686
687 // 4x4 block 0
688 acc00 = vmlaq_f32(acc00, b00, a0);
689 acc10 = vmlaq_f32(acc10, b00, a1);
690 acc20 = vmlaq_f32(acc20, b00, a2);
691 acc30 = vmlaq_f32(acc30, b00, a3);
692
693 a4 = vld1q_dup_f32(mtx_a0 + 4);
694 a5 = vld1q_dup_f32(mtx_a0 + 5);
695 a6 = vld1q_dup_f32(mtx_a0 + 6);
696 a7 = vld1q_dup_f32(mtx_a0 + 7);
697
698 // 4x4 block 1
699 acc01 = vmlaq_f32(acc01, b10, a0);
700 acc11 = vmlaq_f32(acc11, b10, a1);
701 acc21 = vmlaq_f32(acc21, b10, a2);
702 acc31 = vmlaq_f32(acc31, b10, a3);
703
704 // 4x4 block 0
705 acc00 = vmlaq_f32(acc00, b01, a4);
706 acc10 = vmlaq_f32(acc10, b01, a5);
707 acc20 = vmlaq_f32(acc20, b01, a6);
708 acc30 = vmlaq_f32(acc30, b01, a7);
709
710 // 4x4 block 1
711 acc01 = vmlaq_f32(acc01, b11, a4);
712 acc11 = vmlaq_f32(acc11, b11, a5);
713 acc21 = vmlaq_f32(acc21, b11, a6);
714 acc31 = vmlaq_f32(acc31, b11, a7);
715
716 mtx_a0 += 8;
717 mtx_b0 += 8;
718 mtx_b1 += 8;
719 }
720
721 for(; mtx_b0 < mtx_b0_end_addr;)
722 {
723 float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
724 float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
725 float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
726 float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
727 float32x4_t b00 = vld1q_f32(mtx_b0);
728 float32x4_t b10 = vld1q_f32(mtx_b1);
729
730#if __arm__
731 asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
732 asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
733 asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
Anthony Barbierac69aa12017-07-03 17:39:37 +0100734#endif /* __arm__ */
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100735 // 4x4 block 0
736 acc00 = vmlaq_f32(acc00, b00, a0);
737 acc10 = vmlaq_f32(acc10, b00, a1);
738 acc20 = vmlaq_f32(acc20, b00, a2);
739 acc30 = vmlaq_f32(acc30, b00, a3);
740
741 // 4x4 block 1
742 acc01 = vmlaq_f32(acc01, b10, a0);
743 acc11 = vmlaq_f32(acc11, b10, a1);
744 acc21 = vmlaq_f32(acc21, b10, a2);
745 acc31 = vmlaq_f32(acc31, b10, a3);
746
747 mtx_a0 += 4;
748 mtx_b0 += 4;
749 mtx_b1 += 4;
750 }
751
752 // Multiply by the weight of matrix product (alpha)
753 if(multiply_alpha)
754 {
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +0100755 acc00 = vmulq_f32(acc00, alpha_f32);
756 acc10 = vmulq_f32(acc10, alpha_f32);
757 acc20 = vmulq_f32(acc20, alpha_f32);
758 acc30 = vmulq_f32(acc30, alpha_f32);
759 acc01 = vmulq_f32(acc01, alpha_f32);
760 acc11 = vmulq_f32(acc11, alpha_f32);
761 acc21 = vmulq_f32(acc21, alpha_f32);
762 acc31 = vmulq_f32(acc31, alpha_f32);
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100763 }
764
765 const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
766 const auto mtx_out1 = mtx_out0 + 4;
767
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +0100768 if(id.x() < (out_width - 8))
769 {
770 vst1q_f32(mtx_out0, acc00);
771 vst1q_f32(mtx_out1, acc01);
772 if(id.y() + 1 < out_height)
773 {
774 vst1q_f32(mtx_out0 + out_stride1, acc10);
775 vst1q_f32(mtx_out1 + out_stride1, acc11);
776 if(id.y() + 2 < out_height)
777 {
778 vst1q_f32(mtx_out0 + out_stride2, acc20);
779 vst1q_f32(mtx_out1 + out_stride2, acc21);
780 if(id.y() + 3 < out_height)
781 {
782 vst1q_f32(mtx_out0 + out_stride3, acc30);
783 vst1q_f32(mtx_out1 + out_stride3, acc31);
784 }
785 }
786 }
787 }
788 else if(id.x() < (out_width - 4))
789 {
790 vst1q_f32(mtx_out0, acc00);
791 if(id.y() + 1 < out_height)
792 {
793 vst1q_f32(mtx_out0 + out_stride1, acc10);
794 if(id.y() + 2 < out_height)
795 {
796 vst1q_f32(mtx_out0 + out_stride2, acc20);
797 if(id.y() + 3 < out_height)
798 {
799 vst1q_f32(mtx_out0 + out_stride3, acc30);
800 }
801 }
802 }
803 // Left-over columns
804 const int columns_left = out_width - id.x() - 4;
805 for(auto x = 0; x < columns_left; ++x)
806 {
807 *(mtx_out1 + x) = acc01[x];
808 if(id.y() + 1 < out_height)
809 {
810 *(mtx_out1 + x + out_stride1) = acc11[x];
811 if(id.y() + 2 < out_height)
812 {
813 *(mtx_out1 + x + out_stride2) = acc21[x];
814 if(id.y() + 3 < out_height)
815 {
816 *(mtx_out1 + x + out_stride3) = acc31[x];
817 }
818 }
819 }
820 }
821 }
822 else
823 {
824 // Left-over columns
825 const int columns_left = out_width - id.x();
826 for(int x = 0; x < columns_left; ++x)
827 {
828 *(mtx_out0 + x) = acc00[x];
829 if(id.y() + 1 < out_height)
830 {
831 *(mtx_out0 + x + out_stride1) = acc10[x];
832 if(id.y() + 2 < out_height)
833 {
834 *(mtx_out0 + x + out_stride2) = acc20[x];
835 if(id.y() + 3 < out_height)
836 {
837 *(mtx_out0 + x + out_stride3) = acc30[x];
838 }
839 }
840 }
841 }
842 }
Anthony Barbier6ff3b192017-09-04 18:44:23 +0100843 },
844 ina, inb, out);
845}
846
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** FP16 matrix-matrix multiplication: dst = alpha * (lhs * rhs).
 *
 * Operates on reshaped inputs: @p lhs is the 4x4-interleaved Matrix A and @p rhs
 * is the 1x8-transposed Matrix B (see the in-body diagram). Each window step
 * produces a 4 (rows) x 8 (columns) block of @p dst.
 *
 * @param[in]  lhs    Interleaved input Matrix A.
 * @param[in]  rhs    Transposed input Matrix B.
 * @param[out] dst    Output tensor.
 * @param[in]  window Execution window (expected to step 8 along X and 4 along Y of dst).
 * @param[in]  info   Thread info (unused).
 * @param[in]  alpha  Scalar weight of the matrix product.
 */
void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
{
    ARM_COMPUTE_UNUSED(info);
    const int out_width  = static_cast<int>(dst->info()->dimension(0));
    const int out_height = static_cast<int>(dst->info()->dimension(1));
    // Row strides converted from bytes to number of fp16 elements.
    const size_t in_b_stride          = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
    const size_t out_stride           = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
    const int    num_elems_matrix_b_x = rhs->info()->dimension(0);

    // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix
    Window win_a(window);
    win_a.set(Window::DimX, Window::Dimension(0, 0, 0));
    win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1));

    Window win_b;
    // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
    // This scenario can happen when the matrix multiplication is used to perform a convolution operation
    if(rhs->info()->num_dimensions() >= 3)
    {
        win_b = window;
    }
    // Set step_x and step_y for matrix B. Scale by a factor of 8 the X range as the input transposed matrix B has 8 times less the cols of the dst matrix
    win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride));
    win_b.set(Window::DimY, Window::Dimension(0, 1, 0));

    Iterator ina(lhs, win_a);
    Iterator inb(rhs, win_b);
    Iterator out(dst, window);

    // Skip the final scaling pass entirely when alpha == 1.
    const bool multiply_alpha = !(helpers::float_ops::is_one(alpha));

    const float16x8_t alpha_f16 = vdupq_n_f16(alpha);

    execute_window_loop(window, [&](const Coordinates & id)
    {
        const auto *mtx_a0  = reinterpret_cast<const float16_t *>(ina.ptr());
        const auto *mtx_b0  = reinterpret_cast<const float16_t *>(inb.ptr());
        auto       *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
        // Accumulators for the 4x8 output block; c.val[r] holds output row r.
        float16x8x4_t c =
        {
            {
                vdupq_n_f16(0.f),
                vdupq_n_f16(0.f),
                vdupq_n_f16(0.f),
                vdupq_n_f16(0.f)
            }
        };

        /*
        This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
        |a00 a01 a02 a03 | a04 a05 a06 a07|
        |a10 a11 a12 a13 | a14 a15 a16 a17|
        |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ...
        |a30 a31 a32 a33 | a34 a35 a36 a37|   | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a16 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ...
        |a40 a41 a42 a43 | a44 a45 a46 a47|
        |a50 a51 a52 a53 | a54 a55 a56 a57|
        |a60 a61 a62 a63 | a64 a65 a66 a67|
        |a70 a71 a72 a73 | a74 a75 a76 a77|

        After this operation, the dst matrix will have the following shape: [ height * 4, width / 4 ]

        B Matrix has been transposed as shown below

        |b00 b01 b02 b03 b04 b05 b06 b07|
        |b10 b11 b12 b13 b14 b15 b16 b17|
        |b20 b21 b22 b23 b24 b25 b26 b27|
        |b30 b31 b32 b33 b34 b35 b36 b37|
        ------------------->

        |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37|

        c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30
        c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31

        The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size.
        */
        const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;

        // Main loop: 4 accumulation steps (k += 4) per iteration, consuming
        // 16 interleaved A values and 32 transposed B values each time.
        for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
        {
            // p00/p02: 16 interleaved A values (4 rows x 4 k-steps).
            const float16x8_t p00 = vld1q_f16(mtx_a0);
            const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);

            // q00..q06: 4 rows of 8 B columns.
            const float16x8_t q00 = vld1q_f16(mtx_b0);
            const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
            const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
            const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);

            // k-step 0: broadcast one A lane per output row against B row q00.
            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));

            // k-step 1
            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));

            // k-step 2
            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));

            // k-step 3
            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));

            mtx_a0 += 16;
            mtx_b0 += 32;
        }

        // Leftover loop: one accumulation step (k += 1) per iteration.
        for(; mtx_b0 < mtx_b0_end_addr;)
        {
            const float16x4_t p00 = vld1_f16(mtx_a0);
            const float16x8_t q00 = vld1q_f16(mtx_b0);

            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));

            mtx_a0 += 4;
            mtx_b0 += 8;
        }

        // Multiply by the weight of matrix product (alpha)
        if(multiply_alpha)
        {
            c.val[0] = vmulq_f16(c.val[0], alpha_f16);
            c.val[1] = vmulq_f16(c.val[1], alpha_f16);
            c.val[2] = vmulq_f16(c.val[2], alpha_f16);
            c.val[3] = vmulq_f16(c.val[3], alpha_f16);
        }

        // Store the 4x8 block, guarding against the right (X) and bottom (Y) edges of dst.
        if(id.x() < (out_width - 8))
        {
            // Full-width store: all 8 columns fit.
            vst1q_f16(mtx_out, c.val[0]);
            if(id.y() + 1 < out_height)
            {
                vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
                if(id.y() + 2 < out_height)
                {
                    vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
                    if(id.y() + 3 < out_height)
                    {
                        vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
                    }
                }
            }
        }
        else
        {
            // Left-over columns: scalar stores, one lane at a time.
            const int columns_left = out_width - id.x();
            for(int x = 0; x < columns_left; ++x)
            {
                *(mtx_out + x) = c.val[0][x];
                if(id.y() + 1 < out_height)
                {
                    *(mtx_out + x + 1 * out_stride) = c.val[1][x];
                    if(id.y() + 2 < out_height)
                    {
                        *(mtx_out + x + 2 * out_stride) = c.val[2][x];
                        if(id.y() + 3 < out_height)
                        {
                            *(mtx_out + x + 3 * out_stride) = c.val[3][x];
                        }
                    }
                }
            }
        }
    },
    ina, inb, out);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001025
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001026inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001027{
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001028 ARM_COMPUTE_UNUSED(alpha);
1029
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001030 ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs);
1031 ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
1032 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001033
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001034 if(!is_interleaved)
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001035 {
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001036 ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1));
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001037
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001038 if(dst->total_size() != 0)
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001039 {
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001040 ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0));
1041 ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1));
1042 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001043 }
1044 }
1045 else
1046 {
1047 const int m = reshape_info.m();
1048 const int n = reshape_info.n();
1049 const int k = reshape_info.k();
1050 const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width();
1051 const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
1052
1053 /* Interleave */
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001054 TensorShape tensor_shape0{ lhs->tensor_shape() };
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001055 tensor_shape0.set(0, k);
1056 tensor_shape0.set(1, m);
1057
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001058 const TensorInfo tensor_info0 = lhs->clone()->set_tensor_shape(tensor_shape0);
1059 const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
1060 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0);
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001061
1062 if(n != 0) /* Transpose */
1063 {
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001064 TensorShape tensor_shape1{ rhs->tensor_shape() };
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001065 tensor_shape1.set(0, n);
1066 tensor_shape1.set(1, k);
1067
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001068 const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1);
1069 const TensorInfo tensor_info_reshaped1 = rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
1070 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1);
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001071 }
1072
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001073 if(dst->total_size() != 0)
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001074 {
1075 if(n != 0)
1076 {
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001077 ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast<size_t>(n));
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001078 }
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001079 ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(1) != static_cast<size_t>(m));
1080 ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001081 }
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001082 }
1083
1084 return Status{};
1085}
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001086} // namespace
1087
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001088void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001089{
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001090 ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001091
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001092 // dst tensor auto inizialitation if not yet initialized
1093 TensorShape tensor_shape{ lhs->tensor_shape() };
1094 tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0));
1095 tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1));
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001096
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001097 auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(tensor_shape));
Ioan-Cristian Szabob4e3e1c2017-11-30 17:17:17 +00001098
1099 // Perform validate step
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001100 ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info));
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001101
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001102 _alpha = alpha;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001103
Giorgio Arena7c23ad02017-11-30 15:08:38 +00001104 // Configure kernel window
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001105 Window win{};
1106
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001107 // Check if the dst tensor is a vector. If so,the kernel runs the vector-matrix multiplication
1108 const bool is_dst_vector = (dst->dimension(1) == 1);
1109 if(is_dst_vector)
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001110 {
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001111 const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32;
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001112
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001113 win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x));
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001114 }
1115 else
1116 {
1117 constexpr unsigned int num_elems_processed_per_iteration_x = 8;
1118 constexpr unsigned int num_elems_processed_per_iteration_y = 4;
1119
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001120 win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001121 }
1122
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001123 switch(lhs->data_type())
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001124 {
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001125 case DataType::F32:
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001126 {
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001127 _func = (is_dst_vector) ? vector_matrix_multiply_f32 : matrix_matrix_multiply_f32;
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001128 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001129 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001130#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001131 case DataType::F16:
1132 {
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001133 _func = (is_dst_vector) ? vector_matrix_multiply_f16 : matrix_matrix_multiply_f16;
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001134 break;
1135 }
Ioan-Cristian Szabo5edbd1c2017-11-13 13:34:08 +00001136#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001137 default:
1138 {
1139 ARM_COMPUTE_ERROR("Data type not supported");
1140 break;
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001141 }
1142 }
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001143 ICPPKernel::configure(win);
Anthony Barbier6ff3b192017-09-04 18:44:23 +01001144}
Michele Di Giorgio53832b22021-06-21 14:45:44 +01001145
1146Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved,
1147 const GEMMReshapeInfo &reshape_info)
1148{
1149 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info));
1150
1151 return Status{};
1152}
1153
1154void CpuGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
1155{
1156 ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1157 ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
1158 ARM_COMPUTE_ERROR_ON(tensors.empty());
1159 ARM_COMPUTE_ERROR_ON(_func == nullptr);
1160
1161 const ITensor *lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1162 const ITensor *rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1163 ITensor *dst = tensors.get_tensor(TensorType::ACL_DST);
1164
1165 (*_func)(lhs, rhs, dst, window, info, _alpha);
1166}
1167
1168const char *CpuGemmMatrixMultiplyKernel::name() const
1169{
1170 return "CpuGemmMatrixMultiplyKernel";
1171}
1172} // namespace kernels
1173} // namespace cpu
Michele Di Giorgiocf9e29e2020-10-08 11:54:42 +01001174} // namespace arm_compute