Blame - src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp - ml/ComputeLibrary

2017-11-15 13:28:27 +0000

[diff] [blame]

603

{

604

auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());

605

auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());

606

607

// Note: The inputs are signed 8-bit values, so the products are accumulated as int32_t

608

// Accumulators for the block 0

int32x4x4_t c0 =

{

{

vdupq_n_s32(0),

vdupq_n_s32(0),

vdupq_n_s32(0),

vdupq_n_s32(0)

}

};

// Accumulators for the block 1

int32x4x4_t c1 =

{

{

vdupq_n_s32(0),

vdupq_n_s32(0),

vdupq_n_s32(0),

vdupq_n_s32(0)

}

};

// Accumulators for the block 2

int32x4x4_t c2 =

{

{

vdupq_n_s32(0),

vdupq_n_s32(0),

vdupq_n_s32(0),

vdupq_n_s32(0)

}

};

// Accumulators for the block 3

int32x4x4_t c3 =

{

{

vdupq_n_s32(0),

vdupq_n_s32(0),

vdupq_n_s32(0),

vdupq_n_s32(0)

}

};

for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)

653

{

654

const int8x8_t a00_s8 = vld1_s8(mtx_a0);

655

const int8x16_t b00_s8 = vld1q_s8(mtx_b0);

656

657

// Convert a00_s8 to int16_t and get the lower part

658

const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));

659

660

// Convert b00_s8 to int16_t

661

const int16x4x4_t b00_s16 =

662

{

663

{

664

vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),

665

vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),

666

vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),

667

vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))

}

};

// 4x4 block 0

c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);

673

c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);

674

c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);

675

c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);

676

677

// 4x4 block 1

678

c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);

679

c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);

680

c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);

681

c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);

682

683

// 4x4 block 2

684

c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);

685

c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);

686

c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);

687

c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);

688

689

// 4x4 block 3

690

c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);

691

c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);

692

c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);

693

c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);

694

}

695

696

auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());

697

vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);

698

vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);

699

vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);

700

vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);

701

vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);

702

vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);

703

vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);

704

vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);

705

vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);

706

vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);

707

vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);

708

vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);

709

vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);

710

vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);

711

vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);

712

vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);

713

},

714

ina, inb, out);

715

}

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

} // namespace

class Coordinates;

} // namespace arm_compute

720

721

namespace

722

{

Georgios Pinitas

2017-12-06 11:53:03 +0000

[diff] [blame]

723

/** Check the data types and shapes of the matrix multiply operands.
 *
 * @param[in] input0 Info of the first input tensor.
 * @param[in] input1 Info of the second input tensor.
 * @param[in] output Info of the output tensor.
 *
 * @return An empty Status if every check passes, otherwise the Status produced by the first failing check.
 */
Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32);

    // Local copies of the shapes: they are collapsed in place further down
    TensorShape shape_a   = input0->tensor_shape();
    TensorShape shape_b   = input1->tensor_shape();
    TensorShape shape_dst = output->tensor_shape();

    // Vector-by-matrix case: only the inner dimensions have to agree
    if(shape_dst[1] == 1)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(shape_a[0] != shape_b[1], "The number of input0's columns must be equal to input1's rows");
        return Status{};
    }

    // Matrix-by-matrix case: collapse the shapes from dimension 2 before comparing the batch counts
    shape_a.collapse(2);
    shape_b.collapse(2);
    shape_dst.collapse(2);

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(shape_a[2] != shape_dst[2], "Output tensor must have the same number of batches of input0 tensor");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(shape_b[2] == 1 || shape_a[2] == shape_b[2]), "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG((shape_b[0] % 16) != 0, "Input1's width must be a multiple of 16");

    return Status{};
}

751

Georgios Pinitas

2017-12-06 11:53:03 +0000

[diff] [blame]

752

/** Compute the execution window and update the padding requirements of the tensors.
 *
 * @param[in, out] input0 Info of the first input tensor.
 * @param[in, out] input1 Info of the second input tensor.
 * @param[in, out] output Info of the output tensor.
 *
 * @return A pair holding an error Status ("Insufficient Padding!") when update_window_and_padding()
 *         reports a window change, or an empty Status otherwise, together with the computed window.
 */
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output)
{
    constexpr unsigned int num_elems_processed_per_iteration_x = 16;
    constexpr unsigned int num_elems_processed_per_iteration_y = 4;

    Window win;
    bool   padding_changed = false;

    // A single-row output selects the vector-matrix path; anything else is a matrix-matrix multiplication
    const bool is_vector_by_matrix = (output->dimension(1) == 1);

    if(!is_vector_by_matrix)
    {
        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

        // Width of input1 rounded up to the number of elements processed per iteration
        const auto         padded_width_b   = ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x);
        const unsigned int num_k_iterations = padded_width_b / num_elems_processed_per_iteration_x;

        // For each iteration of "k" the input0 pointer advances by 4 while 8 elements are loaded at a time
        AccessWindowStatic    a_access(input0, 0, 0, (num_k_iterations - 1) * 4 + 8, input0->dimension(1));
        AccessWindowStatic    b_access(input1, 0, 0, padded_width_b, input1->dimension(1));
        AccessWindowRectangle dst_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);

        padding_changed = update_window_and_padding(win, a_access, b_access, dst_access);

        dst_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
    }
    else
    {
        // Configure kernel window
        win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x));

        // Matrix A must not be read out of bounds because a left-over loop is used for it
        AccessWindowStatic     a_access(input0, 0, 0, input0->tensor_shape().x(), 1);
        AccessWindowHorizontal b_access(input1, 0, num_elems_processed_per_iteration_x);
        AccessWindowHorizontal dst_access(output, 0, num_elems_processed_per_iteration_x);

        padding_changed = update_window_and_padding(win, a_access, b_access, dst_access);

        Coordinates anchor;
        anchor.set_num_dimensions(output->num_dimensions());
        dst_access.set_valid_region(win, ValidRegion(anchor, output->tensor_shape()));
    }

    Status err{};
    if(padding_changed)
    {
        err = ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!");
    }
    return std::make_pair(err, win);
}

} // namespace

// Default constructor: no tensors are attached yet and the matrix B sliding flag starts as true.
NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel()
    : _input0(nullptr),
      _input1(nullptr),
      _output(nullptr),
      _slide_matrix_b(true)
{
}

void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output)

803

{

804

ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output);

805

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info()));

806

807

TensorShape in1_shape = input1->info()->tensor_shape();

808

in1_shape.collapse(2);

_input0 = input0;

_input1 = input1;

_output = output;

_slide_matrix_b = in1_shape[2] != 1;

814

815

// Configure kernel window

816

auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info());

817

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

818

INEKernel::configure(win_config.second);

819

}

820

Georgios Pinitas

2017-12-06 11:53:03 +0000

[diff] [blame]

821

// Static check: returns an empty Status when the given tensor infos pass both the argument
// validation and the window/padding configuration, otherwise the first error encountered.
Status NEGEMMLowpMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output)
{
    const Status arguments_status = validate_arguments(input0, input1, output);
    ARM_COMPUTE_RETURN_ON_ERROR(arguments_status);

    // The window configuration takes non-const infos, so it is run on clones of the caller's infos
    const Status window_status = validate_and_configure_window(input0->clone().get(), input1->clone().get(), output->clone().get()).first;
    ARM_COMPUTE_RETURN_ON_ERROR(window_status);

    return Status{};
}

Pablo Tello

2017-11-15 13:28:27 +0000

[diff] [blame]

828

829

void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window, const ThreadInfo &info)

830

{

831

ARM_COMPUTE_UNUSED(info);

832

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

833

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

834

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

835

// Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication path

836

if((_output->info()->dimension(1) == 1))

Pablo Tello

2017-11-15 13:28:27 +0000

[diff] [blame]

837

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

838

const auto width_matrix_a = static_cast<int>(_input0->info()->dimension(0));

839

const auto width_matrix_b = static_cast<int>(_input1->info()->dimension(0));

840

const auto in_b_stride = static_cast<int>(_input1->info()->strides_in_bytes()[1] / data_size_from_type(_input1->info()->data_type()));

841

842

// The implementation computes 16 elements per iteration

843

const int window_start_x = 16 * info.thread_id;

844

const int window_step_x = 16 * info.num_threads;

845

// Make sure (window_end_x - window_start_x) is a multiple of window_step_x

846

const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;

847

848

Window win_out(window);

849

win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));

850

win_out.set(Window::DimY, Window::Dimension(0, 1, 1));

851

852

Window win_a(window);

853

win_a.set(Window::DimX, Window::Dimension(0, 0, 0));

854

win_a.set(Window::DimY, Window::Dimension(0, 0, 0));

855

856

Window win_b;

857

// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2

858

// This scenario can happen when the matrix multiplication is used to perform a convolution operation

859

if(_input1->info()->num_dimensions() >= 3)

{

win_b = window;

}

win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x));

864

win_b.set(Window::DimY, Window::Dimension(0, 1, 1));

865

866

Iterator ina(_input0, win_a);

867

Iterator inb(_input1, win_b);

868

Iterator out(_output, win_out);

869

870

switch(_input0->info()->data_type())

871

{

872

case DataType::S8:

Georgios Pinitas

63d4dbd

2019-11-08 11:51:56 +0000

[diff] [blame]

873

case DataType::QASYMM8_SIGNED:

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

874

{

875

vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window);

break;

}

case DataType::U8:

case DataType::QASYMM8:

880

{

881

vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, in_b_stride, window);

break;

}

default:

{

ARM_COMPUTE_ERROR("Not supported");

887

break;

888

}

889

}

Pablo Tello

2017-11-15 13:28:27 +0000

[diff] [blame]

890

}

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

891

else

Pablo Tello

2017-11-15 13:28:27 +0000

[diff] [blame]

892

{

Gian Marco

2017-11-30 14:31:13 +0000

[diff] [blame]

893

const size_t in_b_stride = _input1->info()->strides_in_bytes()[1];

894

const size_t out_stride = _output->info()->strides_in_bytes()[1] / _output->info()->element_size();

895

896

// Set step_x and step_y for matrix A. Scale the Y range by a factor of 4, as the interleaved input matrix A has 4 times fewer rows than the output matrix

897

Window win_a(window);

898

win_a.set(Window::DimX, Window::Dimension(0, 0, 0));

899

win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, window.y().end() / 4, 1));

900

901

// Set step_x and step_y for matrix B. Scale the X range by a factor of 16, as the transposed input matrix B has 16 times fewer columns than the output matrix

902

Window win_b;

903

// Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2

904

// This scenario can happen when the matrix multiplication is used to perform a convolution operation

905

if(_slide_matrix_b)

Pablo Tello