/*
 * Copyright (c) 2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef __aarch64__

#include "arm_gemm.hpp"
#include "utils.hpp"

#include <arm_neon.h>

#include <algorithm>
#include <cstring>

namespace arm_gemm {

namespace {

/* Requantize a block of data, using the requantize parameters in 'qp'.
 *
 * row_bias and col_bias are assumed to be precomputed values which include
 * any externally supplied bias, plus the row/column contribution sums, plus
 * the overall constant offset (A_offset * B_offset * depth).
 *
 * Note that this function works equally well for uint8_t output: just set
 * minval/maxval appropriately and cast the output pointer.  It is the
 * caller's responsibility to ensure that minval/maxval are representable in
 * the target type - the downcast to (u)int8_t is done by simply extracting
 * the LSB.
 *
 * The 'do_shift_correction' template parameter turns on the correction
 * applied to negative values being shifted right to make sure they round
 * properly - if negative values are never output (e.g. fused ReLU) this is
 * unnecessary.
 *
 * The 'per_channel' template parameter selects between per channel and per
 * layer requantization - in the former case we need to load vectors of
 * shifts and multipliers for each column.  A separate vector for each
 * column is set up in any case (and it is hoped that the compiler can
 * elide the needless movs in the per-layer case).
 */
template<bool do_shift_correction, bool per_channel, bool do_left_shift>
void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigned int height,
                             const int32_t *input, unsigned int in_stride, int8_t *output, unsigned int out_stride,
                             const int32_t *row_bias, const int32_t *col_bias, const unsigned int start_col) {
    const int32x4_t v_mul = vdupq_n_s32(qp.per_layer_mul);
    const int32x4_t v_right_shift = vdupq_n_s32(qp.per_layer_right_shift);
    const int32x4_t v_left_shift = vdupq_n_s32(qp.per_layer_left_shift);
    const int32x4_t v_minval = vdupq_n_s32(qp.minval);
    const int32x4_t v_maxval = vdupq_n_s32(qp.maxval);
    const int32x4_t v_c_offset = vdupq_n_s32(qp.c_offset);

    /* To make sure we have plenty of accumulators, compute two rows at a
     * time.  If the number of rows is odd, compute the bottom row twice to
     * avoid needing a duplicate codepath. */
    for (unsigned int row=0; row<height; row+=2) {
        /* Prefer to do 4 vectors (16 values) at once as this collapses
         * neatly to a single vector of output, failing that a vector at a
         * time and then the odd ones out at the end. */
        unsigned int blocks=(width / 16);
        unsigned int regs=(width % 16) / 4;
        unsigned int odds=(width % 4);

        const int32_t *colptr = col_bias;
        const int32_t *perch_mul_ptr = qp.per_channel_muls + start_col;
        const int32_t *perch_shift_ptr = qp.per_channel_right_shifts + start_col;
        const int32_t *perch_shiftl_ptr = qp.per_channel_left_shifts + start_col;

        const int32_t *in_ptr = input + (row * in_stride);
        int8_t *out_ptr = output + (row * out_stride);
        int32_t row_sum = row_bias[row];

        const int32_t *in_ptr1;
        int8_t *out_ptr1;
        int32_t row_sum1;

        if (row == height-1) {
            in_ptr1 = in_ptr;
            out_ptr1 = out_ptr;
            row_sum1 = row_sum;
        } else {
            in_ptr1 = in_ptr + in_stride;
            out_ptr1 = out_ptr + out_stride;
            row_sum1 = row_bias[row+1];
        }

        const int32x4_t v_row_sum = vdupq_n_s32(row_sum);
        const int32x4_t v_row_sum1 = vdupq_n_s32(row_sum1);

        while (blocks--) {
            int32x4_t v_mul0;
            int32x4_t v_mul1;
            int32x4_t v_mul2;
            int32x4_t v_mul3;

            int32x4_t v_shf0;
            int32x4_t v_shf1;
            int32x4_t v_shf2;
            int32x4_t v_shf3;

            int32x4_t v_shf0l;
            int32x4_t v_shf1l;
            int32x4_t v_shf2l;
            int32x4_t v_shf3l;

            if (per_channel) {
                v_mul0 = vld1q_s32(perch_mul_ptr);
                v_mul1 = vld1q_s32(perch_mul_ptr + 4);
                v_mul2 = vld1q_s32(perch_mul_ptr + 8);
                v_mul3 = vld1q_s32(perch_mul_ptr + 12);
                perch_mul_ptr += 16;

                v_shf0 = vld1q_s32(perch_shift_ptr);
                v_shf1 = vld1q_s32(perch_shift_ptr + 4);
                v_shf2 = vld1q_s32(perch_shift_ptr + 8);
                v_shf3 = vld1q_s32(perch_shift_ptr + 12);
                perch_shift_ptr += 16;

                if (do_left_shift) {
                    v_shf0l = vld1q_s32(perch_shiftl_ptr);
                    v_shf1l = vld1q_s32(perch_shiftl_ptr + 4);
                    v_shf2l = vld1q_s32(perch_shiftl_ptr + 8);
                    v_shf3l = vld1q_s32(perch_shiftl_ptr + 12);
                    perch_shiftl_ptr += 16;
                }
            } else {
                v_mul0=v_mul1=v_mul2=v_mul3=v_mul;
                v_shf0=v_shf1=v_shf2=v_shf3=v_right_shift;
                v_shf0l=v_shf1l=v_shf2l=v_shf3l=v_left_shift;
            }

            // Load column pointers
            int32x4_t v_col0 = vld1q_s32(colptr);
            int32x4_t v_col1 = vld1q_s32(colptr + 4);
            int32x4_t v_col2 = vld1q_s32(colptr + 8);
            int32x4_t v_col3 = vld1q_s32(colptr + 12);
            colptr += 16;

            // Load input data (row 0);
            int32x4_t v_in00 = vld1q_s32(in_ptr);
            int32x4_t v_in01 = vld1q_s32(in_ptr + 4);
            int32x4_t v_in02 = vld1q_s32(in_ptr + 8);
            int32x4_t v_in03 = vld1q_s32(in_ptr + 12);
            in_ptr += 16;

            // Load input data (row 1);
            int32x4_t v_in10 = vld1q_s32(in_ptr1);
            int32x4_t v_in11 = vld1q_s32(in_ptr1 + 4);
            int32x4_t v_in12 = vld1q_s32(in_ptr1 + 8);
            int32x4_t v_in13 = vld1q_s32(in_ptr1 + 12);
            in_ptr1 += 16;

            // Add on row bias and column bias
            v_in00 = vaddq_s32(v_in00, v_row_sum);
            v_in01 = vaddq_s32(v_in01, v_row_sum);
            v_in02 = vaddq_s32(v_in02, v_row_sum);
            v_in03 = vaddq_s32(v_in03, v_row_sum);

            v_in10 = vaddq_s32(v_in10, v_row_sum1);
            v_in11 = vaddq_s32(v_in11, v_row_sum1);
            v_in12 = vaddq_s32(v_in12, v_row_sum1);
            v_in13 = vaddq_s32(v_in13, v_row_sum1);

            v_in00 = vaddq_s32(v_in00, v_col0);
            v_in01 = vaddq_s32(v_in01, v_col1);
            v_in02 = vaddq_s32(v_in02, v_col2);
            v_in03 = vaddq_s32(v_in03, v_col3);

            v_in10 = vaddq_s32(v_in10, v_col0);
            v_in11 = vaddq_s32(v_in11, v_col1);
            v_in12 = vaddq_s32(v_in12, v_col2);
            v_in13 = vaddq_s32(v_in13, v_col3);

            // Quantize

            // If a left shift is needed it needs to happen first.
            if (do_left_shift) {
                v_in00 = vrshlq_s32(v_in00, v_shf0l);
                v_in01 = vrshlq_s32(v_in01, v_shf1l);
                v_in02 = vrshlq_s32(v_in02, v_shf2l);
                v_in03 = vrshlq_s32(v_in03, v_shf3l);

                v_in10 = vrshlq_s32(v_in10, v_shf0l);
                v_in11 = vrshlq_s32(v_in11, v_shf1l);
                v_in12 = vrshlq_s32(v_in12, v_shf2l);
                v_in13 = vrshlq_s32(v_in13, v_shf3l);
            }

            // Multiply
            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);
            v_in01 = vqrdmulhq_s32(v_in01, v_mul1);
            v_in02 = vqrdmulhq_s32(v_in02, v_mul2);
            v_in03 = vqrdmulhq_s32(v_in03, v_mul3);

            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);
            v_in11 = vqrdmulhq_s32(v_in11, v_mul1);
            v_in12 = vqrdmulhq_s32(v_in12, v_mul2);
            v_in13 = vqrdmulhq_s32(v_in13, v_mul3);

            // Compute and add on corrective offset
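            // The shift amounts in v_shf0-3 are negative (right shifts), so
            // their sign bit is set whenever a shift is actually applied.
            // AND-ing each value with its shift and arithmetic-shifting the
            // result down by 31 therefore yields -1 exactly for negative
            // values that are about to be shifted (and 0 otherwise); the
            // saturating add then nudges those values down by one, so the
            // rounding right shift below rounds exact halves of negative
            // values away from zero instead of towards positive infinity.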
            if (do_shift_correction) {
                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);
                int32x4_t v_temp01 = vandq_s32(v_in01, v_shf1);
                int32x4_t v_temp02 = vandq_s32(v_in02, v_shf2);
                int32x4_t v_temp03 = vandq_s32(v_in03, v_shf3);

                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);
                int32x4_t v_temp11 = vandq_s32(v_in11, v_shf1);
                int32x4_t v_temp12 = vandq_s32(v_in12, v_shf2);
                int32x4_t v_temp13 = vandq_s32(v_in13, v_shf3);

                v_temp00 = vshrq_n_s32(v_temp00, 31);
                v_temp01 = vshrq_n_s32(v_temp01, 31);
                v_temp02 = vshrq_n_s32(v_temp02, 31);
                v_temp03 = vshrq_n_s32(v_temp03, 31);

                v_temp10 = vshrq_n_s32(v_temp10, 31);
                v_temp11 = vshrq_n_s32(v_temp11, 31);
                v_temp12 = vshrq_n_s32(v_temp12, 31);
                v_temp13 = vshrq_n_s32(v_temp13, 31);

                v_in00 = vqaddq_s32(v_in00, v_temp00);
                v_in01 = vqaddq_s32(v_in01, v_temp01);
                v_in02 = vqaddq_s32(v_in02, v_temp02);
                v_in03 = vqaddq_s32(v_in03, v_temp03);

                v_in10 = vqaddq_s32(v_in10, v_temp10);
                v_in11 = vqaddq_s32(v_in11, v_temp11);
                v_in12 = vqaddq_s32(v_in12, v_temp12);
                v_in13 = vqaddq_s32(v_in13, v_temp13);
            }

            v_in00 = vrshlq_s32(v_in00, v_shf0);
            v_in01 = vrshlq_s32(v_in01, v_shf1);
            v_in02 = vrshlq_s32(v_in02, v_shf2);
            v_in03 = vrshlq_s32(v_in03, v_shf3);

            v_in10 = vrshlq_s32(v_in10, v_shf0);
            v_in11 = vrshlq_s32(v_in11, v_shf1);
            v_in12 = vrshlq_s32(v_in12, v_shf2);
            v_in13 = vrshlq_s32(v_in13, v_shf3);

            v_in00 = vaddq_s32(v_in00, v_c_offset);
            v_in01 = vaddq_s32(v_in01, v_c_offset);
            v_in02 = vaddq_s32(v_in02, v_c_offset);
            v_in03 = vaddq_s32(v_in03, v_c_offset);

            v_in10 = vaddq_s32(v_in10, v_c_offset);
            v_in11 = vaddq_s32(v_in11, v_c_offset);
            v_in12 = vaddq_s32(v_in12, v_c_offset);
            v_in13 = vaddq_s32(v_in13, v_c_offset);

            v_in00 = vmaxq_s32(v_in00, v_minval);
            v_in01 = vmaxq_s32(v_in01, v_minval);
            v_in02 = vmaxq_s32(v_in02, v_minval);
            v_in03 = vmaxq_s32(v_in03, v_minval);

            v_in10 = vmaxq_s32(v_in10, v_minval);
            v_in11 = vmaxq_s32(v_in11, v_minval);
            v_in12 = vmaxq_s32(v_in12, v_minval);
            v_in13 = vmaxq_s32(v_in13, v_minval);

            v_in00 = vminq_s32(v_in00, v_maxval);
            v_in01 = vminq_s32(v_in01, v_maxval);
            v_in02 = vminq_s32(v_in02, v_maxval);
            v_in03 = vminq_s32(v_in03, v_maxval);

            v_in10 = vminq_s32(v_in10, v_maxval);
            v_in11 = vminq_s32(v_in11, v_maxval);
            v_in12 = vminq_s32(v_in12, v_maxval);
            v_in13 = vminq_s32(v_in13, v_maxval);

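            // Narrow the clamped 32-bit results to 8 bits: each vuzp1 keeps
            // the even-numbered (low) halves of its inputs, so the two unzip
            // stages extract the least significant byte of every lane,
            // collapsing the four vectors per row into one output vector.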
            int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in01));
            int16x8_t v_uz01 = vuzp1q_s16(vreinterpretq_s16_s32(v_in02), vreinterpretq_s16_s32(v_in03));

            int16x8_t v_uz10 = vuzp1q_s16(vreinterpretq_s16_s32(v_in10), vreinterpretq_s16_s32(v_in11));
            int16x8_t v_uz11 = vuzp1q_s16(vreinterpretq_s16_s32(v_in12), vreinterpretq_s16_s32(v_in13));

            int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz01));
            int8x16_t v_uz1 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz10), vreinterpretq_s8_s16(v_uz11));

            vst1q_s8(out_ptr, v_uz0);
            out_ptr += 16;
            vst1q_s8(out_ptr1, v_uz1);
            out_ptr1 += 16;
        }

        while (regs--) {
            int32x4_t v_mul0;
            int32x4_t v_shf0;
            int32x4_t v_shf0l;

            if (per_channel) {
                v_mul0 = vld1q_s32(perch_mul_ptr);
                perch_mul_ptr += 4;

                v_shf0 = vld1q_s32(perch_shift_ptr);
                perch_shift_ptr += 4;

                if (do_left_shift) {
                    v_shf0l = vld1q_s32(perch_shiftl_ptr);
                    perch_shiftl_ptr += 4;
                }
            } else {
                v_mul0=v_mul;
                v_shf0=v_right_shift;
                v_shf0l=v_left_shift;
            }
            // Load column pointers
            int32x4_t v_col0 = vld1q_s32(colptr);
            colptr += 4;

            // Load input data (row 0);
            int32x4_t v_in00 = vld1q_s32(in_ptr);
            in_ptr += 4;

            // Load input data (row 1);
            int32x4_t v_in10 = vld1q_s32(in_ptr1);
            in_ptr1 += 4;

            // Add on row sum and bias constant
            v_in00 = vaddq_s32(v_in00, v_row_sum);

            v_in10 = vaddq_s32(v_in10, v_row_sum1);

            // Subtract col sum * a_offset
            v_in00 = vaddq_s32(v_in00, v_col0);

            v_in10 = vaddq_s32(v_in10, v_col0);

            // Quantize - start with (optional) left shift
            if (do_left_shift) {
                v_in00 = vrshlq_s32(v_in00, v_shf0l);

                v_in10 = vrshlq_s32(v_in10, v_shf0l);
            }

            // Then multiply
            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);

            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);

            // Compute and add on corrective offset
            if (do_shift_correction) {
                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);

                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);

                v_temp00 = vshrq_n_s32(v_temp00, 31);

                v_temp10 = vshrq_n_s32(v_temp10, 31);

                v_in00 = vqaddq_s32(v_in00, v_temp00);

                v_in10 = vqaddq_s32(v_in10, v_temp10);
            }

            v_in00 = vrshlq_s32(v_in00, v_shf0);

            v_in10 = vrshlq_s32(v_in10, v_shf0);

            v_in00 = vaddq_s32(v_in00, v_c_offset);

            v_in10 = vaddq_s32(v_in10, v_c_offset);

            v_in00 = vmaxq_s32(v_in00, v_minval);

            v_in10 = vmaxq_s32(v_in10, v_minval);

            v_in00 = vminq_s32(v_in00, v_maxval);

            v_in10 = vminq_s32(v_in10, v_maxval);

            int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in10));

            int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz00));

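            // After the unzips, the four result bytes for row 0 occupy lane 0
            // of v_uz0 (viewed as 32-bit lanes) and the four bytes for row 1
            // occupy lane 1, so each row is written with a single lane store.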
            vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr), vreinterpretq_s32_s8(v_uz0), 0);
            out_ptr += 4;
            vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr1), vreinterpretq_s32_s8(v_uz0), 1);
            out_ptr1 += 4;
        }

        if (odds) {
            int32x4_t v_col0 = vdupq_n_s32(0);
            int32x4_t v_in00 = vdupq_n_s32(0);
            int32x4_t v_in10 = vdupq_n_s32(0);
            int32x4_t v_mul0 = vdupq_n_s32(0);
            int32x4_t v_shf0 = vdupq_n_s32(0);
            int32x4_t v_shf0l = vdupq_n_s32(0);

            if (!per_channel) {
                v_mul0 = v_mul;
                v_shf0 = v_right_shift;
                v_shf0l = v_left_shift;
            }

            do {
                v_col0 = vld1q_lane_s32(colptr, v_col0, 0);
                v_in00 = vld1q_lane_s32(in_ptr, v_in00, 0);
                v_in10 = vld1q_lane_s32(in_ptr1, v_in10, 0);
                if (per_channel) {
                    v_mul0 = vld1q_lane_s32(perch_mul_ptr, v_mul0, 0);
                    v_shf0 = vld1q_lane_s32(perch_shift_ptr, v_shf0, 0);
                    if (do_left_shift) {
                        v_shf0l = vld1q_lane_s32(perch_shiftl_ptr, v_shf0l, 0);
                    }
                }
                if (odds == 1) { break; }

                v_col0 = vld1q_lane_s32(colptr + 1, v_col0, 1);
                v_in00 = vld1q_lane_s32(in_ptr + 1, v_in00, 1);
                v_in10 = vld1q_lane_s32(in_ptr1 + 1, v_in10, 1);
                if (per_channel) {
                    v_mul0 = vld1q_lane_s32(perch_mul_ptr + 1, v_mul0, 1);
                    v_shf0 = vld1q_lane_s32(perch_shift_ptr + 1, v_shf0, 1);
                    if (do_left_shift) {
                        v_shf0l = vld1q_lane_s32(perch_shiftl_ptr + 1, v_shf0l, 1);
                    }
                }
                if (odds == 2) { break; }

                v_col0 = vld1q_lane_s32(colptr + 2, v_col0, 2);
                v_in00 = vld1q_lane_s32(in_ptr + 2, v_in00, 2);
                v_in10 = vld1q_lane_s32(in_ptr1 + 2, v_in10, 2);
                if (per_channel) {
                    v_mul0 = vld1q_lane_s32(perch_mul_ptr + 2, v_mul0, 2);
                    v_shf0 = vld1q_lane_s32(perch_shift_ptr + 2, v_shf0, 2);
                    if (do_left_shift) {
                        v_shf0l = vld1q_lane_s32(perch_shiftl_ptr + 2, v_shf0l, 2);
                    }
                }
            } while (0);

            // Add on row sum and bias constant
            v_in00 = vaddq_s32(v_in00, v_row_sum);

            v_in10 = vaddq_s32(v_in10, v_row_sum1);

            // Subtract col sum * a_offset
            v_in00 = vaddq_s32(v_in00, v_col0);

            v_in10 = vaddq_s32(v_in10, v_col0);

            // Quantize - start with (optional) left shift
            if (do_left_shift) {
                v_in00 = vrshlq_s32(v_in00, v_shf0l);

                v_in10 = vrshlq_s32(v_in10, v_shf0l);
            }

            // Then multiply
            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);

            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);

            // Compute and add on corrective offset
            if (do_shift_correction) {
                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);

                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);

                v_temp00 = vshrq_n_s32(v_temp00, 31);

                v_temp10 = vshrq_n_s32(v_temp10, 31);

                v_in00 = vqaddq_s32(v_in00, v_temp00);

                v_in10 = vqaddq_s32(v_in10, v_temp10);
            }

            v_in00 = vrshlq_s32(v_in00, v_shf0);

            v_in10 = vrshlq_s32(v_in10, v_shf0);

            v_in00 = vaddq_s32(v_in00, v_c_offset);

            v_in10 = vaddq_s32(v_in10, v_c_offset);

            v_in00 = vmaxq_s32(v_in00, v_minval);

            v_in10 = vmaxq_s32(v_in10, v_minval);

            v_in00 = vminq_s32(v_in00, v_maxval);

            v_in10 = vminq_s32(v_in10, v_maxval);

            do {
                vst1q_lane_s8(out_ptr, vreinterpretq_s8_s32(v_in00), 0);
                vst1q_lane_s8(out_ptr1, vreinterpretq_s8_s32(v_in10), 0);

                if (odds==1) { break; }

                vst1q_lane_s8(out_ptr + 1, vreinterpretq_s8_s32(v_in00), 4);
                vst1q_lane_s8(out_ptr1 + 1, vreinterpretq_s8_s32(v_in10), 4);

                if (odds==2) { break; }

                vst1q_lane_s8(out_ptr + 2, vreinterpretq_s8_s32(v_in00), 8);
                vst1q_lane_s8(out_ptr1 + 2, vreinterpretq_s8_s32(v_in10), 8);
            } while(0);
        }
    }
}
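
/* Purely illustrative scalar sketch of the per-element arithmetic performed
 * by requantize_block_32_int above, assuming per-layer quantization with no
 * left shift.  It is not called anywhere in this file and the helper name is
 * made up for this sketch; the NEON code above is authoritative.  Note that
 * per_layer_right_shift is expected to hold a non-positive value (a negative
 * shift), which is why the vector code can feed it straight to vrshlq_s32. */
inline int8_t requantize_value_sketch(int32_t value, int32_t row_bias, int32_t col_bias,
                                      const Requantize32 &qp) {
    int32_t v = value + row_bias + col_bias;

    // vqrdmulhq_s32: high 31 bits of the product, with rounding (the
    // saturation on INT32_MIN * INT32_MIN is ignored here).
    int64_t prod = static_cast<int64_t>(v) * qp.per_layer_mul;
    v = static_cast<int32_t>((prod + (1LL << 30)) >> 31);

    // Rounding right shift (vrshlq_s32 with a negative shift), with the
    // 'shift correction' nudge so exact halves of negative values round
    // away from zero.
    int shift = -qp.per_layer_right_shift;
    if (shift > 0) {
        if (v < 0) {
            v--;
        }
        v = (v + (1 << (shift - 1))) >> shift;
    }

    // Add the output offset and clamp to the output range.
    v += qp.c_offset;
    if (v < qp.minval) { v = qp.minval; }
    if (v > qp.maxval) { v = qp.maxval; }

    return static_cast<int8_t>(v);
}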

} // anonymous namespace

template<typename Tin, typename Tout>
void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                         const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride,
                         const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col) {
    if (qp.per_channel_requant) {
        if (qp.minval >= qp.c_offset) {
            if (qp.per_channel_left_shifts) {
                requantize_block_32_int<false, true, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                           reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<false, true, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                            reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        } else {
            if (qp.per_channel_left_shifts) {
                requantize_block_32_int<true, true, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                          reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<true, true, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                           reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        }
    } else {
        if (qp.minval >= qp.c_offset) {
            if (qp.per_layer_left_shift > 0) {
                requantize_block_32_int<false, false, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                            reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<false, false, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                             reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        } else {
            if (qp.per_layer_left_shift > 0) {
                requantize_block_32_int<true, false, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                           reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<true, false, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                            reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        }
    }
}

template void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                                  const int32_t *input, unsigned int in_stride, int8_t *output, unsigned int out_stride,
                                  const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);

template void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                                  const uint32_t *input, unsigned int in_stride, uint8_t *output, unsigned int out_stride,
                                  const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);

/*
 * Routine (and helpers) to compute row sums needed for offset correction.
 *
 * This is often needed for a lot of short rows (e.g. Syrax 5 - 6400 rows
 * of length 27), so it's important not to sacrifice performance on
 * odd-length rows.
 *
 * To minimize performance loss in these cases, this routine will overread
 * by up to 7 bytes.
 *
 * This is handled via "mask" and "mask mode" parameters to the inner
 * routines; mask mode == 1 indicates that there are between 1 and 8 bytes
 * (inclusive) needed at the end; in these cases we always read 8 bytes.
 * mask mode == 2 indicates that there are between 9 and 15 bytes needed at
 * the end, and in this case we always read 16 bytes.  In both cases the
 * 'mask' vector is set up so that the read value can be masked off to clear
 * the overread lanes.  This is handled by 'accumulate_masked_8' and
 * 'accumulate_masked_16' below.
 *
 * This routine is templated on the type to be accumulated, because the
 * innermost instruction used needs to be of the correct signedness.
 * However, beyond this point we always use signed values in both cases.
 * The instructions that need to be different are therefore wrapped in
 * helper functions below.
 *
 * The general strategy used is to load vectors of 16 bytes and accumulate
 * (using uadalp/sadalp or AArch32 equivalents) into 8x16-bit accumulators.
 * These are then reduced (using uadalp/sadalp again) into 4x32-bit
 * accumulators.  The 4 accumulators for up to 4 rows being processed are
 * then added together into a single output vector using pairwise adds.
 *
 * This reduction from the 8x16-bit into the 4x32-bit accumulators needs to
 * occur before the 16-bit accumulators can overflow - which is every 32
 * iterations (512 total bytes processed).  This is explained more below.
 */
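
/* Illustrative scalar equivalent of what compute_row_sums below produces:
 * each row_bias entry is the plain sum of one row of the input, multiplied
 * by -b_offset so that it can simply be added on during requantization.
 * This sketch (and its name) exists only for explanation and is not used
 * by the optimized code that follows. */
template<typename T>
inline void row_sums_reference_sketch(const Requantize32 &qp, unsigned int width, unsigned int height,
                                      const T *input, unsigned int in_stride, int32_t *row_bias) {
    for (unsigned int r = 0; r < height; r++) {
        int32_t sum = 0;
        for (unsigned int c = 0; c < width; c++) {
            sum += input[r * in_stride + c];
        }
        row_bias[r] = -qp.b_offset * sum;
    }
}
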
namespace {
    struct row_sum_helpers {
        const Requantize32 &qp;

        /* Load a full 16 byte vector, pairwise accumulate into 'sum' with uadalp or sadalp */
        template<typename T>
        inline int16x8_t accumulate_16(const T *ptr, int16x8_t sum);

        /* Load a full 16 byte vector, but mask before accumulation (see above). */
        template<typename T>
        inline int16x8_t accumulate_masked_16(const T *ptr, int16x8_t sum, uint64x2_t mask);

        /* Load 8 bytes and mask before accumulation. */
        template<typename T>
        inline int16x8_t accumulate_masked_8(const T *ptr, int16x8_t sum, uint64x2_t mask);

        /* This function does the actual work for up to 4 rows at a time.
         * It's pulled out so we can template on the row count to generate
         * the 4 different cases.  4 rows are computed at a time as this
         * reduces to a single vector write. */
        template<unsigned int rows, typename T>
        void compute_some_rows(unsigned int blocks, const T *input, unsigned int in_stride, int32_t *row_bias, unsigned int mask_mode, uint64x2_t mask, int32x4_t offset_mul) {
            int16x8_t sums[rows];
            int32x4_t finalsums[rows];

            for (unsigned int i=0; i<rows; i++) {
                sums[i] = vdupq_n_s16(0);
                finalsums[i] = vdupq_n_s32(0);
            }

            for (unsigned int i=0; i<blocks; i++) {
                for (unsigned int r=0; r<rows; r++) {
                    /* If we add too many blocks together, we run the risk
                     * of overflowing the intermediate 16-bit accumulators,
                     * especially in the unsigned case where we later treat
                     * the accumulator as signed.
                     *
                     * In that case, the maximum (signed) value is 16383,
                     * which is safe for 64 (unsigned) accumulations (255*64
                     * = 16,320).
                     *
                     * Each invocation of pairwise add adds 2 values to the
                     * accumulator - so in the unsigned case we can do 32
                     * adds before we need to reset the 16-bit accumulator
                     * by adding into the 32-bit 'finalsums'.
                     *
                     * We could do 64 adds in the signed case, but that
                     * optimization is not worth the complexity.
                     */
                    if (i > 0 && ((i & 31) == 0)) {
                        finalsums[r] = vpadalq_s16(finalsums[r], sums[r]);
                        sums[r] = vdupq_n_s16(0);
                    }
                    sums[r] = accumulate_16(input + (r * in_stride) + (i * 16), sums[r]);
                }
            }

            /* Handle the final masked read if needed. */
            if (mask_mode > 0) {
                for (unsigned int r=0; r<rows; r++) {
                    if (mask_mode == 1) {
                        sums[r] = accumulate_masked_8(input + (r * in_stride) + (blocks * 16), sums[r], mask);
                    } else {
                        sums[r] = accumulate_masked_16(input + (r * in_stride) + (blocks * 16), sums[r], mask);
                    }
                }
            }

            for (unsigned int i=0; i<rows; i++) {
                finalsums[i] = vpadalq_s16(finalsums[i], sums[i]);
            }

            int32x4_t t0, t1;
            int32x2_t t2;

            /* Result writeback - need to write back one value per row
             * processed.  Multiply all the final totals by -b_offset so
             * that the terms can simply be added in the requantize code. */
            switch (rows) {
                case 1:
                    /* If we only have one output, just use ADDV.  Multiply
                     * the offset into all four components separately so it
                     * can stay in the SIMD register file. */
                    t0 = vmulq_s32(finalsums[0], offset_mul);
                    *row_bias = vaddvq_s32(t0);
                    break;

                case 2:
                    /* For two outputs, two rounds of pairwise adds will
                     * generate the result in a 2-vector we can store in one
                     * go. */
                    t0 = vpaddq_s32(finalsums[0], finalsums[1]);
                    t0 = vpaddq_s32(t0, t0);
                    t2 = vmul_s32(vget_low_s32(t0), vget_low_s32(offset_mul));
                    vst1_s32(row_bias, t2);
                    break;

                case 3:
                    /* Three rows - need to store the low two words plus the odd value from lane 2 */
                    t0 = vpaddq_s32(finalsums[0], finalsums[1]);
                    t1 = vpaddq_s32(finalsums[2], finalsums[2]);

                    t0 = vpaddq_s32(t0, t1);
                    t0 = vmulq_s32(t0, offset_mul);

                    vst1_s32(row_bias, vget_low_s32(t0));
                    row_bias[2] = vgetq_lane_s32(t0, 2);
                    break;

                case 4:
                    /* Four rows (most common case) - reduce to a single
                     * vector with pairwise adds. */
                    t0 = vpaddq_s32(finalsums[0], finalsums[1]);
                    t1 = vpaddq_s32(finalsums[2], finalsums[3]);

                    t0 = vpaddq_s32(t0, t1);
                    t0 = vmulq_s32(t0, offset_mul);

                    vst1q_s32(row_bias, t0);
                    break;

                default:
                    UNREACHABLE("Impossible.");
            }
        }

        row_sum_helpers(const Requantize32 &qp) : qp(qp) { }
    };

    template<>
    int16x8_t row_sum_helpers::accumulate_16(const uint8_t *ptr, int16x8_t sum) {
        return vreinterpretq_s16_u16(vpadalq_u8(vreinterpretq_u16_s16(sum), vld1q_u8(ptr)));
    }

    template<>
    int16x8_t row_sum_helpers::accumulate_16(const int8_t *ptr, int16x8_t sum) {
        return vpadalq_s8(sum, vld1q_s8(ptr));
    }

    template<>
    int16x8_t row_sum_helpers::accumulate_masked_16(const int8_t *ptr, int16x8_t sum, uint64x2_t mask) {
        int8x16_t v = vandq_s8(vld1q_s8(ptr), vreinterpretq_s8_u64(mask));
        return vpadalq_s8(sum, v);
    }

    template<>
    int16x8_t row_sum_helpers::accumulate_masked_16(const uint8_t *ptr, int16x8_t sum, uint64x2_t mask) {
        uint8x16_t v = vandq_u8(vld1q_u8(ptr), vreinterpretq_u8_u64(mask));
        return vreinterpretq_s16_u16(vpadalq_u8(vreinterpretq_u16_s16(sum), v));
    }

    template<>
    int16x8_t row_sum_helpers::accumulate_masked_8(const int8_t *ptr, int16x8_t sum, uint64x2_t mask) {
        int8x16_t v = vcombine_s8(vld1_s8(ptr), vdup_n_s8(0));
        v = vreinterpretq_s8_u64(vandq_u64(mask, vreinterpretq_u64_s8(v)));
        return vpadalq_s8(sum, v);
    }

    template<>
    int16x8_t row_sum_helpers::accumulate_masked_8(const uint8_t *ptr, int16x8_t sum, uint64x2_t mask) {
        uint8x16_t v = vcombine_u8(vld1_u8(ptr), vdup_n_u8(0));
        v = vreinterpretq_u8_u64(vandq_u64(mask, vreinterpretq_u64_u8(v)));
        return vreinterpretq_s16_u16(vpadalq_u8(vreinterpretq_u16_s16(sum), v));
    }
}

template<typename T>
void compute_row_sums(const Requantize32 &qp, unsigned int width, unsigned int height,
                      const T *input, unsigned int in_stride, int32_t *row_bias) {
    /* If the 'b' offset is zero, just skip this entirely. */
    if (qp.b_offset == 0) {
        memset(row_bias, 0, height * sizeof(int32_t));
        return;
    }

    row_sum_helpers thehelpers(qp);

    const int32x4_t offset_mul = vdupq_n_s32(-qp.b_offset);

    /* Work out how many full vectors of 16 bytes we will read, and how many
     * odd bytes at the end */
    unsigned int blocks = (width / 16);
    const unsigned int odds = width % 16;

    /* Generate a mask to use on the last iteration, if necessary. */
    uint64x2_t mask;
    unsigned int mask_mode = 0;

    if (odds > 0 && odds <= 8) {
        /* 1-8 odds: mask in the low lane, 0 in the top */
        uint64_t maskval = (~0ULL) >> (8 * (8-odds));
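        /* For example, odds == 3 gives maskval == 0x0000000000ffffff,
         * keeping only the three wanted bytes of the 8-byte read. */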

        mask = vsetq_lane_u64(maskval, vdupq_n_u64(0), 0);

        mask_mode = 1;
    } else if (odds > 8) {
        /* 9-15 odds: mask in the top lane, all 1s in the bottom. */
        uint64_t maskval = (~0ULL) >> (8 * (16-odds));

        mask = vsetq_lane_u64(maskval, vdupq_n_u64(~0ULL), 1);

        mask_mode = 2;
    }

    for (unsigned int row=0; row<height; row+=4) {
        switch(height-row) {
            default:
            case 4:
                thehelpers.compute_some_rows<4>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
            case 3:
                thehelpers.compute_some_rows<3>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
            case 2:
                thehelpers.compute_some_rows<2>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
            case 1:
                thehelpers.compute_some_rows<1>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
        }
    }
}

/* Instantiate the two versions for uint8_t and int8_t. */
template void compute_row_sums(const Requantize32 &, unsigned int, unsigned int, const int8_t *, unsigned int, int32_t *);
template void compute_row_sums(const Requantize32 &, unsigned int, unsigned int, const uint8_t *, unsigned int, int32_t *);

template<unsigned int active_rows, typename T>
inline void add_block(const T *input, unsigned int in_stride, int32_t *output);

template<unsigned int active_rows>
inline void add_block(const uint8_t *input, unsigned int in_stride, int32_t *output) {
    uint8x16_t inputs[4];

    for (unsigned int i=0; i<4; i++) {
        if (i < active_rows) {
            inputs[i] = vld1q_u8(input + i * in_stride);
        } else {
            inputs[i] = vdupq_n_u8(0);
        }
    }

    int16x8_t sums_16b[4];

    // Two adds for the low pairs
    sums_16b[0]=vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(inputs[0]), vget_low_u8(inputs[1])));
    sums_16b[1]=vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(inputs[2]), vget_low_u8(inputs[3])));
    // Two adds for the high pairs
    sums_16b[2]=vreinterpretq_s16_u16(vaddl_high_u8(inputs[0], inputs[1]));
    sums_16b[3]=vreinterpretq_s16_u16(vaddl_high_u8(inputs[2], inputs[3]));

    int32x4_t sums_32b[4];

    sums_32b[0]=vaddl_s16(vget_low_s16(sums_16b[0]), vget_low_s16(sums_16b[1]));
    sums_32b[1]=vaddl_high_s16(sums_16b[0], sums_16b[1]);
    sums_32b[2]=vaddl_s16(vget_low_s16(sums_16b[2]), vget_low_s16(sums_16b[3]));
    sums_32b[3]=vaddl_high_s16(sums_16b[2], sums_16b[3]);

    for (unsigned int i=0; i<4; i++) {
        vst1q_s32(output + 4*i, vaddq_s32(sums_32b[i], vld1q_s32(output + 4*i)));
    }
}

template<unsigned int active_rows>
inline void add_block(const int8_t *input, unsigned int in_stride, int32_t *output) {
    int8x16_t inputs[4];

    for (unsigned int i=0; i<4; i++) {
        if (i < active_rows) {
            inputs[i] = vld1q_s8(input + i * in_stride);
        } else {
            inputs[i] = vdupq_n_s8(0);
        }
    }

    int16x8_t sums_16b[4];

    // Two adds for the low pairs
    sums_16b[0]=vaddl_s8(vget_low_s8(inputs[0]), vget_low_s8(inputs[1]));
    sums_16b[1]=vaddl_s8(vget_low_s8(inputs[2]), vget_low_s8(inputs[3]));
    // Two adds for the high pairs
    sums_16b[2]=vaddl_high_s8(inputs[0], inputs[1]);
    sums_16b[3]=vaddl_high_s8(inputs[2], inputs[3]);

    int32x4_t sums_32b[4];

    sums_32b[0]=vaddl_s16(vget_low_s16(sums_16b[0]), vget_low_s16(sums_16b[1]));
    sums_32b[1]=vaddl_high_s16(sums_16b[0], sums_16b[1]);
    sums_32b[2]=vaddl_s16(vget_low_s16(sums_16b[2]), vget_low_s16(sums_16b[3]));
    sums_32b[3]=vaddl_high_s16(sums_16b[2], sums_16b[3]);

    for (unsigned int i=0; i<4; i++) {
        vst1q_s32(output + 4*i, vaddq_s32(sums_32b[i], vld1q_s32(output + 4*i)));
    }
}

/* "first_col" parameter is used to offset the read into the qp.bias array,
 * in cases where we are not computing the first columns of the output (i.e.
 * in multithreaded cases where we divide columns across threads) */
template<typename T>
void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col) {
    /* Only actually add up the columns if a_offset is non-zero. */
    if (qp.a_offset != 0) {
        memset(reinterpret_cast<void *>(col_bias), 0, width * sizeof(int32_t));

        for (unsigned int row=0; row<height; row+=4) {
            unsigned int numrows=std::min(height-row, 4u);

            for (unsigned int col=0; col<width; col+=16) {
                unsigned int numcols=std::min(width-col, 16u);

                if (numcols==16) {
                    switch(numrows) {
                        case 1:
                            add_block<1>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        case 2:
                            add_block<2>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        case 3:
                            add_block<3>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        case 4:
                            add_block<4>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        default:
                            UNREACHABLE("Impossible.");
                    }
                } else {
                    for (; col<width; col++) {
                        int32_t sum=0;
                        for (unsigned int r=0; r<numrows; r++) {
                            sum += input[(row + r)*in_stride + col];
                        }
                        col_bias[col] += sum;
                    }
                }
            }
        }
    }

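    /* Finish the column bias off: fold in the constant term
     * (a_offset * b_offset * depth), multiply the raw column sum by
     * -a_offset, and add any externally supplied bias, so that the
     * requantize code above only needs to add row_bias and col_bias onto
     * each accumulator. */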
    for (unsigned int col=0; col<width; col++) {
        int32_t result = col_bias[col];

        result = (qp.a_offset * qp.b_offset * depth) - (result * qp.a_offset);

        if (qp.bias != nullptr) {
            result += qp.bias[multi * qp.bias_multi_stride + col + first_col];
        }

        col_bias[col] = result;
    }
}

template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);
template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);

} // namespace arm_gemm

#endif // __aarch64__