/*
 * Copyright (c) 2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef __aarch64__

#include "arm_gemm.hpp"
#include "utils.hpp"

#include <arm_neon.h>

namespace arm_gemm {

namespace {

/* Requantize a block of data, using the requantize parameters in 'qp'.
 *
 * row_bias and col_bias are assumed to be precomputed values which include
 * any externally supplied bias, plus the row/column contribution sums, plus
 * the overall constant offset (A_offset * B_offset * depth).
 *
 * Note that this function works equally well for uint8_t output: just set
 * minval/maxval appropriately and cast the output pointer.  It is the
 * caller's responsibility to ensure that minval/maxval are representable in
 * the target type - the downcast to (u)int8_t is done by simply extracting
 * the LSB.
 *
 * The 'do_shift_correction' template parameter turns on the correction
 * applied to negative values being shifted right to make sure they round
 * properly - if negative values are never output (e.g. fused ReLU) this is
 * unnecessary.
 *
 * The 'per_channel' template parameter selects between per channel and per
 * layer requantization - in the former case we need to load vectors of
 * shifts and multipliers for each column.  A separate vector for each
 * column is set up in any case (and it is hoped that the compiler can
 * elide the needless movs in the per-layer case).
 */
template<bool do_shift_correction, bool per_channel, bool do_left_shift>
void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigned int height,
                             const int32_t *input, unsigned int in_stride, int8_t *output, unsigned int out_stride,
                             const int32_t *row_bias, const int32_t *col_bias, const unsigned int start_col) {
    const int32x4_t v_mul         = vdupq_n_s32(qp.per_layer_mul);
    const int32x4_t v_right_shift = vdupq_n_s32(qp.per_layer_right_shift);
    const int32x4_t v_left_shift  = vdupq_n_s32(qp.per_layer_left_shift);
    const int32x4_t v_minval      = vdupq_n_s32(qp.minval);
    const int32x4_t v_maxval      = vdupq_n_s32(qp.maxval);
    const int32x4_t v_c_offset    = vdupq_n_s32(qp.c_offset);

    /* To make sure we have plenty of accumulators, compute two rows at a
     * time.  If the number of rows is odd, compute the bottom row twice to
     * avoid needing a duplicate codepath. */
    for (unsigned int row=0; row<height; row+=2) {
        /* Prefer to do 4 vectors (16 values) at once as this collapses
         * neatly to a single vector of output, failing that a vector at a
         * time and then the odd ones out at the end. */
        unsigned int blocks=(width / 16);
        unsigned int regs=(width % 16) / 4;
        unsigned int odds=(width % 4);

        const int32_t *colptr = col_bias;
        const int32_t *perch_mul_ptr    = qp.per_channel_muls + start_col;
        const int32_t *perch_shift_ptr  = qp.per_channel_right_shifts + start_col;
        const int32_t *perch_shiftl_ptr = qp.per_channel_left_shifts + start_col;

        const int32_t *in_ptr = input + (row * in_stride);
        int8_t *out_ptr = output + (row * out_stride);
        int32_t row_sum = row_bias[row];

        const int32_t *in_ptr1;
        int8_t *out_ptr1;
        int32_t row_sum1;

        if (row == height-1) {
            in_ptr1  = in_ptr;
            out_ptr1 = out_ptr;
            row_sum1 = row_sum;
        } else {
            in_ptr1  = in_ptr + in_stride;
            out_ptr1 = out_ptr + out_stride;
            row_sum1 = row_bias[row+1];
        }

        const int32x4_t v_row_sum  = vdupq_n_s32(row_sum);
        const int32x4_t v_row_sum1 = vdupq_n_s32(row_sum1);

        while (blocks--) {
            int32x4_t v_mul0;
            int32x4_t v_mul1;
            int32x4_t v_mul2;
            int32x4_t v_mul3;

            int32x4_t v_shf0;
            int32x4_t v_shf1;
            int32x4_t v_shf2;
            int32x4_t v_shf3;

            int32x4_t v_shf0l;
            int32x4_t v_shf1l;
            int32x4_t v_shf2l;
            int32x4_t v_shf3l;

            if (per_channel) {
                v_mul0 = vld1q_s32(perch_mul_ptr);
                v_mul1 = vld1q_s32(perch_mul_ptr + 4);
                v_mul2 = vld1q_s32(perch_mul_ptr + 8);
                v_mul3 = vld1q_s32(perch_mul_ptr + 12);
                perch_mul_ptr += 16;

                v_shf0 = vld1q_s32(perch_shift_ptr);
                v_shf1 = vld1q_s32(perch_shift_ptr + 4);
                v_shf2 = vld1q_s32(perch_shift_ptr + 8);
                v_shf3 = vld1q_s32(perch_shift_ptr + 12);
                perch_shift_ptr += 16;

                if (do_left_shift) {
                    v_shf0l = vld1q_s32(perch_shiftl_ptr);
                    v_shf1l = vld1q_s32(perch_shiftl_ptr + 4);
                    v_shf2l = vld1q_s32(perch_shiftl_ptr + 8);
                    v_shf3l = vld1q_s32(perch_shiftl_ptr + 12);
                }
            } else {
                v_mul0=v_mul1=v_mul2=v_mul3=v_mul;
                v_shf0=v_shf1=v_shf2=v_shf3=v_right_shift;
                v_shf0l=v_shf1l=v_shf2l=v_shf3l=v_left_shift;
            }

            // Load per-column bias values
            int32x4_t v_col0 = vld1q_s32(colptr);
            int32x4_t v_col1 = vld1q_s32(colptr + 4);
            int32x4_t v_col2 = vld1q_s32(colptr + 8);
            int32x4_t v_col3 = vld1q_s32(colptr + 12);
            colptr += 16;

            // Load input data (row 0)
            int32x4_t v_in00 = vld1q_s32(in_ptr);
            int32x4_t v_in01 = vld1q_s32(in_ptr + 4);
            int32x4_t v_in02 = vld1q_s32(in_ptr + 8);
            int32x4_t v_in03 = vld1q_s32(in_ptr + 12);
            in_ptr += 16;

            // Load input data (row 1)
            int32x4_t v_in10 = vld1q_s32(in_ptr1);
            int32x4_t v_in11 = vld1q_s32(in_ptr1 + 4);
            int32x4_t v_in12 = vld1q_s32(in_ptr1 + 8);
            int32x4_t v_in13 = vld1q_s32(in_ptr1 + 12);
            in_ptr1 += 16;

            // Add on row bias and column bias
            v_in00 = vaddq_s32(v_in00, v_row_sum);
            v_in01 = vaddq_s32(v_in01, v_row_sum);
            v_in02 = vaddq_s32(v_in02, v_row_sum);
            v_in03 = vaddq_s32(v_in03, v_row_sum);

            v_in10 = vaddq_s32(v_in10, v_row_sum1);
            v_in11 = vaddq_s32(v_in11, v_row_sum1);
            v_in12 = vaddq_s32(v_in12, v_row_sum1);
            v_in13 = vaddq_s32(v_in13, v_row_sum1);

            v_in00 = vaddq_s32(v_in00, v_col0);
            v_in01 = vaddq_s32(v_in01, v_col1);
            v_in02 = vaddq_s32(v_in02, v_col2);
            v_in03 = vaddq_s32(v_in03, v_col3);

            v_in10 = vaddq_s32(v_in10, v_col0);
            v_in11 = vaddq_s32(v_in11, v_col1);
            v_in12 = vaddq_s32(v_in12, v_col2);
            v_in13 = vaddq_s32(v_in13, v_col3);

            // Quantize

            // If a left shift is needed it needs to happen first.
            if (do_left_shift) {
                v_in00 = vrshlq_s32(v_in00, v_shf0l);
                v_in01 = vrshlq_s32(v_in01, v_shf1l);
                v_in02 = vrshlq_s32(v_in02, v_shf2l);
                v_in03 = vrshlq_s32(v_in03, v_shf3l);

                v_in10 = vrshlq_s32(v_in10, v_shf0l);
                v_in11 = vrshlq_s32(v_in11, v_shf1l);
                v_in12 = vrshlq_s32(v_in12, v_shf2l);
                v_in13 = vrshlq_s32(v_in13, v_shf3l);
            }

            // Multiply
            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);
            v_in01 = vqrdmulhq_s32(v_in01, v_mul1);
            v_in02 = vqrdmulhq_s32(v_in02, v_mul2);
            v_in03 = vqrdmulhq_s32(v_in03, v_mul3);

            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);
            v_in11 = vqrdmulhq_s32(v_in11, v_mul1);
            v_in12 = vqrdmulhq_s32(v_in12, v_mul2);
            v_in13 = vqrdmulhq_s32(v_in13, v_mul3);

            // Compute and add on corrective offset
            if (do_shift_correction) {
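                // v_shf holds right shifts as negative values for use with the
                // rounding shift (VRSHL) below.  (v_in & v_shf) has its sign bit
                // set only when the value is negative *and* a right shift is in
                // use; arithmetic-shifting that by 31 yields -1 (else 0), and the
                // saturating add nudges those lanes down by one so that negative
                // values round ties away from zero, matching the positive case.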
                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);
                int32x4_t v_temp01 = vandq_s32(v_in01, v_shf1);
                int32x4_t v_temp02 = vandq_s32(v_in02, v_shf2);
                int32x4_t v_temp03 = vandq_s32(v_in03, v_shf3);

                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);
                int32x4_t v_temp11 = vandq_s32(v_in11, v_shf1);
                int32x4_t v_temp12 = vandq_s32(v_in12, v_shf2);
                int32x4_t v_temp13 = vandq_s32(v_in13, v_shf3);

                v_temp00 = vshrq_n_s32(v_temp00, 31);
                v_temp01 = vshrq_n_s32(v_temp01, 31);
                v_temp02 = vshrq_n_s32(v_temp02, 31);
                v_temp03 = vshrq_n_s32(v_temp03, 31);

                v_temp10 = vshrq_n_s32(v_temp10, 31);
                v_temp11 = vshrq_n_s32(v_temp11, 31);
                v_temp12 = vshrq_n_s32(v_temp12, 31);
                v_temp13 = vshrq_n_s32(v_temp13, 31);

                v_in00 = vqaddq_s32(v_in00, v_temp00);
                v_in01 = vqaddq_s32(v_in01, v_temp01);
                v_in02 = vqaddq_s32(v_in02, v_temp02);
                v_in03 = vqaddq_s32(v_in03, v_temp03);

                v_in10 = vqaddq_s32(v_in10, v_temp10);
                v_in11 = vqaddq_s32(v_in11, v_temp11);
                v_in12 = vqaddq_s32(v_in12, v_temp12);
                v_in13 = vqaddq_s32(v_in13, v_temp13);
            }

            v_in00 = vrshlq_s32(v_in00, v_shf0);
            v_in01 = vrshlq_s32(v_in01, v_shf1);
            v_in02 = vrshlq_s32(v_in02, v_shf2);
            v_in03 = vrshlq_s32(v_in03, v_shf3);

            v_in10 = vrshlq_s32(v_in10, v_shf0);
            v_in11 = vrshlq_s32(v_in11, v_shf1);
            v_in12 = vrshlq_s32(v_in12, v_shf2);
            v_in13 = vrshlq_s32(v_in13, v_shf3);

            v_in00 = vaddq_s32(v_in00, v_c_offset);
            v_in01 = vaddq_s32(v_in01, v_c_offset);
            v_in02 = vaddq_s32(v_in02, v_c_offset);
            v_in03 = vaddq_s32(v_in03, v_c_offset);

            v_in10 = vaddq_s32(v_in10, v_c_offset);
            v_in11 = vaddq_s32(v_in11, v_c_offset);
            v_in12 = vaddq_s32(v_in12, v_c_offset);
            v_in13 = vaddq_s32(v_in13, v_c_offset);

            v_in00 = vmaxq_s32(v_in00, v_minval);
            v_in01 = vmaxq_s32(v_in01, v_minval);
            v_in02 = vmaxq_s32(v_in02, v_minval);
            v_in03 = vmaxq_s32(v_in03, v_minval);

            v_in10 = vmaxq_s32(v_in10, v_minval);
            v_in11 = vmaxq_s32(v_in11, v_minval);
            v_in12 = vmaxq_s32(v_in12, v_minval);
            v_in13 = vmaxq_s32(v_in13, v_minval);

            v_in00 = vminq_s32(v_in00, v_maxval);
            v_in01 = vminq_s32(v_in01, v_maxval);
            v_in02 = vminq_s32(v_in02, v_maxval);
            v_in03 = vminq_s32(v_in03, v_maxval);

            v_in10 = vminq_s32(v_in10, v_maxval);
            v_in11 = vminq_s32(v_in11, v_maxval);
            v_in12 = vminq_s32(v_in12, v_maxval);
            v_in13 = vminq_s32(v_in13, v_maxval);

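            // Narrow the clamped 32-bit results to 8 bits: UZP1 on the 16-bit view
            // keeps the low half of each 32-bit lane, and a second UZP1 on the
            // 8-bit view keeps the low byte, packing each row into one vector.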
            int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in01));
            int16x8_t v_uz01 = vuzp1q_s16(vreinterpretq_s16_s32(v_in02), vreinterpretq_s16_s32(v_in03));

            int16x8_t v_uz10 = vuzp1q_s16(vreinterpretq_s16_s32(v_in10), vreinterpretq_s16_s32(v_in11));
            int16x8_t v_uz11 = vuzp1q_s16(vreinterpretq_s16_s32(v_in12), vreinterpretq_s16_s32(v_in13));

            int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz01));
            int8x16_t v_uz1 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz10), vreinterpretq_s8_s16(v_uz11));

            vst1q_s8(out_ptr, v_uz0);
            out_ptr += 16;
            vst1q_s8(out_ptr1, v_uz1);
            out_ptr1 += 16;
        }

        while (regs--) {
            int32x4_t v_mul0;
            int32x4_t v_shf0;
            int32x4_t v_shf0l;

            if (per_channel) {
                v_mul0 = vld1q_s32(perch_mul_ptr);
                perch_mul_ptr += 4;

                v_shf0 = vld1q_s32(perch_shift_ptr);
                perch_shift_ptr += 4;

                if (do_left_shift) {
                    v_shf0l = vld1q_s32(perch_shiftl_ptr);
                    perch_shiftl_ptr += 4;
                }
            } else {
                v_mul0=v_mul;
                v_shf0=v_right_shift;
                v_shf0l=v_left_shift;
            }
            // Load per-column bias values
            int32x4_t v_col0 = vld1q_s32(colptr);
            colptr += 4;

            // Load input data (row 0)
            int32x4_t v_in00 = vld1q_s32(in_ptr);
            in_ptr += 4;

            // Load input data (row 1)
            int32x4_t v_in10 = vld1q_s32(in_ptr1);
            in_ptr1 += 4;

            // Add on row sum and bias constant
            v_in00 = vaddq_s32(v_in00, v_row_sum);

            v_in10 = vaddq_s32(v_in10, v_row_sum1);

            // Subtract col sum * a_offset
            v_in00 = vaddq_s32(v_in00, v_col0);

            v_in10 = vaddq_s32(v_in10, v_col0);

            // Quantize - start with (optional) left shift
            if (do_left_shift) {
                v_in00 = vrshlq_s32(v_in00, v_shf0l);

                v_in10 = vrshlq_s32(v_in10, v_shf0l);
            }

            // Then multiply
            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);

            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);

            // Compute and add on corrective offset
            if (do_shift_correction) {
                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);

                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);

                v_temp00 = vshrq_n_s32(v_temp00, 31);

                v_temp10 = vshrq_n_s32(v_temp10, 31);

                v_in00 = vqaddq_s32(v_in00, v_temp00);

                v_in10 = vqaddq_s32(v_in10, v_temp10);
            }

            v_in00 = vrshlq_s32(v_in00, v_shf0);

            v_in10 = vrshlq_s32(v_in10, v_shf0);

            v_in00 = vaddq_s32(v_in00, v_c_offset);

            v_in10 = vaddq_s32(v_in10, v_c_offset);

            v_in00 = vmaxq_s32(v_in00, v_minval);

            v_in10 = vmaxq_s32(v_in10, v_minval);

            v_in00 = vminq_s32(v_in00, v_maxval);

            v_in10 = vminq_s32(v_in10, v_maxval);

            int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in10));

            int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz00));

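            // After the two UZP1 steps the four output bytes for row 0 sit in
            // 32-bit lane 0 of v_uz0 and row 1's sit in lane 1, so store one
            // 32-bit lane per row.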
            vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr), vreinterpretq_s32_s8(v_uz0), 0);
            out_ptr += 4;
            vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr1), vreinterpretq_s32_s8(v_uz0), 1);
            out_ptr1 += 4;
        }

        if (odds) {
            int32x4_t v_col0 = vdupq_n_s32(0);
            int32x4_t v_in00 = vdupq_n_s32(0);
            int32x4_t v_in10 = vdupq_n_s32(0);
            int32x4_t v_mul0 = vdupq_n_s32(0);
            int32x4_t v_shf0 = vdupq_n_s32(0);
            int32x4_t v_shf0l = vdupq_n_s32(0);

            if (!per_channel) {
                v_mul0 = v_mul;
                v_shf0 = v_right_shift;
                v_shf0l = v_left_shift;
            }

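            // Load the 1-3 leftover columns one lane at a time; the do { } while (0)
            // construct lets us break out as soon as the right number of lanes has
            // been loaded.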
            do {
                v_col0 = vld1q_lane_s32(colptr, v_col0, 0);
                v_in00 = vld1q_lane_s32(in_ptr, v_in00, 0);
                v_in10 = vld1q_lane_s32(in_ptr1, v_in10, 0);
                if (per_channel) {
                    v_mul0 = vld1q_lane_s32(perch_mul_ptr, v_mul0, 0);
                    v_shf0 = vld1q_lane_s32(perch_shift_ptr, v_shf0, 0);
                    if (do_left_shift) {
                        v_shf0l = vld1q_lane_s32(perch_shiftl_ptr, v_shf0l, 0);
                    }
                }
                if (odds == 1) { break; }

                v_col0 = vld1q_lane_s32(colptr + 1, v_col0, 1);
                v_in00 = vld1q_lane_s32(in_ptr + 1, v_in00, 1);
                v_in10 = vld1q_lane_s32(in_ptr1 + 1, v_in10, 1);
                if (per_channel) {
                    v_mul0 = vld1q_lane_s32(perch_mul_ptr + 1, v_mul0, 1);
                    v_shf0 = vld1q_lane_s32(perch_shift_ptr + 1, v_shf0, 1);
                    if (do_left_shift) {
                        v_shf0l = vld1q_lane_s32(perch_shiftl_ptr + 1, v_shf0l, 1);
                    }
                }
                if (odds == 2) { break; }

                v_col0 = vld1q_lane_s32(colptr + 2, v_col0, 2);
                v_in00 = vld1q_lane_s32(in_ptr + 2, v_in00, 2);
                v_in10 = vld1q_lane_s32(in_ptr1 + 2, v_in10, 2);
                if (per_channel) {
                    v_mul0 = vld1q_lane_s32(perch_mul_ptr + 2, v_mul0, 2);
                    v_shf0 = vld1q_lane_s32(perch_shift_ptr + 2, v_shf0, 2);
                    if (do_left_shift) {
                        v_shf0l = vld1q_lane_s32(perch_shiftl_ptr + 2, v_shf0l, 2);
                    }
                }
            } while (0);

            // Add on row sum and bias constant
            v_in00 = vaddq_s32(v_in00, v_row_sum);

            v_in10 = vaddq_s32(v_in10, v_row_sum1);

            // Subtract col sum * a_offset
            v_in00 = vaddq_s32(v_in00, v_col0);

            v_in10 = vaddq_s32(v_in10, v_col0);

            // Quantize - start with (optional) left shift
            if (do_left_shift) {
                v_in00 = vrshlq_s32(v_in00, v_shf0l);

                v_in10 = vrshlq_s32(v_in10, v_shf0l);
            }

            // Then multiply
            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);

            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);

            // Compute and add on corrective offset
            if (do_shift_correction) {
                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);

                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);

                v_temp00 = vshrq_n_s32(v_temp00, 31);

                v_temp10 = vshrq_n_s32(v_temp10, 31);

                v_in00 = vqaddq_s32(v_in00, v_temp00);

                v_in10 = vqaddq_s32(v_in10, v_temp10);
            }

            v_in00 = vrshlq_s32(v_in00, v_shf0);

            v_in10 = vrshlq_s32(v_in10, v_shf0);

            v_in00 = vaddq_s32(v_in00, v_c_offset);

            v_in10 = vaddq_s32(v_in10, v_c_offset);

            v_in00 = vmaxq_s32(v_in00, v_minval);

            v_in10 = vmaxq_s32(v_in10, v_minval);

            v_in00 = vminq_s32(v_in00, v_maxval);

            v_in10 = vminq_s32(v_in10, v_maxval);

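            // Store the leftover bytes one at a time; byte lanes 0, 4 and 8 of the
            // 8-bit view are the low bytes of 32-bit lanes 0, 1 and 2 respectively.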
            do {
                vst1q_lane_s8(out_ptr, vreinterpretq_s8_s32(v_in00), 0);
                vst1q_lane_s8(out_ptr1, vreinterpretq_s8_s32(v_in10), 0);

                if (odds==1) { break; }

                vst1q_lane_s8(out_ptr + 1, vreinterpretq_s8_s32(v_in00), 4);
                vst1q_lane_s8(out_ptr1 + 1, vreinterpretq_s8_s32(v_in10), 4);

                if (odds==2) { break; }

                vst1q_lane_s8(out_ptr + 2, vreinterpretq_s8_s32(v_in00), 8);
                vst1q_lane_s8(out_ptr1 + 2, vreinterpretq_s8_s32(v_in10), 8);
            } while(0);
        }
    }
}

} // anonymous namespace

template<typename Tin, typename Tout>
void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                         const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride,
                         const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col) {
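    // Dispatch to the right template instantiation: per-channel vs per-layer
    // requantization, whether any left shift is needed, and whether the
    // negative-value shift correction can be skipped (when qp.minval >= qp.c_offset
    // any value that could be mis-rounded is clamped away anyway).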
    if (qp.per_channel_requant) {
        if (qp.minval >= qp.c_offset) {
            if (qp.per_channel_left_shifts) {
                requantize_block_32_int<false, true, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                 reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<false, true, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                 reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        } else {
            if (qp.per_channel_left_shifts) {
                requantize_block_32_int<true, true, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                 reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<true, true, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                 reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        }
    } else {
        if (qp.minval >= qp.c_offset) {
            if (qp.per_layer_left_shift > 0) {
                requantize_block_32_int<false, false, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                 reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<false, false, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                 reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        } else {
            if (qp.per_layer_left_shift > 0) {
                requantize_block_32_int<true, false, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                 reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<true, false, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                 reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        }
    }
}

template void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                                  const int32_t *input, unsigned int in_stride, int8_t *output, unsigned int out_stride,
                                  const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);

template void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                                  const uint32_t *input, unsigned int in_stride, uint8_t *output, unsigned int out_stride,
                                  const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);

/*
 * Routine (and helpers) to compute row sums needed for offset correction.
 *
 * This is often needed for a lot of short rows (e.g. Syrax 5 - 6400 rows
 * of length 27), therefore it's important not to sacrifice performance on
 * odd length rows.
 *
 * To minimize performance loss in these cases, this routine will overread
 * by up to 7 bytes.
 *
 * This is handled via "mask" and "mask mode" parameters to the inner
 * routines; mask mode == 1 indicates that there are between 1 and 8 bytes
 * (inclusive) needed at the end; in these cases we always read 8 bytes.
 * mask mode == 2 indicates that there are between 9 and 15 bytes needed at
 * the end, and in this case we always read 16 bytes.  In both cases the
 * 'mask' vector is set up so that the read value can be masked off to clear
 * the overread lanes.  This is handled by 'accumulate_masked_8' and
 * 'accumulate_masked_16' below.
 *
 * This routine is templated on the type to be accumulated, because the
 * innermost instruction used needs to be of the correct signedness.
 * However, beyond this point we always use signed values in both cases.
 * The instructions that need to be different are therefore wrapped in
 * helper functions below.
 *
 * The general strategy used is to load vectors of 16 bytes and accumulate
 * (using uadalp/sadalp or AArch32 equivalents) into 8x16-bit accumulators.
 * These are then reduced (using uadalp/sadalp again) into 4x32-bit
 * accumulators.  The 4 accumulators for up to 4 rows being processed are
 * then added together into a single output vector using pairwise adds.
 *
 * This reduction from the 8x16-bit into the 4x32-bit accumulators needs to
 * occur before the 16-bit accumulators can overflow - which is every 32
 * iterations (512 total bytes processed).  This is explained more below.
 */
namespace {
    struct row_sum_helpers {
        const Requantize32 &qp;

        /* Load a full 16 byte vector, pairwise accumulate into 'sum' with uadalp or sadalp */
        template<typename T>
        inline int16x8_t accumulate_16(const T *ptr, int16x8_t sum);

        /* Load a full 16 byte vector, but mask before accumulation (see above). */
        template<typename T>
        inline int16x8_t accumulate_masked_16(const T *ptr, int16x8_t sum, uint64x2_t mask);

        /* Load 8 bytes and mask before accumulation. */
        template<typename T>
        inline int16x8_t accumulate_masked_8(const T *ptr, int16x8_t sum, uint64x2_t mask);

        /* This function does the actual work for up to 4 rows at a time.
         * It's pulled out so we can template on the row count to generate
         * the 4 different cases.  4 rows are computed at a time as this
         * reduces to a single vector write. */
        template<unsigned int rows, typename T>
        void compute_some_rows(unsigned int blocks, const T *input, unsigned int in_stride, int32_t *row_bias, unsigned int mask_mode, uint64x2_t mask, int32x4_t offset_mul) {
            int16x8_t sums[rows];
            int32x4_t finalsums[rows];

            for (unsigned int i=0; i<rows; i++) {
                sums[i]      = vdupq_n_s16(0);
                finalsums[i] = vdupq_n_s32(0);
            }

            for (unsigned int i=0; i<blocks; i++) {
                for (unsigned int r=0; r<rows; r++) {
                    /* If we add too many blocks together, we run the risk
                     * of overflowing the intermediate 16-bit accumulators,
                     * especially in the unsigned case where we later treat
                     * the accumulator as signed.
                     *
                     * In that case, the maximum (signed) value is 16383,
                     * which is safe for 64 (unsigned) accumulations (255*64
                     * = 16,320).
                     *
                     * Each invocation of pairwise add adds 2 values to the
                     * accumulator - so in the unsigned case we can do 32
                     * adds before we need to reset the 16-bit accumulator
                     * by adding into the 32-bit 'finalsums'.
                     *
                     * We could do 64 adds in the signed case, but that
                     * optimization is not worth the complexity.
                     */
                    if (i > 0 && ((i & 31) == 0)) {
                        finalsums[r] = vpadalq_s16(finalsums[r], sums[r]);
                        sums[r] = vdupq_n_s16(0);
                    }
                    sums[r] = accumulate_16(input + (r * in_stride) + (i * 16), sums[r]);
                }
            }

            /* Handle the final masked read if needed. */
            if (mask_mode > 0) {
                for (unsigned int r=0; r<rows; r++) {
                    if (mask_mode == 1) {
                        sums[r] = accumulate_masked_8(input + (r * in_stride) + (blocks * 16), sums[r], mask);
                    } else {
                        sums[r] = accumulate_masked_16(input + (r * in_stride) + (blocks * 16), sums[r], mask);
                    }
                }
            }

            for (unsigned int i=0; i<rows; i++) {
                finalsums[i] = vpadalq_s16(finalsums[i], sums[i]);
            }

            int32x4_t t0, t1;
            int32x2_t t2;

            /* Result writeback - need to write back one value per row
             * processed.  Multiply all the final totals by -b_offset so
             * that the terms can simply be added in the requantize code. */
            switch (rows) {
                case 1:
                    /* If we only have one output, just use ADDV.  Multiply
                     * the offset into all four components separately so it
                     * can stay in the SIMD register file. */
                    t0 = vmulq_s32(finalsums[0], offset_mul);
                    *row_bias = vaddvq_s32(t0);
                    break;

                case 2:
                    /* For two outputs, two rounds of pairwise adds will
                     * generate the result in a 2-vector we can store in one
                     * go. */
                    t0 = vpaddq_s32(finalsums[0], finalsums[1]);
                    t0 = vpaddq_s32(t0, t0);
                    t2 = vmul_s32(vget_low_s32(t0), vget_low_s32(offset_mul));
                    vst1_s32(row_bias, t2);
                    break;

                case 3:
                    /* Three rows - need to store the low two words plus the odd value from lane 2 */
                    t0 = vpaddq_s32(finalsums[0], finalsums[1]);
                    t1 = vpaddq_s32(finalsums[2], finalsums[2]);

                    t0 = vpaddq_s32(t0, t1);
                    t0 = vmulq_s32(t0, offset_mul);

                    vst1_s32(row_bias, vget_low_s32(t0));
                    row_bias[2] = vgetq_lane_s32(t0, 2);
                    break;

                case 4:
                    /* Four rows (most common case) - reduce to a single
                     * vector with pairwise adds. */
                    t0 = vpaddq_s32(finalsums[0], finalsums[1]);
                    t1 = vpaddq_s32(finalsums[2], finalsums[3]);

                    t0 = vpaddq_s32(t0, t1);
                    t0 = vmulq_s32(t0, offset_mul);

                    vst1q_s32(row_bias, t0);
                    break;

                default:
                    UNREACHABLE("Impossible.");
            }
        }

        row_sum_helpers(const Requantize32 &qp) : qp(qp) { }
    };

    template<>
    int16x8_t row_sum_helpers::accumulate_16(const uint8_t *ptr, int16x8_t sum) {
        return vreinterpretq_s16_u16(vpadalq_u8(vreinterpretq_u16_s16(sum), vld1q_u8(ptr)));
    }

    template<>
    int16x8_t row_sum_helpers::accumulate_16(const int8_t *ptr, int16x8_t sum) {
        return vpadalq_s8(sum, vld1q_s8(ptr));
    }

    template<>
    int16x8_t row_sum_helpers::accumulate_masked_16(const int8_t *ptr, int16x8_t sum, uint64x2_t mask) {
        int8x16_t v = vandq_s8(vld1q_s8(ptr), vreinterpretq_s8_u64(mask));
        return vpadalq_s8(sum, v);
    }

    template<>
    int16x8_t row_sum_helpers::accumulate_masked_16(const uint8_t *ptr, int16x8_t sum, uint64x2_t mask) {
        uint8x16_t v = vandq_u8(vld1q_u8(ptr), vreinterpretq_u8_u64(mask));
        return vreinterpretq_s16_u16(vpadalq_u8(vreinterpretq_u16_s16(sum), v));
    }

    template<>
    int16x8_t row_sum_helpers::accumulate_masked_8(const int8_t *ptr, int16x8_t sum, uint64x2_t mask) {
        int8x16_t v = vcombine_s8(vld1_s8(ptr), vdup_n_s8(0));
        v = vreinterpretq_s8_u64(vandq_u64(mask, vreinterpretq_u64_s8(v)));
        return vpadalq_s8(sum, v);
    }

    template<>
    int16x8_t row_sum_helpers::accumulate_masked_8(const uint8_t *ptr, int16x8_t sum, uint64x2_t mask) {
        uint8x16_t v = vcombine_u8(vld1_u8(ptr), vdup_n_u8(0));
        v = vreinterpretq_u8_u64(vandq_u64(mask, vreinterpretq_u64_u8(v)));
        return vreinterpretq_s16_u16(vpadalq_u8(vreinterpretq_u16_s16(sum), v));
    }
}

template<typename T>
void compute_row_sums(const Requantize32 &qp, unsigned int width, unsigned int height,
                      const T *input, unsigned int in_stride, int32_t *row_bias) {
    /* If the 'b' offset is zero, just skip this entirely. */
    if (qp.b_offset == 0) {
        memset(row_bias, 0, height * sizeof(int32_t));
        return;
    }

    row_sum_helpers thehelpers(qp);

    const int32x4_t offset_mul = vdupq_n_s32(-qp.b_offset);

    /* Work out how many full vectors of 16 bytes we will read, and how many
     * odd bytes at the end */
    unsigned int blocks = (width / 16);
    const unsigned int odds = width % 16;

    /* Generate a mask to use on the last iteration, if necessary. */
    uint64x2_t mask;
    unsigned int mask_mode = 0;

    if (odds > 0 && odds <= 8) {
        /* 1-8 odds: mask in the low lane, 0 in the top */
        uint64_t maskval = (~0ULL) >> (8 * (8-odds));
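        /* e.g. odds==3 keeps only the low 24 bits (3 bytes) set, so the
         * overread lanes are zeroed before accumulation. */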

        mask = vsetq_lane_u64(maskval, vdupq_n_u64(0), 0);

        mask_mode = 1;
    } else if (odds > 8) {
        /* 9-15 odds: mask in the top lane, all 1s in the bottom. */
        uint64_t maskval = (~0ULL) >> (8 * (16-odds));

        mask = vsetq_lane_u64(maskval, vdupq_n_u64(~0ULL), 1);

        mask_mode = 2;
    }

    for (unsigned int row=0; row<height; row+=4) {
        switch(height-row) {
            default:
            case 4:
                thehelpers.compute_some_rows<4>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
            case 3:
                thehelpers.compute_some_rows<3>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
            case 2:
                thehelpers.compute_some_rows<2>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
            case 1:
                thehelpers.compute_some_rows<1>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
        }
    }
}

/* Instantiate the two versions for uint8_t and int8_t. */
template void compute_row_sums(const Requantize32 &, unsigned int, unsigned int, const int8_t *, unsigned int, int32_t *);
template void compute_row_sums(const Requantize32 &, unsigned int, unsigned int, const uint8_t *, unsigned int, int32_t *);

template<unsigned int active_rows, typename T>
inline void add_block(const T *input, unsigned int in_stride, int32_t *output);

template<unsigned int active_rows>
inline void add_block(const uint8_t *input, unsigned int in_stride, int32_t *output) {
    uint8x16_t inputs[4];

    for (unsigned int i=0; i<4; i++) {
        if (i < active_rows) {
            inputs[i] = vld1q_u8(input + i * in_stride);
        } else {
            inputs[i] = vdupq_n_u8(0);
        }
    }

    int16x8_t sums_16b[4];

    // Two adds for the low pairs
    sums_16b[0]=vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(inputs[0]), vget_low_u8(inputs[1])));
    sums_16b[1]=vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(inputs[2]), vget_low_u8(inputs[3])));
    // Two adds for the high pairs
    sums_16b[2]=vreinterpretq_s16_u16(vaddl_high_u8(inputs[0], inputs[1]));
    sums_16b[3]=vreinterpretq_s16_u16(vaddl_high_u8(inputs[2], inputs[3]));

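    // Widen to 32 bits: after these adds each of the 16 columns has its own
    // 32-bit total for the (up to) 4 rows processed in this call.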
    int32x4_t sums_32b[4];

    sums_32b[0]=vaddl_s16(vget_low_s16(sums_16b[0]), vget_low_s16(sums_16b[1]));
    sums_32b[1]=vaddl_high_s16(sums_16b[0], sums_16b[1]);
    sums_32b[2]=vaddl_s16(vget_low_s16(sums_16b[2]), vget_low_s16(sums_16b[3]));
    sums_32b[3]=vaddl_high_s16(sums_16b[2], sums_16b[3]);

    for (unsigned int i=0; i<4; i++) {
        vst1q_s32(output + 4*i, vaddq_s32(sums_32b[i], vld1q_s32(output + 4*i)));
    }
}

template<unsigned int active_rows>
inline void add_block(const int8_t *input, unsigned int in_stride, int32_t *output) {
    int8x16_t inputs[4];

    for (unsigned int i=0; i<4; i++) {
        if (i < active_rows) {
            inputs[i] = vld1q_s8(input + i * in_stride);
        } else {
            inputs[i] = vdupq_n_s8(0);
        }
    }

    int16x8_t sums_16b[4];

    // Two adds for the low pairs
    sums_16b[0]=vaddl_s8(vget_low_s8(inputs[0]), vget_low_s8(inputs[1]));
    sums_16b[1]=vaddl_s8(vget_low_s8(inputs[2]), vget_low_s8(inputs[3]));
    // Two adds for the high pairs
    sums_16b[2]=vaddl_high_s8(inputs[0], inputs[1]);
    sums_16b[3]=vaddl_high_s8(inputs[2], inputs[3]);

    int32x4_t sums_32b[4];

    sums_32b[0]=vaddl_s16(vget_low_s16(sums_16b[0]), vget_low_s16(sums_16b[1]));
    sums_32b[1]=vaddl_high_s16(sums_16b[0], sums_16b[1]);
    sums_32b[2]=vaddl_s16(vget_low_s16(sums_16b[2]), vget_low_s16(sums_16b[3]));
    sums_32b[3]=vaddl_high_s16(sums_16b[2], sums_16b[3]);

    for (unsigned int i=0; i<4; i++) {
        vst1q_s32(output + 4*i, vaddq_s32(sums_32b[i], vld1q_s32(output + 4*i)));
    }
}

/* "first_col" parameter is used to offset the read into the qp.bias array,
 * in cases where we are not computing the first columns of the output (i.e.
 * in multithreaded cases where we divide columns across threads) */
template<typename T>
void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col) {
    /* Only actually add up the columns if a_offset is non-zero. */
    if (qp.a_offset != 0) {
        memset(reinterpret_cast<void *>(col_bias), 0, width * sizeof(int32_t));

        for (unsigned int row=0; row<height; row+=4) {
            unsigned int numrows=std::min(height-row, 4u);

            for (unsigned int col=0; col<width; col+=16) {
                unsigned int numcols=std::min(width-col, 16u);

                if (numcols==16) {
                    switch(numrows) {
                        case 1:
                            add_block<1>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        case 2:
                            add_block<2>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        case 3:
                            add_block<3>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        case 4:
                            add_block<4>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        default:
                            UNREACHABLE("Impossible.");
                    }
                } else {
                    for (; col<width; col++) {
                        int32_t sum=0;
                        for (unsigned int r=0; r<numrows; r++) {
                            sum += input[(row + r)*in_stride + col];
                        }
                        col_bias[col] += sum;
                    }
                }
            }
        }
    }

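    // Fold the column sums into the final per-column bias: the constant term
    // (a_offset * b_offset * depth) minus col_sum * a_offset, plus any
    // user-supplied bias for this column.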
    for (unsigned int col=0; col<width; col++) {
        int32_t result = col_bias[col];

        result = (qp.a_offset * qp.b_offset * depth) - (result * qp.a_offset);

        if (qp.bias != nullptr) {
            result += qp.bias[multi * qp.bias_multi_stride + col + first_col];
        }

        col_bias[col] = result;
    }
}

template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);
template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);

} // namespace arm_gemm

#endif // __aarch64__