/*
 * Copyright (c) 2019 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifdef __aarch64__

#include "arm_gemm.hpp"
#include "utils.hpp"

#include <arm_neon.h>

#include <algorithm>  // std::min (used in compute_col_sums)
#include <cstring>    // memset

namespace arm_gemm {

namespace {

/* Requantize a block of data, using the requantize parameters in 'qp'.
 *
 * row_bias and col_bias are assumed to be precomputed values which include
 * any externally supplied bias, plus the row/column contribution sums, plus
 * the overall constant offset (A_offset * B_offset * depth).
 *
 * Note that this function works equally well for uint8_t output: just set
 * minval/maxval appropriately and cast the output pointer.  It is the
 * caller's responsibility to ensure that minval/maxval are representable in
 * the target type - the downcast to (u)int8_t is done by simply extracting
 * the LSB.
 *
 * The 'do_shift_correction' template parameter turns on the correction
 * applied to negative values being shifted right, to make sure they round
 * properly - if negative values are never output (e.g. with a fused ReLU)
 * this is unnecessary.
 *
 * The 'per_channel' template parameter selects between per-channel and
 * per-layer requantization - in the former case we need to load vectors of
 * shifts and multipliers for each column.  A separate vector for each
 * column is set up in either case (and it is hoped that the compiler can
 * elide the needless movs in the per-layer case).
 */
template<bool do_shift_correction, bool per_channel, bool do_left_shift>
void requantize_block_32_int(const Requantize32 &qp, unsigned int width, unsigned int height,
                             const int32_t *input, unsigned int in_stride, int8_t *output, unsigned int out_stride,
                             const int32_t *row_bias, const int32_t *col_bias, const unsigned int start_col) {
    const int32x4_t v_mul         = vdupq_n_s32(qp.per_layer_mul);
    const int32x4_t v_right_shift = vdupq_n_s32(qp.per_layer_right_shift);
    const int32x4_t v_left_shift  = vdupq_n_s32(qp.per_layer_left_shift);
    const int32x4_t v_minval      = vdupq_n_s32(qp.minval);
    const int32x4_t v_maxval      = vdupq_n_s32(qp.maxval);
    const int32x4_t v_c_offset    = vdupq_n_s32(qp.c_offset);

    /* To make sure we have plenty of accumulators, compute two rows at a
     * time.  If the number of rows is odd, compute the bottom row twice to
     * avoid needing a duplicate codepath. */
    for (unsigned int row=0; row<height; row+=2) {
        /* Prefer to do 4 vectors (16 values) at once as this collapses
         * neatly to a single vector of output, failing that a vector at a
         * time and then the odd ones out at the end. */
        unsigned int blocks = (width / 16);
        unsigned int regs   = (width % 16) / 4;
        unsigned int odds   = (width % 4);
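        // Worked example: width == 23 gives blocks == 1 (one run of 16
        // values), regs == 1 (one run of 4 values) and odds == 3, which
        // together cover all 23 columns.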

        const int32_t *colptr = col_bias;
        const int32_t *perch_mul_ptr = qp.per_channel_muls + start_col;
        const int32_t *perch_shift_ptr = qp.per_channel_right_shifts + start_col;
        const int32_t *perch_shiftl_ptr = qp.per_channel_left_shifts + start_col;

        const int32_t *in_ptr = input + (row * in_stride);
        int8_t *out_ptr = output + (row * out_stride);
        int32_t row_sum = row_bias[row];

        const int32_t *in_ptr1;
        int8_t *out_ptr1;
        int32_t row_sum1;

        if (row == height-1) {
            in_ptr1 = in_ptr;
            out_ptr1 = out_ptr;
            row_sum1 = row_sum;
        } else {
            in_ptr1 = in_ptr + in_stride;
            out_ptr1 = out_ptr + out_stride;
            row_sum1 = row_bias[row+1];
        }

        const int32x4_t v_row_sum = vdupq_n_s32(row_sum);
        const int32x4_t v_row_sum1 = vdupq_n_s32(row_sum1);

        while (blocks--) {
            int32x4_t v_mul0;
            int32x4_t v_mul1;
            int32x4_t v_mul2;
            int32x4_t v_mul3;

            int32x4_t v_shf0;
            int32x4_t v_shf1;
            int32x4_t v_shf2;
            int32x4_t v_shf3;

            int32x4_t v_shf0l;
            int32x4_t v_shf1l;
            int32x4_t v_shf2l;
            int32x4_t v_shf3l;

            if (per_channel) {
                v_mul0 = vld1q_s32(perch_mul_ptr);
                v_mul1 = vld1q_s32(perch_mul_ptr + 4);
                v_mul2 = vld1q_s32(perch_mul_ptr + 8);
                v_mul3 = vld1q_s32(perch_mul_ptr + 12);
                perch_mul_ptr += 16;

                v_shf0 = vld1q_s32(perch_shift_ptr);
                v_shf1 = vld1q_s32(perch_shift_ptr + 4);
                v_shf2 = vld1q_s32(perch_shift_ptr + 8);
                v_shf3 = vld1q_s32(perch_shift_ptr + 12);
                perch_shift_ptr += 16;

                if (do_left_shift) {
                    v_shf0l = vld1q_s32(perch_shiftl_ptr);
                    v_shf1l = vld1q_s32(perch_shiftl_ptr + 4);
                    v_shf2l = vld1q_s32(perch_shiftl_ptr + 8);
                    v_shf3l = vld1q_s32(perch_shiftl_ptr + 12);
                    perch_shiftl_ptr += 16;
                }
            } else {
                v_mul0 = v_mul1 = v_mul2 = v_mul3 = v_mul;
                v_shf0 = v_shf1 = v_shf2 = v_shf3 = v_right_shift;
                v_shf0l = v_shf1l = v_shf2l = v_shf3l = v_left_shift;
            }

            // Load column bias values
            int32x4_t v_col0 = vld1q_s32(colptr);
            int32x4_t v_col1 = vld1q_s32(colptr + 4);
            int32x4_t v_col2 = vld1q_s32(colptr + 8);
            int32x4_t v_col3 = vld1q_s32(colptr + 12);
            colptr += 16;

            // Load input data (row 0)
            int32x4_t v_in00 = vld1q_s32(in_ptr);
            int32x4_t v_in01 = vld1q_s32(in_ptr + 4);
            int32x4_t v_in02 = vld1q_s32(in_ptr + 8);
            int32x4_t v_in03 = vld1q_s32(in_ptr + 12);
            in_ptr += 16;

            // Load input data (row 1)
            int32x4_t v_in10 = vld1q_s32(in_ptr1);
            int32x4_t v_in11 = vld1q_s32(in_ptr1 + 4);
            int32x4_t v_in12 = vld1q_s32(in_ptr1 + 8);
            int32x4_t v_in13 = vld1q_s32(in_ptr1 + 12);
            in_ptr1 += 16;

            // Add on row bias and column bias
            v_in00 = vaddq_s32(v_in00, v_row_sum);
            v_in01 = vaddq_s32(v_in01, v_row_sum);
            v_in02 = vaddq_s32(v_in02, v_row_sum);
            v_in03 = vaddq_s32(v_in03, v_row_sum);

            v_in10 = vaddq_s32(v_in10, v_row_sum1);
            v_in11 = vaddq_s32(v_in11, v_row_sum1);
            v_in12 = vaddq_s32(v_in12, v_row_sum1);
            v_in13 = vaddq_s32(v_in13, v_row_sum1);

            v_in00 = vaddq_s32(v_in00, v_col0);
            v_in01 = vaddq_s32(v_in01, v_col1);
            v_in02 = vaddq_s32(v_in02, v_col2);
            v_in03 = vaddq_s32(v_in03, v_col3);

            v_in10 = vaddq_s32(v_in10, v_col0);
            v_in11 = vaddq_s32(v_in11, v_col1);
            v_in12 = vaddq_s32(v_in12, v_col2);
            v_in13 = vaddq_s32(v_in13, v_col3);

            // Quantize

            // If a left shift is needed it needs to happen first.
            if (do_left_shift) {
                v_in00 = vrshlq_s32(v_in00, v_shf0l);
                v_in01 = vrshlq_s32(v_in01, v_shf1l);
                v_in02 = vrshlq_s32(v_in02, v_shf2l);
                v_in03 = vrshlq_s32(v_in03, v_shf3l);

                v_in10 = vrshlq_s32(v_in10, v_shf0l);
                v_in11 = vrshlq_s32(v_in11, v_shf1l);
                v_in12 = vrshlq_s32(v_in12, v_shf2l);
                v_in13 = vrshlq_s32(v_in13, v_shf3l);
            }

            // Multiply
            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);
            v_in01 = vqrdmulhq_s32(v_in01, v_mul1);
            v_in02 = vqrdmulhq_s32(v_in02, v_mul2);
            v_in03 = vqrdmulhq_s32(v_in03, v_mul3);

            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);
            v_in11 = vqrdmulhq_s32(v_in11, v_mul1);
            v_in12 = vqrdmulhq_s32(v_in12, v_mul2);
            v_in13 = vqrdmulhq_s32(v_in13, v_mul3);

            // Compute and add on corrective offset
            if (do_shift_correction) {
                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);
                int32x4_t v_temp01 = vandq_s32(v_in01, v_shf1);
                int32x4_t v_temp02 = vandq_s32(v_in02, v_shf2);
                int32x4_t v_temp03 = vandq_s32(v_in03, v_shf3);

                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);
                int32x4_t v_temp11 = vandq_s32(v_in11, v_shf1);
                int32x4_t v_temp12 = vandq_s32(v_in12, v_shf2);
                int32x4_t v_temp13 = vandq_s32(v_in13, v_shf3);

                v_temp00 = vshrq_n_s32(v_temp00, 31);
                v_temp01 = vshrq_n_s32(v_temp01, 31);
                v_temp02 = vshrq_n_s32(v_temp02, 31);
                v_temp03 = vshrq_n_s32(v_temp03, 31);

                v_temp10 = vshrq_n_s32(v_temp10, 31);
                v_temp11 = vshrq_n_s32(v_temp11, 31);
                v_temp12 = vshrq_n_s32(v_temp12, 31);
                v_temp13 = vshrq_n_s32(v_temp13, 31);

                v_in00 = vqaddq_s32(v_in00, v_temp00);
                v_in01 = vqaddq_s32(v_in01, v_temp01);
                v_in02 = vqaddq_s32(v_in02, v_temp02);
                v_in03 = vqaddq_s32(v_in03, v_temp03);

                v_in10 = vqaddq_s32(v_in10, v_temp10);
                v_in11 = vqaddq_s32(v_in11, v_temp11);
                v_in12 = vqaddq_s32(v_in12, v_temp12);
                v_in13 = vqaddq_s32(v_in13, v_temp13);
            }
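
            // Worked example of the correction, for a right shift of 1:
            // vrshlq alone maps -3 to (-3 + 1) >> 1 = -1, i.e. the tie at
            // -1.5 rounds towards +infinity.  Subtracting 1 from negative
            // values first gives (-4 + 1) >> 1 = -2, i.e. round half away
            // from zero.  ANDing with the (negative) shift value also makes
            // the correction vanish when the shift is zero.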

            v_in00 = vrshlq_s32(v_in00, v_shf0);
            v_in01 = vrshlq_s32(v_in01, v_shf1);
            v_in02 = vrshlq_s32(v_in02, v_shf2);
            v_in03 = vrshlq_s32(v_in03, v_shf3);

            v_in10 = vrshlq_s32(v_in10, v_shf0);
            v_in11 = vrshlq_s32(v_in11, v_shf1);
            v_in12 = vrshlq_s32(v_in12, v_shf2);
            v_in13 = vrshlq_s32(v_in13, v_shf3);

            v_in00 = vaddq_s32(v_in00, v_c_offset);
            v_in01 = vaddq_s32(v_in01, v_c_offset);
            v_in02 = vaddq_s32(v_in02, v_c_offset);
            v_in03 = vaddq_s32(v_in03, v_c_offset);

            v_in10 = vaddq_s32(v_in10, v_c_offset);
            v_in11 = vaddq_s32(v_in11, v_c_offset);
            v_in12 = vaddq_s32(v_in12, v_c_offset);
            v_in13 = vaddq_s32(v_in13, v_c_offset);

            v_in00 = vmaxq_s32(v_in00, v_minval);
            v_in01 = vmaxq_s32(v_in01, v_minval);
            v_in02 = vmaxq_s32(v_in02, v_minval);
            v_in03 = vmaxq_s32(v_in03, v_minval);

            v_in10 = vmaxq_s32(v_in10, v_minval);
            v_in11 = vmaxq_s32(v_in11, v_minval);
            v_in12 = vmaxq_s32(v_in12, v_minval);
            v_in13 = vmaxq_s32(v_in13, v_minval);

            v_in00 = vminq_s32(v_in00, v_maxval);
            v_in01 = vminq_s32(v_in01, v_maxval);
            v_in02 = vminq_s32(v_in02, v_maxval);
            v_in03 = vminq_s32(v_in03, v_maxval);

            v_in10 = vminq_s32(v_in10, v_maxval);
            v_in11 = vminq_s32(v_in11, v_maxval);
            v_in12 = vminq_s32(v_in12, v_maxval);
            v_in13 = vminq_s32(v_in13, v_maxval);

            int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in01));
            int16x8_t v_uz01 = vuzp1q_s16(vreinterpretq_s16_s32(v_in02), vreinterpretq_s16_s32(v_in03));

            int16x8_t v_uz10 = vuzp1q_s16(vreinterpretq_s16_s32(v_in10), vreinterpretq_s16_s32(v_in11));
            int16x8_t v_uz11 = vuzp1q_s16(vreinterpretq_s16_s32(v_in12), vreinterpretq_s16_s32(v_in13));

            int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz01));
            int8x16_t v_uz1 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz10), vreinterpretq_s8_s16(v_uz11));

            vst1q_s8(out_ptr, v_uz0);
            out_ptr += 16;
            vst1q_s8(out_ptr1, v_uz1);
            out_ptr1 += 16;
        }

        // We are often quantizing one block of interleaved kernel output at
        // a time - these are three registers wide.  Special case that here.
        if (regs==3) {
            regs -= 3;

            int32x4_t v_mul0;
            int32x4_t v_mul1;
            int32x4_t v_mul2;

            int32x4_t v_shf0;
            int32x4_t v_shf1;
            int32x4_t v_shf2;

            int32x4_t v_shf0l;
            int32x4_t v_shf1l;
            int32x4_t v_shf2l;

            if (per_channel) {
                v_mul0 = vld1q_s32(perch_mul_ptr);
                v_mul1 = vld1q_s32(perch_mul_ptr + 4);
                v_mul2 = vld1q_s32(perch_mul_ptr + 8);
                perch_mul_ptr += 12;

                v_shf0 = vld1q_s32(perch_shift_ptr);
                v_shf1 = vld1q_s32(perch_shift_ptr + 4);
                v_shf2 = vld1q_s32(perch_shift_ptr + 8);
                perch_shift_ptr += 12;

                if (do_left_shift) {
                    v_shf0l = vld1q_s32(perch_shiftl_ptr);
                    v_shf1l = vld1q_s32(perch_shiftl_ptr + 4);
                    v_shf2l = vld1q_s32(perch_shiftl_ptr + 8);
                    perch_shiftl_ptr += 12;
                }
            } else {
                v_mul0 = v_mul1 = v_mul2 = v_mul;
                v_shf0 = v_shf1 = v_shf2 = v_right_shift;
                v_shf0l = v_shf1l = v_shf2l = v_left_shift;
            }

            // Load column bias values
            int32x4_t v_col0 = vld1q_s32(colptr);
            int32x4_t v_col1 = vld1q_s32(colptr + 4);
            int32x4_t v_col2 = vld1q_s32(colptr + 8);
            colptr += 12;

            // Load input data (row 0)
            int32x4_t v_in00 = vld1q_s32(in_ptr);
            int32x4_t v_in01 = vld1q_s32(in_ptr + 4);
            int32x4_t v_in02 = vld1q_s32(in_ptr + 8);
            in_ptr += 12;

            // Load input data (row 1)
            int32x4_t v_in10 = vld1q_s32(in_ptr1);
            int32x4_t v_in11 = vld1q_s32(in_ptr1 + 4);
            int32x4_t v_in12 = vld1q_s32(in_ptr1 + 8);
            in_ptr1 += 12;

            // Add on row bias and column bias
            v_in00 = vaddq_s32(v_in00, v_row_sum);
            v_in01 = vaddq_s32(v_in01, v_row_sum);
            v_in02 = vaddq_s32(v_in02, v_row_sum);

            v_in10 = vaddq_s32(v_in10, v_row_sum1);
            v_in11 = vaddq_s32(v_in11, v_row_sum1);
            v_in12 = vaddq_s32(v_in12, v_row_sum1);

            v_in00 = vaddq_s32(v_in00, v_col0);
            v_in01 = vaddq_s32(v_in01, v_col1);
            v_in02 = vaddq_s32(v_in02, v_col2);

            v_in10 = vaddq_s32(v_in10, v_col0);
            v_in11 = vaddq_s32(v_in11, v_col1);
            v_in12 = vaddq_s32(v_in12, v_col2);

            // Quantize

            // If a left shift is needed it needs to happen first.
            if (do_left_shift) {
                v_in00 = vrshlq_s32(v_in00, v_shf0l);
                v_in01 = vrshlq_s32(v_in01, v_shf1l);
                v_in02 = vrshlq_s32(v_in02, v_shf2l);

                v_in10 = vrshlq_s32(v_in10, v_shf0l);
                v_in11 = vrshlq_s32(v_in11, v_shf1l);
                v_in12 = vrshlq_s32(v_in12, v_shf2l);
            }

            // Multiply
            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);
            v_in01 = vqrdmulhq_s32(v_in01, v_mul1);
            v_in02 = vqrdmulhq_s32(v_in02, v_mul2);

            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);
            v_in11 = vqrdmulhq_s32(v_in11, v_mul1);
            v_in12 = vqrdmulhq_s32(v_in12, v_mul2);

            // Compute and add on corrective offset
            if (do_shift_correction) {
                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);
                int32x4_t v_temp01 = vandq_s32(v_in01, v_shf1);
                int32x4_t v_temp02 = vandq_s32(v_in02, v_shf2);

                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);
                int32x4_t v_temp11 = vandq_s32(v_in11, v_shf1);
                int32x4_t v_temp12 = vandq_s32(v_in12, v_shf2);

                v_temp00 = vshrq_n_s32(v_temp00, 31);
                v_temp01 = vshrq_n_s32(v_temp01, 31);
                v_temp02 = vshrq_n_s32(v_temp02, 31);

                v_temp10 = vshrq_n_s32(v_temp10, 31);
                v_temp11 = vshrq_n_s32(v_temp11, 31);
                v_temp12 = vshrq_n_s32(v_temp12, 31);

                v_in00 = vqaddq_s32(v_in00, v_temp00);
                v_in01 = vqaddq_s32(v_in01, v_temp01);
                v_in02 = vqaddq_s32(v_in02, v_temp02);

                v_in10 = vqaddq_s32(v_in10, v_temp10);
                v_in11 = vqaddq_s32(v_in11, v_temp11);
                v_in12 = vqaddq_s32(v_in12, v_temp12);
            }

            v_in00 = vrshlq_s32(v_in00, v_shf0);
            v_in01 = vrshlq_s32(v_in01, v_shf1);
            v_in02 = vrshlq_s32(v_in02, v_shf2);

            v_in10 = vrshlq_s32(v_in10, v_shf0);
            v_in11 = vrshlq_s32(v_in11, v_shf1);
            v_in12 = vrshlq_s32(v_in12, v_shf2);

            v_in00 = vaddq_s32(v_in00, v_c_offset);
            v_in01 = vaddq_s32(v_in01, v_c_offset);
            v_in02 = vaddq_s32(v_in02, v_c_offset);

            v_in10 = vaddq_s32(v_in10, v_c_offset);
            v_in11 = vaddq_s32(v_in11, v_c_offset);
            v_in12 = vaddq_s32(v_in12, v_c_offset);

            v_in00 = vmaxq_s32(v_in00, v_minval);
            v_in01 = vmaxq_s32(v_in01, v_minval);
            v_in02 = vmaxq_s32(v_in02, v_minval);

            v_in10 = vmaxq_s32(v_in10, v_minval);
            v_in11 = vmaxq_s32(v_in11, v_minval);
            v_in12 = vmaxq_s32(v_in12, v_minval);

            v_in00 = vminq_s32(v_in00, v_maxval);
            v_in01 = vminq_s32(v_in01, v_maxval);
            v_in02 = vminq_s32(v_in02, v_maxval);

            v_in10 = vminq_s32(v_in10, v_maxval);
            v_in11 = vminq_s32(v_in11, v_maxval);
            v_in12 = vminq_s32(v_in12, v_maxval);

            int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in01));
            int16x8_t v_uz01 = vuzp1q_s16(vreinterpretq_s16_s32(v_in02), vreinterpretq_s16_s32(v_in02));

            int16x8_t v_uz10 = vuzp1q_s16(vreinterpretq_s16_s32(v_in10), vreinterpretq_s16_s32(v_in11));
            int16x8_t v_uz11 = vuzp1q_s16(vreinterpretq_s16_s32(v_in12), vreinterpretq_s16_s32(v_in12));
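
            // Note: v_in02/v_in12 are deliberately passed to vuzp1q twice -
            // only 12 of the 16 packed bytes are stored below, so the
            // duplicated upper lanes are never written out.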

            int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz01));
            int8x16_t v_uz1 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz10), vreinterpretq_s8_s16(v_uz11));

            vst1q_lane_s64(reinterpret_cast<int64_t *>(out_ptr), vreinterpretq_s64_s8(v_uz0), 0);
            vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr + 8), vreinterpretq_s32_s8(v_uz0), 2);
            out_ptr += 12;
            vst1q_lane_s64(reinterpret_cast<int64_t *>(out_ptr1), vreinterpretq_s64_s8(v_uz1), 0);
            vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr1 + 8), vreinterpretq_s32_s8(v_uz1), 2);
            out_ptr1 += 12;
        }

        while (regs--) {
            int32x4_t v_mul0;
            int32x4_t v_shf0;
            int32x4_t v_shf0l;

            if (per_channel) {
                v_mul0 = vld1q_s32(perch_mul_ptr);
                perch_mul_ptr += 4;

                v_shf0 = vld1q_s32(perch_shift_ptr);
                perch_shift_ptr += 4;

                if (do_left_shift) {
                    v_shf0l = vld1q_s32(perch_shiftl_ptr);
                    perch_shiftl_ptr += 4;
                }
            } else {
                v_mul0 = v_mul;
                v_shf0 = v_right_shift;
                v_shf0l = v_left_shift;
            }
            // Load column bias values
            int32x4_t v_col0 = vld1q_s32(colptr);
            colptr += 4;

            // Load input data (row 0)
            int32x4_t v_in00 = vld1q_s32(in_ptr);
            in_ptr += 4;

            // Load input data (row 1)
            int32x4_t v_in10 = vld1q_s32(in_ptr1);
            in_ptr1 += 4;

            // Add on row sum and bias constant
            v_in00 = vaddq_s32(v_in00, v_row_sum);

            v_in10 = vaddq_s32(v_in10, v_row_sum1);

            // Subtract col sum * a_offset
            v_in00 = vaddq_s32(v_in00, v_col0);

            v_in10 = vaddq_s32(v_in10, v_col0);

            // Quantize - start with (optional) left shift
            if (do_left_shift) {
                v_in00 = vrshlq_s32(v_in00, v_shf0l);

                v_in10 = vrshlq_s32(v_in10, v_shf0l);
            }

            // Then multiply
            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);

            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);

            // Compute and add on corrective offset
            if (do_shift_correction) {
                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);

                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);

                v_temp00 = vshrq_n_s32(v_temp00, 31);

                v_temp10 = vshrq_n_s32(v_temp10, 31);

                v_in00 = vqaddq_s32(v_in00, v_temp00);

                v_in10 = vqaddq_s32(v_in10, v_temp10);
            }

            v_in00 = vrshlq_s32(v_in00, v_shf0);

            v_in10 = vrshlq_s32(v_in10, v_shf0);

            v_in00 = vaddq_s32(v_in00, v_c_offset);

            v_in10 = vaddq_s32(v_in10, v_c_offset);

            v_in00 = vmaxq_s32(v_in00, v_minval);

            v_in10 = vmaxq_s32(v_in10, v_minval);

            v_in00 = vminq_s32(v_in00, v_maxval);

            v_in10 = vminq_s32(v_in10, v_maxval);

            int16x8_t v_uz00 = vuzp1q_s16(vreinterpretq_s16_s32(v_in00), vreinterpretq_s16_s32(v_in10));

            int8x16_t v_uz0 = vuzp1q_s8(vreinterpretq_s8_s16(v_uz00), vreinterpretq_s8_s16(v_uz00));
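            // After the two UZP1 steps, 32-bit lane 0 of v_uz0 holds row
            // 0's four output bytes and lane 1 holds row 1's four bytes,
            // so a single vector feeds both row stores below.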

            vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr), vreinterpretq_s32_s8(v_uz0), 0);
            out_ptr += 4;
            vst1q_lane_s32(reinterpret_cast<int32_t *>(out_ptr1), vreinterpretq_s32_s8(v_uz0), 1);
            out_ptr1 += 4;
        }

        if (odds) {
            int32x4_t v_col0 = vdupq_n_s32(0);
            int32x4_t v_in00 = vdupq_n_s32(0);
            int32x4_t v_in10 = vdupq_n_s32(0);
            int32x4_t v_mul0 = vdupq_n_s32(0);
            int32x4_t v_shf0 = vdupq_n_s32(0);
            int32x4_t v_shf0l = vdupq_n_s32(0);

            if (!per_channel) {
                v_mul0 = v_mul;
                v_shf0 = v_right_shift;
                v_shf0l = v_left_shift;
            }

            do {
                v_col0 = vld1q_lane_s32(colptr, v_col0, 0);
                v_in00 = vld1q_lane_s32(in_ptr, v_in00, 0);
                v_in10 = vld1q_lane_s32(in_ptr1, v_in10, 0);
                if (per_channel) {
                    v_mul0 = vld1q_lane_s32(perch_mul_ptr, v_mul0, 0);
                    v_shf0 = vld1q_lane_s32(perch_shift_ptr, v_shf0, 0);
                    if (do_left_shift) {
                        v_shf0l = vld1q_lane_s32(perch_shiftl_ptr, v_shf0l, 0);
                    }
                }
                if (odds == 1) { break; }

                v_col0 = vld1q_lane_s32(colptr + 1, v_col0, 1);
                v_in00 = vld1q_lane_s32(in_ptr + 1, v_in00, 1);
                v_in10 = vld1q_lane_s32(in_ptr1 + 1, v_in10, 1);
                if (per_channel) {
                    v_mul0 = vld1q_lane_s32(perch_mul_ptr + 1, v_mul0, 1);
                    v_shf0 = vld1q_lane_s32(perch_shift_ptr + 1, v_shf0, 1);
                    if (do_left_shift) {
                        v_shf0l = vld1q_lane_s32(perch_shiftl_ptr + 1, v_shf0l, 1);
                    }
                }
                if (odds == 2) { break; }

                v_col0 = vld1q_lane_s32(colptr + 2, v_col0, 2);
                v_in00 = vld1q_lane_s32(in_ptr + 2, v_in00, 2);
                v_in10 = vld1q_lane_s32(in_ptr1 + 2, v_in10, 2);
                if (per_channel) {
                    v_mul0 = vld1q_lane_s32(perch_mul_ptr + 2, v_mul0, 2);
                    v_shf0 = vld1q_lane_s32(perch_shift_ptr + 2, v_shf0, 2);
                    if (do_left_shift) {
                        v_shf0l = vld1q_lane_s32(perch_shiftl_ptr + 2, v_shf0l, 2);
                    }
                }
            } while (0);

            // Add on row sum and bias constant
            v_in00 = vaddq_s32(v_in00, v_row_sum);

            v_in10 = vaddq_s32(v_in10, v_row_sum1);

            // Subtract col sum * a_offset
            v_in00 = vaddq_s32(v_in00, v_col0);

            v_in10 = vaddq_s32(v_in10, v_col0);

            // Quantize - start with (optional) left shift
            if (do_left_shift) {
                v_in00 = vrshlq_s32(v_in00, v_shf0l);

                v_in10 = vrshlq_s32(v_in10, v_shf0l);
            }

            // Then multiply
            v_in00 = vqrdmulhq_s32(v_in00, v_mul0);

            v_in10 = vqrdmulhq_s32(v_in10, v_mul0);

            // Compute and add on corrective offset
            if (do_shift_correction) {
                int32x4_t v_temp00 = vandq_s32(v_in00, v_shf0);

                int32x4_t v_temp10 = vandq_s32(v_in10, v_shf0);

                v_temp00 = vshrq_n_s32(v_temp00, 31);

                v_temp10 = vshrq_n_s32(v_temp10, 31);

                v_in00 = vqaddq_s32(v_in00, v_temp00);

                v_in10 = vqaddq_s32(v_in10, v_temp10);
            }

            v_in00 = vrshlq_s32(v_in00, v_shf0);

            v_in10 = vrshlq_s32(v_in10, v_shf0);

            v_in00 = vaddq_s32(v_in00, v_c_offset);

            v_in10 = vaddq_s32(v_in10, v_c_offset);

            v_in00 = vmaxq_s32(v_in00, v_minval);

            v_in10 = vmaxq_s32(v_in10, v_minval);

            v_in00 = vminq_s32(v_in00, v_maxval);

            v_in10 = vminq_s32(v_in10, v_maxval);

            do {
                vst1q_lane_s8(out_ptr, vreinterpretq_s8_s32(v_in00), 0);
                vst1q_lane_s8(out_ptr1, vreinterpretq_s8_s32(v_in10), 0);

                if (odds==1) { break; }

                vst1q_lane_s8(out_ptr + 1, vreinterpretq_s8_s32(v_in00), 4);
                vst1q_lane_s8(out_ptr1 + 1, vreinterpretq_s8_s32(v_in10), 4);

                if (odds==2) { break; }

                vst1q_lane_s8(out_ptr + 2, vreinterpretq_s8_s32(v_in00), 8);
                vst1q_lane_s8(out_ptr1 + 2, vreinterpretq_s8_s32(v_in10), 8);
            } while(0);
        }
    }
}
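
/* Illustrative scalar reference for a single element of the requantize
 * kernel above.  This is a sketch for exposition only - it spells out the
 * assumed semantics of the NEON intrinsics in plain C++, ignores the
 * saturation they perform, and is not called by anything in this file.
 * The caller is assumed to have already added the precomputed row/column
 * bias into 'acc'. */
inline int32_t requantize_one_scalar(int64_t acc, int32_t mul, int left_shift, int right_shift, const Requantize32 &qp) {
    acc <<= left_shift;                             // optional left shift happens first
    acc = (acc * mul + (int64_t(1) << 30)) >> 31;   // rounding doubling multiply, keeping the high half (vqrdmulh)
    if (right_shift > 0) {
        if (acc < 0) {
            acc--;                                  // the 'do_shift_correction' nudge: round halves away from zero
        }
        acc = (acc + (int64_t(1) << (right_shift - 1))) >> right_shift;  // rounding right shift (vrshl)
    }
    acc += qp.c_offset;                             // add the output zero point
    acc = acc < qp.minval ? qp.minval : acc;        // clamp to the output range...
    acc = acc > qp.maxval ? qp.maxval : acc;        // ...from both sides
    return static_cast<int32_t>(acc);               // caller extracts the low byte
}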

} // anonymous namespace

template<typename Tin, typename Tout>
void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                         const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride,
                         const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col) {
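    /* Note on the minval/c_offset tests below: the shift correction only
     * alters values that are negative before c_offset is added.  When
     * minval >= c_offset every such value is clamped up to minval anyway
     * (the fused-ReLU case mentioned above), so the correction can safely
     * be compiled out. */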
    if (qp.per_channel_requant) {
        if (qp.minval >= qp.c_offset) {
            if (qp.per_channel_left_shifts) {
                requantize_block_32_int<false, true, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                           reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<false, true, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                            reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        } else {
            if (qp.per_channel_left_shifts) {
                requantize_block_32_int<true, true, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                          reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<true, true, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                           reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        }
    } else {
        if (qp.minval >= qp.c_offset) {
            if (qp.per_layer_left_shift > 0) {
                requantize_block_32_int<false, false, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                            reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<false, false, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                             reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        } else {
            if (qp.per_layer_left_shift > 0) {
                requantize_block_32_int<true, false, true>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                           reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            } else {
                requantize_block_32_int<true, false, false>(qp, width, height, reinterpret_cast<const int32_t *>(input), in_stride,
                                                            reinterpret_cast<int8_t *>(output), out_stride, row_bias, col_bias, start_col);
            }
        }
    }
}

template void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                                  const int32_t *input, unsigned int in_stride, int8_t *output, unsigned int out_stride,
                                  const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);

template void requantize_block_32(const Requantize32 &qp, unsigned int width, unsigned int height,
                                  const uint32_t *input, unsigned int in_stride, uint8_t *output, unsigned int out_stride,
                                  const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col);
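
/*
 * Typical usage of the routines in this file (an illustrative sketch only -
 * the buffer names and the surrounding blocking logic are hypothetical):
 *
 *   // A is out_height x depth, B is depth x out_width, and acc holds the
 *   // 32-bit GEMM accumulators for the block being produced.
 *   compute_row_sums(qp, depth, out_height, a_ptr, a_stride, row_bias);
 *   compute_col_sums(qp, out_width, depth, b_ptr, b_stride, col_bias,
 *                    depth, multi, first_col);
 *   requantize_block_32(qp, out_width, out_height, acc_ptr, acc_stride,
 *                       out_ptr, out_stride, row_bias, col_bias, first_col);
 */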

/*
 * Routine (and helpers) to compute row sums needed for offset correction.
 *
 * This is often needed for a lot of short rows (e.g. Syrax 5 - 6400 rows
 * of length 27), therefore it's important not to sacrifice performance on
 * odd-length rows.
 *
 * To minimize performance loss in these cases, this routine will overread
 * by up to 7 bytes.
 *
 * This is handled via "mask" and "mask mode" parameters to the inner
 * routines; mask mode == 1 indicates that there are between 1 and 8 bytes
 * (inclusive) needed at the end; in these cases we always read 8 bytes.
 * mask mode == 2 indicates that there are between 9 and 15 bytes needed at
 * the end, and in this case we always read 16 bytes.  In both cases the
 * 'mask' vector is set up so that the read value can be masked off to clear
 * the overread lanes.  This is handled by 'accumulate_masked_8' and
 * 'accumulate_masked_16' below.
 *
 * This routine is templated on the type to be accumulated, because the
 * innermost instruction used needs to be of the correct signedness.
 * However, beyond this point we always use signed values in both cases.
 * The instructions that need to be different are therefore wrapped in
 * helper functions below.
 *
 * The general strategy used is to load vectors of 16 bytes and accumulate
 * (using uadalp/sadalp or AArch32 equivalents) into 8x16-bit accumulators.
 * These are then reduced (using uadalp/sadalp again) into 4x32-bit
 * accumulators.  The 4 accumulators for up to 4 rows being processed are
 * then added together into a single output vector using pairwise adds.
 *
 * This reduction from the 8x16-bit into the 4x32-bit accumulators needs to
 * occur before the 16-bit accumulators can overflow - which is every 32
 * iterations (512 total bytes processed).  This is explained more below.
 */
namespace {
struct row_sum_helpers {
    const Requantize32 &qp;

    /* Load a full 16 byte vector, pairwise accumulate into 'sum' with uadalp or sadalp */
    template<typename T>
    inline int16x8_t accumulate_16(const T *ptr, int16x8_t sum);

    /* Load a full 16 byte vector, but mask before accumulation (see above). */
    template<typename T>
    inline int16x8_t accumulate_masked_16(const T *ptr, int16x8_t sum, uint64x2_t mask);

    /* Load 8 bytes and mask before accumulation. */
    template<typename T>
    inline int16x8_t accumulate_masked_8(const T *ptr, int16x8_t sum, uint64x2_t mask);

    /* This function does the actual work for up to 4 rows at a time.
     * It's pulled out so we can template on the row count to generate
     * the 4 different cases.  4 rows are computed at a time as this
     * reduces to a single vector write. */
    template<unsigned int rows, typename T>
    void compute_some_rows(unsigned int blocks, const T *input, unsigned int in_stride, int32_t *row_bias, unsigned int mask_mode, uint64x2_t mask, int32x4_t offset_mul) {
        int16x8_t sums[rows];
        int32x4_t finalsums[rows];

        for (unsigned int i=0; i<rows; i++) {
            sums[i] = vdupq_n_s16(0);
            finalsums[i] = vdupq_n_s32(0);
        }

        for (unsigned int i=0; i<blocks; i++) {
            for (unsigned int r=0; r<rows; r++) {
                /* If we add too many blocks together, we run the risk
                 * of overflowing the intermediate 16-bit accumulators,
                 * especially in the unsigned case where we later treat
                 * the accumulator as signed.
                 *
                 * In that case, the maximum (signed) value is 16383,
                 * which is safe for 64 (unsigned) accumulations (255*64
                 * = 16,320).
                 *
                 * Each invocation of pairwise add adds 2 values to the
                 * accumulator - so in the unsigned case we can do 32
                 * adds before we need to reset the 16-bit accumulator
                 * by adding into the 32-bit 'finalsums'.
                 *
                 * We could do 64 adds in the signed case, but that
                 * optimization is not worth the complexity.
                 */
                if (i > 0 && ((i & 31) == 0)) {
                    finalsums[r] = vpadalq_s16(finalsums[r], sums[r]);
                    sums[r] = vdupq_n_s16(0);
                }
                sums[r] = accumulate_16(input + (r * in_stride) + (i * 16), sums[r]);
            }
        }

        /* Handle the final masked read if needed. */
        if (mask_mode > 0) {
            for (unsigned int r=0; r<rows; r++) {
                if (mask_mode == 1) {
                    sums[r] = accumulate_masked_8(input + (r * in_stride) + (blocks * 16), sums[r], mask);
                } else {
                    sums[r] = accumulate_masked_16(input + (r * in_stride) + (blocks * 16), sums[r], mask);
                }
            }
        }

        for (unsigned int i=0; i<rows; i++) {
            finalsums[i] = vpadalq_s16(finalsums[i], sums[i]);
        }

        int32x4_t t0, t1;
        int32x2_t t2;

        /* Result writeback - need to write back one value per row
         * processed.  Multiply all the final totals by -b_offset so
         * that the terms can simply be added in the requantize code. */
        switch (rows) {
            case 1:
                /* If we only have one output, just use ADDV.  Multiply
                 * the offset into all four components separately so it
                 * can stay in the SIMD register file. */
                t0 = vmulq_s32(finalsums[0], offset_mul);
                *row_bias = vaddvq_s32(t0);
                break;

            case 2:
                /* For two outputs, two rounds of pairwise adds will
                 * generate the result in a 2-vector we can store in one
                 * go. */
                t0 = vpaddq_s32(finalsums[0], finalsums[1]);
                t0 = vpaddq_s32(t0, t0);
                t2 = vmul_s32(vget_low_s32(t0), vget_low_s32(offset_mul));
                vst1_s32(row_bias, t2);
                break;

            case 3:
                /* Three rows - need to store the low two words plus the odd value from lane 2 */
                t0 = vpaddq_s32(finalsums[0], finalsums[1]);
                t1 = vpaddq_s32(finalsums[2], finalsums[2]);

                t0 = vpaddq_s32(t0, t1);
                t0 = vmulq_s32(t0, offset_mul);

                vst1_s32(row_bias, vget_low_s32(t0));
                row_bias[2] = vgetq_lane_s32(t0, 2);
                break;

            case 4:
                /* Four rows (most common case) - reduce to a single
                 * vector with pairwise adds. */
                t0 = vpaddq_s32(finalsums[0], finalsums[1]);
                t1 = vpaddq_s32(finalsums[2], finalsums[3]);

                t0 = vpaddq_s32(t0, t1);
                t0 = vmulq_s32(t0, offset_mul);

                vst1q_s32(row_bias, t0);
                break;

            default:
                UNREACHABLE("Impossible.");
        }
    }

    row_sum_helpers(const Requantize32 &qp) : qp(qp) { }
};

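/* The specializations below pick the pairwise-accumulate instruction that
 * matches the signedness of the input (uadalp via vpadalq_u8 for uint8_t,
 * sadalp via vpadalq_s8 for int8_t).  In the unsigned case the running
 * 16-bit total is reinterpreted as unsigned for the accumulate and back
 * again afterwards; the 32-iteration cap described above keeps the total
 * at or below 16,320, so it never looks negative when treated as signed. */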
template<>
int16x8_t row_sum_helpers::accumulate_16(const uint8_t *ptr, int16x8_t sum) {
    return vreinterpretq_s16_u16(vpadalq_u8(vreinterpretq_u16_s16(sum), vld1q_u8(ptr)));
}

template<>
int16x8_t row_sum_helpers::accumulate_16(const int8_t *ptr, int16x8_t sum) {
    return vpadalq_s8(sum, vld1q_s8(ptr));
}

template<>
int16x8_t row_sum_helpers::accumulate_masked_16(const int8_t *ptr, int16x8_t sum, uint64x2_t mask) {
    int8x16_t v = vandq_s8(vld1q_s8(ptr), vreinterpretq_s8_u64(mask));
    return vpadalq_s8(sum, v);
}

template<>
int16x8_t row_sum_helpers::accumulate_masked_16(const uint8_t *ptr, int16x8_t sum, uint64x2_t mask) {
    uint8x16_t v = vandq_u8(vld1q_u8(ptr), vreinterpretq_u8_u64(mask));
    return vreinterpretq_s16_u16(vpadalq_u8(vreinterpretq_u16_s16(sum), v));
}

template<>
int16x8_t row_sum_helpers::accumulate_masked_8(const int8_t *ptr, int16x8_t sum, uint64x2_t mask) {
    int8x16_t v = vcombine_s8(vld1_s8(ptr), vdup_n_s8(0));
    v = vreinterpretq_s8_u64(vandq_u64(mask, vreinterpretq_u64_s8(v)));
    return vpadalq_s8(sum, v);
}

template<>
int16x8_t row_sum_helpers::accumulate_masked_8(const uint8_t *ptr, int16x8_t sum, uint64x2_t mask) {
    uint8x16_t v = vcombine_u8(vld1_u8(ptr), vdup_n_u8(0));
    v = vreinterpretq_u8_u64(vandq_u64(mask, vreinterpretq_u64_u8(v)));
    return vreinterpretq_s16_u16(vpadalq_u8(vreinterpretq_u16_s16(sum), v));
}
} // anonymous namespace

template<typename T>
void compute_row_sums(const Requantize32 &qp, unsigned int width, unsigned int height,
                      const T *input, unsigned int in_stride, int32_t *row_bias) {
    /* If the 'b' offset is zero, just skip this entirely. */
    if (qp.b_offset == 0) {
        memset(row_bias, 0, height * sizeof(int32_t));
        return;
    }

    row_sum_helpers thehelpers(qp);

    const int32x4_t offset_mul = vdupq_n_s32(-qp.b_offset);

    /* Work out how many full vectors of 16 bytes we will read, and how many
     * odd bytes at the end */
    unsigned int blocks = (width / 16);
    const unsigned int odds = width % 16;

    /* Generate a mask to use on the last iteration, if necessary. */
    uint64x2_t mask;
    unsigned int mask_mode = 0;

    if (odds > 0 && odds <= 8) {
        /* 1-8 odds: mask in the low lane, 0 in the top */
        uint64_t maskval = (~0ULL) >> (8 * (8-odds));

        mask = vsetq_lane_u64(maskval, vdupq_n_u64(0), 0);

        mask_mode = 1;
    } else if (odds > 8) {
        /* 9-15 odds: mask in the top lane, all 1s in the bottom. */
        uint64_t maskval = (~0ULL) >> (8 * (16-odds));

        mask = vsetq_lane_u64(maskval, vdupq_n_u64(~0ULL), 1);

        mask_mode = 2;
    }
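
    /* Worked example: odds == 3 gives maskval = ~0ULL >> 40 =
     * 0x0000000000ffffff, keeping only the low three bytes of the 8-byte
     * read; odds == 12 gives maskval = ~0ULL >> 32 in the top lane with
     * the bottom lane all ones, keeping the first twelve bytes of the
     * 16-byte read. */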

    for (unsigned int row=0; row<height; row+=4) {
        switch(height-row) {
            default:
            case 4:
                thehelpers.compute_some_rows<4>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
            case 3:
                thehelpers.compute_some_rows<3>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
            case 2:
                thehelpers.compute_some_rows<2>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
            case 1:
                thehelpers.compute_some_rows<1>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);
                break;
        }
    }
}

/* Instantiate the two versions for uint8_t and int8_t. */
template void compute_row_sums(const Requantize32 &, unsigned int, unsigned int, const int8_t *, unsigned int, int32_t *);
template void compute_row_sums(const Requantize32 &, unsigned int, unsigned int, const uint8_t *, unsigned int, int32_t *);

template<unsigned int active_rows, typename T>
inline void add_block(const T *input, unsigned int in_stride, int32_t *output);

template<unsigned int active_rows>
inline void add_block(const uint8_t *input, unsigned int in_stride, int32_t *output) {
    uint8x16_t inputs[4];

    for (unsigned int i=0; i<4; i++) {
        if (i < active_rows) {
            inputs[i] = vld1q_u8(input + i * in_stride);
        } else {
            inputs[i] = vdupq_n_u8(0);
        }
    }

    int16x8_t sums_16b[4];

    // Two adds for the low pairs
    sums_16b[0] = vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(inputs[0]), vget_low_u8(inputs[1])));
    sums_16b[1] = vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(inputs[2]), vget_low_u8(inputs[3])));
    // Two adds for the high pairs
    sums_16b[2] = vreinterpretq_s16_u16(vaddl_high_u8(inputs[0], inputs[1]));
    sums_16b[3] = vreinterpretq_s16_u16(vaddl_high_u8(inputs[2], inputs[3]));

    int32x4_t sums_32b[4];

    sums_32b[0] = vaddl_s16(vget_low_s16(sums_16b[0]), vget_low_s16(sums_16b[1]));
    sums_32b[1] = vaddl_high_s16(sums_16b[0], sums_16b[1]);
    sums_32b[2] = vaddl_s16(vget_low_s16(sums_16b[2]), vget_low_s16(sums_16b[3]));
    sums_32b[3] = vaddl_high_s16(sums_16b[2], sums_16b[3]);

    for (unsigned int i=0; i<4; i++) {
        vst1q_s32(output + 4*i, vaddq_s32(sums_32b[i], vld1q_s32(output + 4*i)));
    }
}

template<unsigned int active_rows>
inline void add_block(const int8_t *input, unsigned int in_stride, int32_t *output) {
    int8x16_t inputs[4];

    for (unsigned int i=0; i<4; i++) {
        if (i < active_rows) {
            inputs[i] = vld1q_s8(input + i * in_stride);
        } else {
            inputs[i] = vdupq_n_s8(0);
        }
    }

    int16x8_t sums_16b[4];

    // Two adds for the low pairs
    sums_16b[0] = vaddl_s8(vget_low_s8(inputs[0]), vget_low_s8(inputs[1]));
    sums_16b[1] = vaddl_s8(vget_low_s8(inputs[2]), vget_low_s8(inputs[3]));
    // Two adds for the high pairs
    sums_16b[2] = vaddl_high_s8(inputs[0], inputs[1]);
    sums_16b[3] = vaddl_high_s8(inputs[2], inputs[3]);

    int32x4_t sums_32b[4];

    sums_32b[0] = vaddl_s16(vget_low_s16(sums_16b[0]), vget_low_s16(sums_16b[1]));
    sums_32b[1] = vaddl_high_s16(sums_16b[0], sums_16b[1]);
    sums_32b[2] = vaddl_s16(vget_low_s16(sums_16b[2]), vget_low_s16(sums_16b[3]));
    sums_32b[3] = vaddl_high_s16(sums_16b[2], sums_16b[3]);

    for (unsigned int i=0; i<4; i++) {
        vst1q_s32(output + 4*i, vaddq_s32(sums_32b[i], vld1q_s32(output + 4*i)));
    }
}
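
/* In both add_block variants the widening adds cannot overflow: two 8-bit
 * lanes sum to at most +/-510 in a 16-bit lane, and two 16-bit partial
 * sums to at most +/-1020 in a 32-bit lane, so four rows can be reduced
 * per call with no intermediate saturation. */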

/* "first_col" parameter is used to offset the read into the qp.bias array,
 * in cases where we are not computing the first columns of the output (i.e.
 * in multithreaded cases where we divide columns across threads) */
template<typename T>
void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col) {
    /* Only actually add up the columns if a_offset is non-zero. */
    if (qp.a_offset != 0) {
        memset(reinterpret_cast<void *>(col_bias), 0, width * sizeof(int32_t));

        for (unsigned int row=0; row<height; row+=4) {
            unsigned int numrows = std::min(height-row, 4u);

            for (unsigned int col=0; col<width; col+=16) {
                unsigned int numcols = std::min(width-col, 16u);

                if (numcols==16) {
                    switch(numrows) {
                        case 1:
                            add_block<1>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        case 2:
                            add_block<2>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        case 3:
                            add_block<3>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        case 4:
                            add_block<4>(input + row * in_stride + col, in_stride, col_bias + col);
                            break;

                        default:
                            UNREACHABLE("Impossible.");
                    }
                } else {
                    for (; col<width; col++) {
                        int32_t sum=0;
                        for (unsigned int r=0; r<numrows; r++) {
                            sum += input[(row + r)*in_stride + col];
                        }
                        col_bias[col] += sum;
                    }
                }
            }
        }
    }

    for (unsigned int col=0; col<width; col++) {
        int32_t result = col_bias[col];

        result = (qp.a_offset * qp.b_offset * depth) - (result * qp.a_offset);

        if (qp.bias != nullptr) {
            result += qp.bias[multi * qp.bias_multi_stride + col + first_col];
        }

        col_bias[col] = result;
    }
}
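
/* Why these correction terms: for quantized matrices A and B with offsets
 * a_offset and b_offset, each output element is
 *
 *   sum_k (A[i,k] - a_offset) * (B[k,j] - b_offset)
 *     =   sum_k A[i,k] * B[k,j]
 *       - b_offset * sum_k A[i,k]        (row sum term, from compute_row_sums)
 *       - a_offset * sum_k B[k,j]        (column sum term, computed above)
 *       + a_offset * b_offset * depth    (constant term, folded in here)
 *
 * so col_bias[col] ends up holding the last two terms (plus any
 * user-supplied bias) and row_bias holds the -b_offset * rowsum term. */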

template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);
template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);

} // namespace arm_gemm

#endif // __aarch64__