Blame - src/core/NEON/kernels/arm_gemm/quantized.cpp - ml/ComputeLibrary

void compute_some_rows(unsigned int blocks, const T *input, unsigned int in_stride, int32_t *row_bias, unsigned int mask_mode, uint64x2_t mask, int32x4_t offset_mul) {

549

int16x8_t sums[rows];

550

int32x4_t finalsums[rows];

551

552

for (unsigned int i=0; i<rows; i++) {

Michalis Spyrou

400abc8

2019-08-20 17:25:25 +0100

[diff] [blame]

553

sums[i] = vdupq_n_s16(0);

554

finalsums[i] = vdupq_n_s32(0);

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

555

}

556

557

for (unsigned int i=0; i<blocks; i++) {

558

for (unsigned int r=0; r<rows; r++) {

Michalis Spyrou

400abc8

2019-08-20 17:25:25 +0100

[diff] [blame]

559

/* If we add too many blocks together, we run the risk

560

* of overflowing the intermediate 16-bit accumulators,

561

* especially in the unsigned case where we later treat

562

* the accumulator as signed.

563

*

564

* In that case, the maximum (signed) value is 16383,

565

* which is safe for 64 (unsigned) accumulations (255*64

566

* = 16,320).

567

*

568

* Each invocation of pairwise add adds 2 values to the

569

* accumulator - so in the unsigned case we can do 32

570

* adds before we need to reset the 16-bit accumulator

571

* by adding into the 32-bit 'finalsums'.

572

*

573

* We could do 64 adds in the signed case, but that

574

* optimization is not worth the complexity.

575

*/

Georgios Pinitas

48b3ef8

2019-10-14 19:03:09 +0100

[diff] [blame]

576

if (i > 0 && ((i & 31) == 0)) {

577

finalsums[r] = vpadalq_s16(finalsums[r], sums[r]);

578

sums[r] = vdupq_n_s16(0);

579

}

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

580

sums[r] = accumulate_16(input + (r * in_stride) + (i * 16), sums[r]);

}

}

/* Handle the final masked read if needed. */

585

if (mask_mode > 0) {

586

for (unsigned int r=0; r<rows; r++) {

587

if (mask_mode == 1) {

588

sums[r] = accumulate_masked_8(input + (r * in_stride) + (blocks * 16), sums[r], mask);

589

} else {

590

sums[r] = accumulate_masked_16(input + (r * in_stride) + (blocks * 16), sums[r], mask);

}

}

}

for (unsigned int i=0; i<rows; i++) {

Michalis Spyrou

400abc8

2019-08-20 17:25:25 +0100

[diff] [blame]

596

finalsums[i] = vpadalq_s16(finalsums[i], sums[i]);

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

}

int32x4_t t0, t1;

int32x2_t t2;

/* Result writeback - need to write back one value per row

603

* processed. Multiply all the final totals by -b_offset so

604

* that the terms can simply be added in the requantize code.

605

* */

606

switch (rows) {

Georgios Pinitas

48b3ef8

2019-10-14 19:03:09 +0100

[diff] [blame]

607

default:

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

608

case 1:

609

/* If we only have one output, just use ADDV. Multiply

610

* the offset into all four components separately so it

611

* can stay in the SIMD register file. */

612

t0 = vmulq_s32(finalsums[0], offset_mul);

613

*row_bias = vaddvq_s32(t0);

break;

case 2:

/* For two outputs, two rounds of pairwise adds will

618

* generate the result in a 2-vector we can store in one

619

* go. */

620

t0 = vpaddq_s32(finalsums[0], finalsums[1]);

621

t0 = vpaddq_s32(t0, t0);

622

t2 = vmul_s32(vget_low_s32(t0), vget_low_s32(offset_mul));

623

vst1_s32(row_bias, t2);

break;

case 3:

/* Three rows - need to store the low two words plus the odd value from lane 2 */

628

t0 = vpaddq_s32(finalsums[0], finalsums[1]);

629

t1 = vpaddq_s32(finalsums[2], finalsums[2]);

630

631

t0 = vpaddq_s32(t0, t1);

632

t0 = vmulq_s32(t0, offset_mul);

633

634

vst1_s32(row_bias, vget_low_s32(t0));

635

row_bias[2] = vgetq_lane_s32(t0, 2);

break;

case 4:

/* Four rows (most common case) - reduce to a single

640

* vector with pairwise adds. */

641

t0 = vpaddq_s32(finalsums[0], finalsums[1]);

642

t1 = vpaddq_s32(finalsums[2], finalsums[3]);

643

644

t0 = vpaddq_s32(t0, t1);

645

t0 = vmulq_s32(t0, offset_mul);

646

647

vst1q_s32(row_bias, t0);

648

break;

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

}

}

Michalis Spyrou

2019-11-14 14:31:44 +0000

[diff] [blame]

652

/* Initialise the 'qp' member from the requantization parameters supplied
 * by the caller. */
row_sum_helpers(const Requantize32 &qp)
    : qp(qp) {
}

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

};

/* Unsigned variant: load 16 bytes from 'ptr', add each adjacent pair and
 * accumulate the pair sums into the eight 16-bit lanes of 'sum'.  The
 * accumulator is viewed as unsigned only for the duration of the add. */
template<>
int16x8_t row_sum_helpers::accumulate_16(const uint8_t *ptr, int16x8_t sum) {
    const uint8x16_t bytes = vld1q_u8(ptr);
    const uint16x8_t acc   = vpadalq_u8(vreinterpretq_u16_s16(sum), bytes);

    return vreinterpretq_s16_u16(acc);
}

/* Signed variant: load 16 bytes from 'ptr', add each adjacent pair and
 * accumulate the pair sums into the eight 16-bit lanes of 'sum'. */
template<>
int16x8_t row_sum_helpers::accumulate_16(const int8_t *ptr, int16x8_t sum) {
    const int8x16_t bytes = vld1q_s8(ptr);

    return vpadalq_s8(sum, bytes);
}

/* Signed variant of the full-width masked read: load 16 bytes from 'ptr',
 * clear every bit that is not set in 'mask', then pairwise-accumulate the
 * surviving bytes into the 16-bit lanes of 'sum'. */
template<>
int16x8_t row_sum_helpers::accumulate_masked_16(const int8_t *ptr, int16x8_t sum, uint64x2_t mask) {
    const int8x16_t loaded = vld1q_s8(ptr);
    const int8x16_t kept   = vandq_s8(loaded, vreinterpretq_s8_u64(mask));

    return vpadalq_s8(sum, kept);
}

/* Unsigned variant of the full-width masked read: load 16 bytes from
 * 'ptr', clear every bit that is not set in 'mask', then
 * pairwise-accumulate the surviving bytes into the 16-bit lanes of 'sum'
 * (viewed as unsigned for the add). */
template<>
int16x8_t row_sum_helpers::accumulate_masked_16(const uint8_t *ptr, int16x8_t sum, uint64x2_t mask) {
    const uint8x16_t loaded = vld1q_u8(ptr);
    const uint8x16_t kept   = vandq_u8(loaded, vreinterpretq_u8_u64(mask));
    const uint16x8_t acc    = vpadalq_u8(vreinterpretq_u16_s16(sum), kept);

    return vreinterpretq_s16_u16(acc);
}

/* Signed variant of the half-width masked read: only 8 bytes are loaded
 * from 'ptr' and the upper half of the vector is zero-filled.  The result
 * is ANDed with 'mask' and pairwise-accumulated into the 16-bit lanes of
 * 'sum'. */
template<>
int16x8_t row_sum_helpers::accumulate_masked_8(const int8_t *ptr, int16x8_t sum, uint64x2_t mask) {
    const int8x8_t  low_half = vld1_s8(ptr);
    const int8x16_t widened  = vcombine_s8(low_half, vdup_n_s8(0));
    const int8x16_t kept     = vreinterpretq_s8_u64(vandq_u64(mask, vreinterpretq_u64_s8(widened)));

    return vpadalq_s8(sum, kept);
}

/* Unsigned variant of the half-width masked read: only 8 bytes are loaded
 * from 'ptr' and the upper half of the vector is zero-filled.  The result
 * is ANDed with 'mask' and pairwise-accumulated into the 16-bit lanes of
 * 'sum' (viewed as unsigned for the add). */
template<>
int16x8_t row_sum_helpers::accumulate_masked_8(const uint8_t *ptr, int16x8_t sum, uint64x2_t mask) {
    const uint8x8_t  low_half = vld1_u8(ptr);
    const uint8x16_t widened  = vcombine_u8(low_half, vdup_n_u8(0));
    const uint8x16_t kept     = vreinterpretq_u8_u64(vandq_u64(mask, vreinterpretq_u64_u8(widened)));
    const uint16x8_t acc      = vpadalq_u8(vreinterpretq_u16_s16(sum), kept);

    return vreinterpretq_s16_u16(acc);
}

}

template<typename T>

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

693

void compute_row_sums(const Requantize32 &qp, unsigned int width, unsigned int height,

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

694

const T *input, unsigned int in_stride, int32_t *row_bias) {

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

695

/* If the 'b' offset is zero, just skip this entirely. */

696

if (qp.b_offset == 0) {

697

memset(row_bias, 0, height * sizeof(int32_t));

return;

}

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

701

row_sum_helpers thehelpers(qp);

702

703

const int32x4_t offset_mul = vdupq_n_s32(-qp.b_offset);

704

705

/* Work out how many full vectors of 16 bytes we will read, and how many

706

* odd bytes at the end */

707

unsigned int blocks = (width / 16);

708

const unsigned int odds = width % 16;

709

710

/* Generate a mask to use on the last iteration, if necessary. */

711

uint64x2_t mask;

712

unsigned int mask_mode = 0;

713

714

if (odds > 0 && odds <= 8) {

715

/* 1-8 odds: mask in the low lane, 0 in the top */

716

uint64_t maskval = (~0ULL) >> (8 * (8-odds));

717

718

mask = vsetq_lane_u64(maskval, vdupq_n_u64(0), 0);

719

720

mask_mode = 1;

721

} else if (odds > 8) {

722

/* 9-15 odds: mask in the top lane, all 1s in the bottom. */

723

uint64_t maskval = (~0ULL) >> (8 * (16-odds));

724

725

mask = vsetq_lane_u64(maskval, vdupq_n_u64(~0ULL), 1);

mask_mode = 2;

}

for (unsigned int row=0; row<height; row+=4) {

switch(height-row) {

default:

case 4:

thehelpers.compute_some_rows<4>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);

735

break;

736

case 3:

737

thehelpers.compute_some_rows<3>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);

738

break;

739

case 2:

740

thehelpers.compute_some_rows<2>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);

741

break;

742

case 1:

743

thehelpers.compute_some_rows<1>(blocks, input + (row * in_stride), in_stride, row_bias + row, mask_mode, mask, offset_mul);

break;

}

}

}

/* Instantiate the two versions for uint8_t and int8_t. */

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

750

template void compute_row_sums(const Requantize32 &, unsigned int, unsigned int, const int8_t *, unsigned int, int32_t *);

751

template void compute_row_sums(const Requantize32 &, unsigned int, unsigned int, const uint8_t *, unsigned int, int32_t *);

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

752

753

/* Generic declaration only; the uint8_t and int8_t overloads in this file
 * provide the implementations. */
template<
    unsigned int active_rows,
    typename T>
inline void add_block(
    const T *input,
    unsigned int in_stride,
    int32_t *output);

755

756

template<unsigned int active_rows>

757

inline void add_block(const uint8_t *input, unsigned int in_stride, int32_t *output) {

758

uint8x16_t inputs[4];

759

760

for (unsigned int i=0; i<4; i++) {

761

if (i < active_rows) {

762

inputs[i] = vld1q_u8(input + i * in_stride);

763

} else {

764

inputs[i] = vdupq_n_u8(0);

}

}

int16x8_t sums_16b[4];

769

770

// Two adds for the low pairs

771

sums_16b[0]=vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(inputs[0]), vget_low_u8(inputs[1])));

772

sums_16b[1]=vreinterpretq_s16_u16(vaddl_u8(vget_low_u8(inputs[2]), vget_low_u8(inputs[3])));

773

// Two adds for the high pairs

774

sums_16b[2]=vreinterpretq_s16_u16(vaddl_high_u8(inputs[0], inputs[1]));

775

sums_16b[3]=vreinterpretq_s16_u16(vaddl_high_u8(inputs[2], inputs[3]));

776

777

int32x4_t sums_32b[4];

778

779

sums_32b[0]=vaddl_s16(vget_low_s16(sums_16b[0]), vget_low_s16(sums_16b[1]));

780

sums_32b[1]=vaddl_high_s16(sums_16b[0], sums_16b[1]);

781

sums_32b[2]=vaddl_s16(vget_low_s16(sums_16b[2]), vget_low_s16(sums_16b[3]));

782

sums_32b[3]=vaddl_high_s16(sums_16b[2], sums_16b[3]);

783

784

for (unsigned int i=0; i<4; i++) {

785

vst1q_s32(output + 4*i, vaddq_s32(sums_32b[i], vld1q_s32(output + 4*i)));

}

}

template<unsigned int active_rows>

790

inline void add_block(const int8_t *input, unsigned int in_stride, int32_t *output) {

791

int8x16_t inputs[4];

792

793

for (unsigned int i=0; i<4; i++) {

794

if (i < active_rows) {

795

inputs[i] = vld1q_s8(input + i * in_stride);

796

} else {

797

inputs[i] = vdupq_n_s8(0);

}

}

int16x8_t sums_16b[4];

802

803

// Two adds for the low pairs

804

sums_16b[0]=vaddl_s8(vget_low_s8(inputs[0]), vget_low_s8(inputs[1]));

805

sums_16b[1]=vaddl_s8(vget_low_s8(inputs[2]), vget_low_s8(inputs[3]));

806

// Two adds for the high pairs

807

sums_16b[2]=vaddl_high_s8(inputs[0], inputs[1]);

808

sums_16b[3]=vaddl_high_s8(inputs[2], inputs[3]);

809

810

int32x4_t sums_32b[4];

811

812

sums_32b[0]=vaddl_s16(vget_low_s16(sums_16b[0]), vget_low_s16(sums_16b[1]));

813

sums_32b[1]=vaddl_high_s16(sums_16b[0], sums_16b[1]);

814

sums_32b[2]=vaddl_s16(vget_low_s16(sums_16b[2]), vget_low_s16(sums_16b[3]));

815

sums_32b[3]=vaddl_high_s16(sums_16b[2], sums_16b[3]);

816

817

for (unsigned int i=0; i<4; i++) {

818

vst1q_s32(output + 4*i, vaddq_s32(sums_32b[i], vld1q_s32(output + 4*i)));

}

}

Georgios Pinitas

2019-06-27 17:00:52 +0100

[diff] [blame]

822

/* "first_col" parameter is used to offset the read into the qp.bias array,

823

* in cases where we are not computing the first columns of the output (i.e.

824

* in multithreaded cases where we divide columns across threads) */

825

template<typename T>

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

826

void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col) {

827

/* Only actually add up the columns if a_offset is non-zero. */

828

if (qp.a_offset != 0) {

829

memset(reinterpret_cast<void *>(col_bias), 0, width * sizeof(int32_t));

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

830

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

831

for (unsigned int row=0; row<height; row+=4) {

832

unsigned int numrows=std::min(height-row, 4u);

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

833

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

834

for (unsigned int col=0; col<width; col+=16) {

835

unsigned int numcols=std::min(width-col, 16u);

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

836

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

if (numcols==16) {

switch(numrows) {

default:

case 1:

add_block<1>(input + row * in_stride + col, in_stride, col_bias + col);

842

break;

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

843

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

844

case 2:

845

add_block<2>(input + row * in_stride + col, in_stride, col_bias + col);

846

break;

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

847

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

848

case 3:

849

add_block<3>(input + row * in_stride + col, in_stride, col_bias + col);

850

break;

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

851

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

852

case 4:

853

add_block<4>(input + row * in_stride + col, in_stride, col_bias + col);

854

break;

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

855

}

Michalis Spyrou

71ac903

2019-11-14 14:31:44 +0000

[diff] [blame]

856

} else {

857

for (; col<width; col++) {

858

int32_t sum=0;

859

for (unsigned int r=0; r<numrows; r++) {

860

sum += input[(row + r)*in_stride + col];

861

}

862

col_bias[col] += sum;

863

}

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

}

}

}

}

for (unsigned int col=0; col<width; col++) {

870

int32_t result = col_bias[col];

871

872

result = (qp.a_offset * qp.b_offset * depth) - (result * qp.a_offset);

873

874

if (qp.bias != nullptr) {

Georgios Pinitas

48b3ef8

2019-10-14 19:03:09 +0100

[diff] [blame]

875

result += qp.bias[multi * qp.bias_multi_stride + col + first_col];

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

876

}

877

878

col_bias[col] = result;

}

}

Michalis Spyrou

2019-11-14 14:31:44 +0000

[diff] [blame]

882

template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const int8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);

883

template void compute_col_sums(const Requantize32 &qp, unsigned int width, unsigned int height, const uint8_t *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col);

Georgios Pinitas

cfa2bba

2019-06-27 17:00:52 +0100

[diff] [blame]

884

885

} // namespace arm_gemm

Georgios Pinitas

f33484f

2019-07-29 12:40:59 +0100

[diff] [blame]

886

887

#endif // __aarch64__