Blame - arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h - ml/ComputeLibrary

2019-04-12 10:29:17 +0100

[diff] [blame]

397

{

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

398

ARM_COMPUTE_ERROR_ON(stridex > 3);

399

int32x4x2_t out =

Usama Arif

2019-04-12 10:29:17 +0100

[diff] [blame]

400

{

401

{

402

single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),

403

single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)

404

}

405

};

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

if(stridex == 2)

{

out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);

410

out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);

411

out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);

412

}

413

else if(stridex == 3)

414

{

415

out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);

416

}

Usama Arif

2019-04-12 10:29:17 +0100

[diff] [blame]

return out;

}

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

420

/** Perform a convolve3x3 on 8-bit elements

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

421

*

Vidhya Sudhan Loganathan

7485d5a

2018-07-04 09:34:00 +0100

[diff] [blame]

422

* @param[in] in_top Pointer to the first row of the input.

423

* @param[in] in_mid Pointer to the second row of the input.

424

* @param[in] in_low Pointer to the third row of the input.

425

* @param[in] m0 First row of the filter.

426

* @param[in] m1 Second row of the filter.

427

* @param[in] m2 Third row of the filter.

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

428

* @param[in] stridex Stride value in elements across x.

429

* @param[in] input_offset Input quantization offset.

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

430

*

431

*/

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

432

template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >

433

int32x4x2_t convolve_3x3(const T *in_top, const T *in_mid, const T *in_low,

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

434

const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,

Michalis Spyrou

6f314db

2020-01-13 14:07:48 +0000

[diff] [blame^]

435

unsigned int stridex, int32_t input_offset)

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

436

{

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

437

ARM_COMPUTE_ERROR_ON(stridex > 3);

438

using VectorType = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;

Michalis Spyrou

6f314db

2020-01-13 14:07:48 +0000

[diff] [blame^]

439

using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

440

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

441

const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});

442

443

const VectorType vtop =

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

444

{

445

{

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

446

wrapper::vload(in_top),

447

wrapper::vload(in_top + 8)

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

448

}

449

};

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

450

const VectorType vmid =

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

451

{

452

{

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

453

wrapper::vload(in_mid),

454

wrapper::vload(in_mid + 8)

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

455

}

456

};

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

457

const VectorType vlow =

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

458

{

459

{

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

460

wrapper::vload(in_low),

461

wrapper::vload(in_low + 8)

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

}

};

const int32x4x3_t vtop_s32 =

466

{

467

{

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

468

wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),

469

wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),

470

wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

471

}

472

};

473

const int32x4x3_t vmid_s32 =

474

{

475

{

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

476

wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),

477

wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),

478

wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

479

}

480

};

481

const int32x4x3_t vlow_s32 =

482

{

483

{

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

484

wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),

485

wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),

486

wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

}

};

int32x4x2_t out

{

{

Michalis Spyrou

6f314db

2020-01-13 14:07:48 +0000

[diff] [blame^]

493

wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),

494

wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

}

};

// 0

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

499

out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);

500

out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]);

501

out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]);

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

502

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

503

out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]);

504

out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]);

505

out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]);

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

506

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

507

out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]);

508

out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]);

509

out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]);

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

510

511

// 1

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

512

out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]);

513

out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]);

514

out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]);

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

515

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

516

out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]);

517

out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]);

518

out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]);

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

519

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

520

out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]);

521

out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);

522

out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

523

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

524

if(stridex == 2)

525

{

526

out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);

527

out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);

528

out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);

529

}

530

else if(stridex == 3)

531

{

532

out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);

533

}

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

return out;

}

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

537

/** Stores a float32x4x2_t array into a memory location.

538

*

539

* @param[in] buffer Pointer to the memory location where the values will be stored.

540

* @param[in] values Values that will be stored.

541

*

542

*/

543

template <unsigned int stridex>

544

void store_results(float *buffer, const float32x4x2_t &values);

545

546

template <>

547

inline void store_results<1>(float *buffer, const float32x4x2_t &values)

548

{

549

vst1q_f32(buffer, values.val[0]);

550

vst1q_f32(buffer + 4, values.val[1]);

}

template <>

inline void store_results<2>(float *buffer, const float32x4x2_t &values)

555

{

556

vst1q_f32(buffer, values.val[0]);

}

template <>

inline void store_results<3>(float *buffer, const float32x4x2_t &values)

561

{

562

vst1_f32(buffer, vget_low_f32(values.val[0]));

563

}

564

Georgios Pinitas

2018-01-12 16:29:45 +0000

[diff] [blame]

565

/** Stores a uint32_t array into a memory location.

566

*

567

* @param[in] buffer Pointer to the memory location where the values will be stored.

568

* @param[in] values Values that will be stored.

569

*

570

*/

571

template <unsigned int stridex>

572

void store_results(int32_t *buffer, const int32x4x2_t &values);

573

574

template <>

575

inline void store_results<1>(int32_t *buffer, const int32x4x2_t &values)

576

{

577

vst1q_s32(buffer, values.val[0]);

578

vst1q_s32(buffer + 4, values.val[1]);

}

template <>

inline void store_results<2>(int32_t *buffer, const int32x4x2_t &values)

583

{

584

vst1q_s32(buffer, values.val[0]);

}

template <>

inline void store_results<3>(int32_t *buffer, const int32x4x2_t &values)

589

{

590

vst1_s32(buffer, vget_low_s32(values.val[0]));

591

}

592

Ioan-Cristian Szabo

5edbd1c

2017-11-13 13:34:08 +0000

[diff] [blame]

593

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

594

/** Loads a 3x3 matrix as a row (float16_t).

595

*

596

* @param[in] ptr Pointer to a float 3x3 matrix.

597

*

598

* @return The loaded matrix.

599

*/

Georgios Pinitas

20c246a

2018-09-12 16:45:53 +0100

[diff] [blame]

600

inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = 0)

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

601

{

Georgios Pinitas

20c246a

2018-09-12 16:45:53 +0100

[diff] [blame]

602

ARM_COMPUTE_UNUSED(weights_offset);

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

603

/* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:

604

r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */

605

const float16x8x3_t r =

{

{

vld1q_dup_f16(ptr),

vld1q_dup_f16(1 + ptr),

610

vld1q_dup_f16(2 + ptr)

}

};

return r;

}

Usama Arif

2019-04-12 10:29:17 +0100

[diff] [blame]

616

/** Perform a 3x3 convolution for 8 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.

617

*

618

* @param[in] in_top Pointer to the first row of the input.

619

* @param[in] in_mid Pointer to the second row of the input.

620

* @param[in] in_low Pointer to the third row of the input.

621

* @param[in] m0 First row of the filter.

622

* @param[in] m1 Second row of the filter.

623

* @param[in] m2 Third row of the filter.

624

* @param[in] dilation_x Dilation, in elements across x.

625

* @param[in] input_offset (Optional)Input quantization offset.

626

*

627

*/

628

inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,

629

const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,

630

const size_t dilation_x, int input_offset = 0)

631

{

632

ARM_COMPUTE_UNUSED(input_offset);

633

const float16x8x3_t vtop =

{

{

vld1q_f16(in_top),

vld1q_f16(in_top + dilation_x),

638

vld1q_f16(in_top + 2 * dilation_x)

639

}

640

};

641

const float16x8x3_t vmid =

{

{

vld1q_f16(in_mid),

vld1q_f16(in_mid + dilation_x),

646

vld1q_f16(in_mid + 2 * dilation_x)

647

}

648

};

649

const float16x8x3_t vlow =

{

{

vld1q_f16(in_low),

vld1q_f16(in_low + dilation_x),

654

vld1q_f16(in_low + 2 * dilation_x)

655

}

656

};

657

float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);

658

out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));

659

out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));

660

661

out = vaddq_f16(out, vmulq_f16(vmid.val[0], m1.val[0]));

662

out = vaddq_f16(out, vmulq_f16(vmid.val[1], m1.val[1]));

663

out = vaddq_f16(out, vmulq_f16(vmid.val[2], m1.val[2]));

664

665

out = vaddq_f16(out, vmulq_f16(vlow.val[0], m2.val[0]));

666

out = vaddq_f16(out, vmulq_f16(vlow.val[1], m2.val[1]));

667

out = vaddq_f16(out, vmulq_f16(vlow.val[2], m2.val[2]));

return out;

}

/** Perform a 3x3 convolution for 16 consecutive elements on float16 when dilation.x() or dilation.y() is not 1.

673

*

674

* @param[in] in_top Pointer to the first row of the input.

675

* @param[in] in_mid Pointer to the second row of the input.

676

* @param[in] in_low Pointer to the third row of the input.

677

* @param[in] m0 First row of the filter.

678

* @param[in] m1 Second row of the filter.

679

* @param[in] m2 Third row of the filter.

680

* @param[in] dilation_x Dilation, in elements across x.

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

681

* @param[in] stridex Stride value in elements across x.

682

* @param[in] input_offset (Optional) Input quantization offset.

Usama Arif

2019-04-12 10:29:17 +0100

[diff] [blame]

683

*

684

*/

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

685

inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,

686

const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,

687

const size_t dilation_x, unsigned int stridex, int input_offset = 0)

Usama Arif

2019-04-12 10:29:17 +0100

[diff] [blame]

688

{

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

689

float16x8x2_t out =

Usama Arif

2019-04-12 10:29:17 +0100

[diff] [blame]

690

{

691

{

692

single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),

693

single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)

694

}

695

};

Usama Arif

2019-04-12 10:29:17 +0100

[diff] [blame]

696

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

697

if(stridex == 2)

698

{

699

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);

700

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);

701

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3);

702

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4);

703

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5);

704

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);

705

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);

706

}

707

else if(stridex == 3)

708

{

709

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);

710

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);

711

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);

712

}

Usama Arif

2019-04-12 10:29:17 +0100

[diff] [blame]

713

Usama Arif

2019-04-12 10:29:17 +0100

[diff] [blame]

return out;

}

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

717

/** Perform a convolve3x3 on float16.

718

*

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

719

* @param[in] in_top Pointer to the first row of the input.

720

* @param[in] in_mid Pointer to the second row of the input.

721

* @param[in] in_low Pointer to the third row of the input.

722

* @param[in] m0 First row of the filter.

723

* @param[in] m1 Second row of the filter.

724

* @param[in] m2 Third row of the filter.

725

* @param[in] stridex Stride value in elements across x.

726

* @param[in] input_offset (Optional) Input quantization offset.

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

727

*

728

*/

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

729

inline float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,

730

const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,

731

unsigned int stridex, int input_offset = 0)

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

732

{

Georgios Pinitas

20c246a

2018-09-12 16:45:53 +0100

[diff] [blame]

733

ARM_COMPUTE_UNUSED(input_offset);

alankelly

2019-05-15 23:05:31 +0200

[diff] [blame]

734

735

float16x8x2_t out =

Michalis Spyrou

f464337

2019-11-29 16:17:13 +0000

[diff] [blame]

736

{

alankelly

2019-05-15 23:05:31 +0200

[diff] [blame]

737

{

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

738

vdupq_n_f16(0),

Michalis Spyrou

f464337

2019-11-29 16:17:13 +0000

[diff] [blame]

739

vdupq_n_f16(0)

740

}

741

};

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

742

if(stridex == 2)

743

{

744

const float16x8x2_t vtop = vld2q_f16(in_top);

745

const float16x8x2_t vmid = vld2q_f16(in_mid);

746

const float16x8x2_t vlow = vld2q_f16(in_low);

747

const float16x8_t vtop_end = vld1q_f16(in_top + 16);

748

const float16x8_t vmid_end = vld1q_f16(in_mid + 16);

749

const float16x8_t vlow_end = vld1q_f16(in_low + 16);

alankelly

2019-05-15 23:05:31 +0200

[diff] [blame]

750

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

751

out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);

alankelly

2019-05-15 23:05:31 +0200

[diff] [blame]

752

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

753

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vtop.val[1], m0.val[1]));

754

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop_end, 1), m0.val[2]));

alankelly

2019-05-15 23:05:31 +0200

[diff] [blame]

755

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

756

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));

757

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[1], m1.val[1]));

758

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid_end, 1), m1.val[2]));

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

759

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

760

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));

761

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[1], m2.val[1]));

762

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow_end, 1), m2.val[2]));

}

else

{

const float16x8x3_t vtop =

{

{

vld1q_f16(in_top),

vld1q_f16(in_top + 8),

771

vld1q_f16(in_top + 16)

772

}

773

};

774

const float16x8x3_t vmid =

{

{

vld1q_f16(in_mid),

vld1q_f16(in_mid + 8),

779

vld1q_f16(in_mid + 16)

780

}

781

};

782

const float16x8x3_t vlow =

{

{

vld1q_f16(in_low),

vld1q_f16(in_low + 8),

787

vld1q_f16(in_low + 16)

788

}

789

};

790

out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);

791

out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);

792

793

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));

794

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));

795

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));

796

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));

797

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));

798

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));

799

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));

800

out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));

801

out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));

802

out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));

803

out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));

804

out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));

805

out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));

806

out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));

807

out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));

808

out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));

if(stridex == 3)

{

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);

813

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);

814

out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);

}

}

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

return out;

}

/** Stores a float16x8x2_t array into a memory location.

822

*

823

* @param[in] buffer Pointer to the memory location where the values will be stored.

824

* @param[in] values Values that will be stored.

825

*

826

*/

827

template <unsigned int stridex>

828

void store_results(float16_t *buffer, const float16x8x2_t &values);

829

830

template <>

831

inline void store_results<1>(float16_t *buffer, const float16x8x2_t &values)

832

{

833

vst1q_f16(buffer, values.val[0]);

834

vst1q_f16(buffer + 8, values.val[1]);

}

template <>

inline void store_results<2>(float16_t *buffer, const float16x8x2_t &values)

839

{

840

vst1q_f16(buffer, values.val[0]);

}

template <>

inline void store_results<3>(float16_t *buffer, const float16x8x2_t &values)

845

{

846

vst1_f16(buffer, vget_low_f16(values.val[0]));

847

}

Ioan-Cristian Szabo

5edbd1c

2017-11-13 13:34:08 +0000

[diff] [blame]

848

#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

849

850

/** Get the number of elements processed on 3x3 convolution.

851

*

852

* @param[in] num_elems_written_per_iteration Number of elements written per iteration on 3x3 convolution.

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

853

* @param[in] stridex Stride value in elements across x.

Michalis Spyrou

2017-10-18 17:58:22 +0100

[diff] [blame]

854

*

855

* @return The number of elements processed.

856

*/

Anthony Barbier

2017-12-12 17:17:50 +0000

[diff] [blame]

857

inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)

{

switch(stridex)

{

case 1:

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

862

return num_elems_written_per_iteration;

Anthony Barbier

2017-12-12 17:17:50 +0000

[diff] [blame]

863

case 2:

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

864

return num_elems_written_per_iteration << 1;

Anthony Barbier

2017-12-12 17:17:50 +0000

[diff] [blame]

865

case 3:

Michele Di Giorgio

2020-01-02 12:11:13 +0000

[diff] [blame]

866

return num_elems_written_per_iteration * 3;

Anthony Barbier

2017-12-12 17:17:50 +0000

[diff] [blame]

867

default:

868

ARM_COMPUTE_ERROR("stridex not supported");

869

return 0;

870

}

871

}

Michalis Spyrou