/*
 * Copyright (c) 2017 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"

#include <algorithm>
#include <arm_neon.h>

using namespace arm_compute;

namespace
{
template <unsigned int stridex>
qint16x8_t internal_vld1q(const qint16_t *in);

template <>
qint16x8_t internal_vld1q<1>(const qint16_t *in)
{
    return vld1q_qs16(in);
}

template <>
qint16x8_t internal_vld1q<2>(const qint16_t *in)
{
    const int16x8x2_t tmp = vld2q_s16(in);
    return tmp.val[0];
}

template <>
qint16x8_t internal_vld1q<3>(const qint16_t *in)
{
    const int16x8x3_t tmp = vld3q_s16(in);
    return tmp.val[0];
}

inline qint16x8_t internal_vdupq_n(qint16_t v)
{
    return vdupq_n_qs16(v);
}

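// Note on the strided loads above: vld2q/vld3q de-interleave by a factor of 2/3,
// so val[0] collects elements 0, 2, 4, ... (stride 2) or 0, 3, 6, ... (stride 3).
// For example (illustrative only), with in = {a0, a1, a2, ...} and stridex == 2,
// internal_vld1q<2>(in) yields {a0, a2, a4, a6, a8, a10, a12, a14}: exactly the
// samples a stride-2 convolution needs for 8 consecutive output elements.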
#ifdef ARM_COMPUTE_AARCH64_V8_2
template <unsigned int stridex>
float16x8_t internal_vld1q(const float16_t *in);

template <>
float16x8_t internal_vld1q<1>(const float16_t *in)
{
    return vld1q_f16(in);
}

template <>
float16x8_t internal_vld1q<2>(const float16_t *in)
{
    const float16x8x2_t tmp = vld2q_f16(in);
    return tmp.val[0];
}

template <>
float16x8_t internal_vld1q<3>(const float16_t *in)
{
    const float16x8x3_t tmp = vld3q_f16(in);
    return tmp.val[0];
}

inline float16x8_t internal_vdupq_n(float16_t v)
{
    return vdupq_n_f16(v);
}

inline void internal_vst1q(float16_t *p, const float16x8_t &v)
{
    vst1q_f16(p, v);
}

float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    return vmulq_f16(x, y);
}

inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    return vaddq_f16(x, vmulq_f16(y, z));
}
#endif /* ARM_COMPUTE_AARCH64_V8_2 */

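// The float16_t paths are compiled only when ARM_COMPUTE_AARCH64_V8_2 is defined,
// since vmulq_f16/vaddq_f16 and friends require the ARMv8.2-A half-precision
// arithmetic extension (e.g. building with something like -march=armv8.2-a+fp16;
// the exact flag is toolchain-dependent and an assumption here, not part of this file).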
template <unsigned int stridex>
float32x4_t internal_vld1q(const float *in);

template <>
float32x4_t internal_vld1q<1>(const float *in)
{
    return vld1q_f32(in);
}

template <>
float32x4_t internal_vld1q<2>(const float *in)
{
    const float32x4x2_t tmp = vld2q_f32(in);
    return tmp.val[0];
}

template <>
float32x4_t internal_vld1q<3>(const float *in)
{
    const float32x4x3_t tmp = vld3q_f32(in);
    return tmp.val[0];
}

inline float32x4_t internal_vdupq_n(float v)
{
    return vdupq_n_f32(v);
}

inline void internal_vst1q(float *p, const float32x4_t &v)
{
    vst1q_f32(p, v);
}

float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    return vmulq_f32(x, y);
}

inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    return vmlaq_f32(x, y, z);
}

template <unsigned int stridex>
qint8x8_t internal_vld1q(const qint8_t *in);

template <>
qint8x8_t internal_vld1q<1>(const qint8_t *in)
{
    return vld1_qs8(in);
}

template <>
qint8x8_t internal_vld1q<2>(const qint8_t *in)
{
    const qint8x8x2_t tmp = vld2_s8(in);
    return tmp.val[0];
}

template <>
qint8x8_t internal_vld1q<3>(const qint8_t *in)
{
    const qint8x8x3_t tmp = vld3_s8(in);
    return tmp.val[0];
}

inline qint8x8_t internal_vdupq_n(qint8_t v)
{
    return vdup_n_qs8(v);
}

inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
{
    return vmull_qs8(x, y, fixed_point_position);
}

inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
{
    return vqmlal_qs8(x, y, z, fixed_point_position);
}

inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
{
    vst1q_qs16(p, v);
}

inline void internal_vst1q(int32_t *p, const qint32x4x2_t &v)
{
    vst1q_s32(p, v.val[0]);
    vst1q_s32(p + 4, v.val[1]);
}

template <unsigned int stridex>
qint32x4x2_t internal_vld1q(const qint32_t *in);

template <>
qint32x4x2_t internal_vld1q<1>(const qint32_t *in)
{
    const qint32x4x2_t r =
    {
        {
            vld1q_s32(in),
            vld1q_s32(in + 4)
        }
    };
    return r;
}

inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position)
{
    const qint32x4x2_t r =
    {
        {
            vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position),
            vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position),
        }
    };
    return r;
}

inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position)
{
    const qint32x4x2_t r =
    {
        {
            vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position),
            vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position)
        }
    };
    return r;
}

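// The internal_vld1q/internal_vdupq_n/internal_vmull/internal_vmlal/internal_vst1q
// overloads above form a small type-dispatch shim: the templated convolvers below are
// written once against these names, and overload resolution picks the right vector
// width and arithmetic (F32, F16, QS8/QS16 fixed-point) from the pointer and vector
// types. The fixed_point_position argument is only meaningful for the fixed-point
// overloads; the float ones discard it via ARM_COMPUTE_UNUSED. A hypothetical
// instantiation, for illustration only:
//
//   const float *in  = ...;                                  // some input row
//   float32x4_t  w   = internal_vdupq_n(1.5f);               // splat one weight
//   float32x4_t  acc = internal_vmull(w, internal_vld1q<1>(in), 0);
//   acc              = internal_vmlal(acc, w, internal_vld1q<1>(in + 4), 0);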
constexpr int SmallTensorSizeOptim = 8;
inline bool run_optim_small_tensor(const ITensor *t)
{
    return t->info()->dimension(Window::DimX) <= SmallTensorSizeOptim && t->info()->dimension(Window::DimY) <= SmallTensorSizeOptim;
}

// Optimized convolver for 1x1 kernels, used only where input width and height are both <= 8.
// For a big Z, as in Input=7x7x832, this implementation is faster than the general code because it doesn't need to
// store intermediate results in memory. Temporary results are stored directly in NEON registers and then written to the output buffer.
template <unsigned int stridex>
class convolver_w1x1_i8x8_f32
{
public:
    static void convolve(const Window &window, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
    {
        ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > SmallTensorSizeOptim);
        ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > SmallTensorSizeOptim);

        const int          input_stride_y  = input->info()->strides_in_bytes().y();
        const int          input_stride_z  = input->info()->strides_in_bytes().z();
        const int          output_stride_y = output->info()->strides_in_bytes().y();
        const int          output_stride_z = output->info()->strides_in_bytes().z();
        const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
        const int          kernel_stride_w = weights->info()->strides_in_bytes()[3];
        const int          output_h        = output->info()->dimension(1);
        const int          range_z         = window.z().end() - window.z().start();
        const int          kernel_depth    = weights->info()->dimension(Window::DimZ);
        const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());

        // setup output window for the iterator
        Window window_out = window;
        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
        window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));

        // setup input window for the iterator
        Window window_in = window;
        // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

        Window window_k = calculate_max_window(*weights->info(), Steps(1u));
        Iterator out(output, window_out);
        Iterator in(input, window_in);
        Iterator k(weights, window_k);

        const uint8_t *k_ptr = k.ptr();

        execute_window_loop(window_out, [&](const Coordinates & id)
        {
            const uint8_t *input_ptr = in.ptr();
            uint8_t       *out_ptr   = out.ptr();
            int            ih        = 0;
            int            oh        = 0;
            float32x4_t    accum0[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
            float32x4_t    accum1[SmallTensorSizeOptim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
            for(int oz = 0; oz < range_z; ++oz)
            {
                accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
                accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
                auto p_out_base = out_ptr + oz * output_stride_z;
                for(int p = 0; p < kernel_depth; ++p)
                {
                    const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
                    const auto vk0   = internal_vdupq_n(*k_val);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        const int offset_xy = ih * input_stride_y;
                        auto      in_val    = reinterpret_cast<const float *>(input_ptr + p * input_stride_z + offset_xy);
                        auto      v_in0     = internal_vld1q<stridex>(in_val);
                        auto      v_in1     = internal_vld1q<stridex>(in_val + 4);
                        accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
                        accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
                    }
                }
                for(oh = 0; oh < output_h; ++oh)
                {
                    auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y);
                    vst1q_f32(p_out, accum0[oh]);
                    vst1q_f32(p_out + 4, accum1[oh]);
                }
            }
        },
        in, out);
    }
};

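// Why the accumulator arrays above fit: with width/height <= 8, one output row is at
// most 8 floats, i.e. the two float32x4_t vectors accum0[oh]/accum1[oh], and a whole
// 8-row plane lives in 16 NEON registers. Each kernel-depth slice p then contributes a
// single fused multiply-accumulate per row, and memory is only touched for the final
// stores. For, say, a 7x7x832 input the p-loop runs 832 times with no intermediate
// spills, which is where the speed-up over the generic 1x1 path comes from.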
template <typename T1, typename T2, unsigned int stridex>
class convolver_1x1
{
public:
    static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
    {
        const int          input_stride_y  = input->info()->strides_in_bytes().y();
        const int          input_stride_z  = input->info()->strides_in_bytes().z();
        const int          output_stride_y = output->info()->strides_in_bytes().y();
        const int          output_stride_z = output->info()->strides_in_bytes().z();
        const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
        const int          kernel_stride_w = weights->info()->strides_in_bytes()[3];
        const int          output_w        = output->info()->dimension(0);
        const int          output_h        = output->info()->dimension(1);
        const int          range_z         = window.z().end() - window.z().start();
        const int          kernel_depth    = weights->info()->dimension(Window::DimZ);
        const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
        const int          fixed_point_position = input->info()->fixed_point_position();

        // setup output window for the iterator
        Window window_out = window;
        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
        window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));

        // setup input window for the iterator
        Window window_in = window;
        // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

        Window window_k = calculate_max_window(*weights->info(), Steps(1u));
        Iterator out(output, window_out);
        Iterator in(input, window_in);
        Iterator k(weights, window_k);

        const uint8_t *k_ptr = k.ptr();

        execute_window_loop(window_out, [&](const Coordinates & id)
        {
            /*
                For a detailed explanation of how the algorithm works, refer to template <> class convolver_3x3<1>.
            */
            const uint8_t *input_ptr = in.ptr();
            uint8_t       *out_ptr   = out.ptr();
            int            ih        = 0;
            int            oh        = 0;
            for(int oz = 0; oz < range_z; ++oz)
            {
                auto p_out_base = out_ptr + oz * output_stride_z;
                // Step 1
                {
                    const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
                    const auto vk    = internal_vdupq_n(*k_val);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        const int offset_xy = ih * input_stride_y;
                        auto      in_val    = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
                        auto      p_out     = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
                        {
                            internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val), fixed_point_position));
                        }
                    }
                }

                // Step 2
                for(int p = 1; p < kernel_depth; ++p)
                {
                    const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
                    const auto vk    = internal_vdupq_n(*k_val);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        const int offset_xy = ih * input_stride_y;
                        auto      in_val    = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
                        auto      p_out     = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
                        {
                            internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val), fixed_point_position));
                        }
                    }
                }
            }
        },
        in, out);
    }
};

#ifdef ARM_COMPUTE_AARCH64_V8_2
inline float16x8x3_t load_matrix_row(const float16_t *ptr)
{
    /* ptr is a pointer to a row in a 3x3 matrix; the function returns 3 vectors holding exactly the same value in all lanes:
       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
    const float16x8x3_t r =
    {
        {
            vld1q_dup_f16(ptr),
            vld1q_dup_f16(1 + ptr),
            vld1q_dup_f16(2 + ptr)
        }
    };
    return r;
}

template <unsigned int stridex>
float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
                           int fixed_point_position);

template <>
float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
                              int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);

    const float16x8x3_t vtop =
    {
        {
            vld1q_f16(in_top),
            vld1q_f16(in_top + 8),
            vld1q_f16(in_top + 16)
        }
    };
    const float16x8x3_t vmid =
    {
        {
            vld1q_f16(in_mid),
            vld1q_f16(in_mid + 8),
            vld1q_f16(in_mid + 16)
        }
    };
    const float16x8x3_t vlow =
    {
        {
            vld1q_f16(in_low),
            vld1q_f16(in_low + 8),
            vld1q_f16(in_low + 16)
        }
    };
    float16x8x2_t out =
    {
        {
            vmulq_f16(vtop.val[0], m0.val[0]),
            vmulq_f16(vtop.val[1], m0.val[0])
        }
    };
    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0]));
    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1]));
    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2]));
    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0]));
    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1]));
    out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2]));
    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1]));
    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2]));
    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0]));
    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1]));
    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2]));
    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0]));
    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
    out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
    return out;
}

template <>
inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
                                     int fixed_point_position)
{
    float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2);
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3);
    return out;
}

template <>
inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
                                     int fixed_point_position)
{
    float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
    return out;
}

template <unsigned int stridex>
void store_results(float16_t *buffer, const float16x8x2_t &values);

template <>
void store_results<1>(float16_t *buffer, const float16x8x2_t &values)
{
    vst1q_f16(buffer, values.val[0]);
    vst1q_f16(buffer + 8, values.val[1]);
}

template <>
void store_results<2>(float16_t *buffer, const float16x8x2_t &values)
{
    vst1q_f16(buffer, values.val[0]);
}

template <>
void store_results<3>(float16_t *buffer, const float16x8x2_t &values)
{
    vst1_f16(buffer, vget_low_f16(values.val[0]));
}

template <unsigned int stridex>
void accumulate_results(float16_t *buffer, const float16x8x2_t &values);

template <>
void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values)
{
    vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
    vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1]));
}

template <>
void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values)
{
    vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
}

template <>
void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
{
    vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0])));
}

#endif /* ARM_COMPUTE_AARCH64_V8_2 */

inline float32x4x3_t load_matrix_row(const float *ptr)
{
    const float32x4x3_t r =
    {
        {
            vld1q_dup_f32(ptr),
            vld1q_dup_f32(1 + ptr),
            vld1q_dup_f32(2 + ptr)
        }
    };
    return r;
}
inline qint8x8x3_t load_matrix_row(const qint8_t *ptr)
{
    /* ptr is a pointer to a row in a 3x3 matrix; the function returns 3 vectors holding exactly the same value in all lanes:
       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
    const qint8x8x3_t r =
    {
        {
            vld1_dup_qs8(ptr),
            vld1_dup_qs8(1 + ptr),
            vld1_dup_qs8(2 + ptr)
        }
    };
    return r;
}

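// load_matrix_row relies on vld1q_dup_f32/vld1_dup_qs8-style broadcast loads: each of
// the three kernel-row weights is splatted across every lane of its own vector. That
// lets a single multiply-accumulate apply one weight to 4 (or 8) horizontally adjacent
// input pixels at once, which is the core of the convolve_3x3/convolve_5x5 routines below.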
template <unsigned int stridex>
float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                           const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position);

inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
{
    const float32x4x3_t m00 =
    {
        {
            vld1q_dup_f32(m0),
            vld1q_dup_f32(m1),
            vld1q_dup_f32(m2)
        }
    };
    return m00;
}

inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4)
{
    const float32x4x2_t m00 =
    {
        {
            vld1q_dup_f32(m3),
            vld1q_dup_f32(m4)
        }
    };
    return m00;
}

inline float32x4x3_t load_input(const float *const in)
{
    const float32x4x3_t vin =
    {
        {
            vld1q_f32(in),
            vld1q_f32(in + 4),
            vld1q_f32(in + 8)
        }
    };
    return vin;
}

template <>
inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    const float32x4x3_t vin0 = load_input(in_0);
    const float32x4x3_t vin1 = load_input(in_1);
    const float32x4x3_t vin2 = load_input(in_2);
    const float32x4x3_t vin3 = load_input(in_3);
    const float32x4x3_t vin4 = load_input(in_4);
    const float32x4x3_t m00  = load_matrix_hi(m0, 1 + m0, 2 + m0);
    const float32x4x2_t m01  = load_matrix_lo(3 + m0, 4 + m0);
    const float32x4x3_t m10  = load_matrix_hi(m1, 1 + m1, 2 + m1);
    const float32x4x2_t m11  = load_matrix_lo(3 + m1, 4 + m1);
    const float32x4x3_t m20  = load_matrix_hi(m2, 1 + m2, 2 + m2);
    const float32x4x2_t m21  = load_matrix_lo(3 + m2, 4 + m2);
    const float32x4x3_t m30  = load_matrix_hi(m3, 1 + m3, 2 + m3);
    const float32x4x2_t m31  = load_matrix_lo(3 + m3, 4 + m3);
    const float32x4x3_t m40  = load_matrix_hi(m4, 1 + m4, 2 + m4);
    const float32x4x2_t m41  = load_matrix_lo(3 + m4, 4 + m4);

    float32x4x2_t out =
    {
        {
            vmulq_f32(vin0.val[0], m00.val[0]),
            vmulq_f32(vin0.val[1], m00.val[0])
        }
    };

    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);

    out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);

    out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);

    out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);

    out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);

    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);

    out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);

    out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);

    out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);

    out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);

    return out;
}

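// The vextq_f32(a, b, n) calls above synthesise the horizontally shifted input windows:
// they concatenate two adjacent 4-lane vectors and take four lanes starting at lane n,
// so the column offsets 1..3 of the 5x5 (and 3x3) window come from register shuffles
// instead of extra unaligned loads. With a = {x0, x1, x2, x3} and b = {x4, x5, x6, x7},
// vextq_f32(a, b, 1) == {x1, x2, x3, x4}.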
template <>
inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
    return out;
}

template <>
inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
{
    float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
    return out;
}

template <unsigned int stridex>
float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);

template <>
inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);

    const float32x4x3_t vtop =
    {
        {
            vld1q_f32(in_top),
            vld1q_f32(in_top + 4),
            vld1q_f32(in_top + 8)
        }
    };
    const float32x4x3_t vmid =
    {
        {
            vld1q_f32(in_mid),
            vld1q_f32(in_mid + 4),
            vld1q_f32(in_mid + 8)
        }
    };
    const float32x4x3_t vlow =
    {
        {
            vld1q_f32(in_low),
            vld1q_f32(in_low + 4),
            vld1q_f32(in_low + 8)
        }
    };
    float32x4x2_t out =
    {
        {
            vmulq_f32(vtop.val[0], m0.val[0]),
            vmulq_f32(vtop.val[1], m0.val[0])
        }
    };
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);

    out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);

    out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);

    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);

    out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);

    out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
    return out;
}

template <>
inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
{
    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
    return out;
}

template <>
inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
{
    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
    return out;
}

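// For stridex == 2 and stridex == 3 the convolve_3x3/convolve_5x5 specialisations reuse
// the stride-1 result and then compact it: every output lane of the strided variant is
// one of the stride-1 outputs, so vgetq_lane/vsetq_lane simply gather lanes 0, 2, 4, ...
// (or 0, 3, 6, ...) into the front of out.val[0]. Computing a few throw-away lanes and
// shuffling is cheaper here than a dedicated strided inner loop.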
template <unsigned int stridex>
qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position);

template <>
inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);

    const qint8x8x3_t vtop =
    {
        {
            vld1_qs8(in_top),
            vld1_qs8(in_top + 8),
            vld1_qs8(in_top + 16)
        }
    };
    const qint8x8x3_t vmid =
    {
        {
            vld1_qs8(in_mid),
            vld1_qs8(in_mid + 8),
            vld1_qs8(in_mid + 16)
        }
    };
    const qint8x8x3_t vlow =
    {
        {
            vld1_qs8(in_low),
            vld1_qs8(in_low + 8),
            vld1_qs8(in_low + 16)
        }
    };
    qint16x8x2_t out =
    {
        {
            vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position),
            vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position)
        }
    };
    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position);
    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position);
    out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position);
    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position);
    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position);
    out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position);
    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position);
    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position);
    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position);
    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position);
    out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position);
    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position);
    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position);
    out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position);
    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position);
    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position);
    return out;
}

template <>
inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
{
    qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
    out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1);
    out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2);
    out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3);
    out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4);
    out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5);
    out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6);
    out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7);
    return out;
}

template <>
inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position)
{
    qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
    out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1);
    out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2);
    out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3);
    return out;
}

template <unsigned int stridex>
void store_results(float *buffer, const float32x4x2_t &values);

template <>
void store_results<1>(float *buffer, const float32x4x2_t &values)
{
    vst1q_f32(buffer, values.val[0]);
    vst1q_f32(buffer + 4, values.val[1]);
}

template <>
void store_results<2>(float *buffer, const float32x4x2_t &values)
{
    vst1q_f32(buffer, values.val[0]);
}

template <>
void store_results<3>(float *buffer, const float32x4x2_t &values)
{
    vst1_f32(buffer, vget_low_f32(values.val[0]));
}

template <unsigned int stridex>
void store_results(qint16_t *buffer, const qint16x8x2_t &values);

template <>
void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
{
    vst1q_qs16(buffer, values.val[0]);
    vst1q_qs16(buffer + 8, values.val[1]);
}

template <>
void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
{
    vst1q_qs16(buffer, values.val[0]);
}

template <>
void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
{
    vst1_qs16(buffer, vget_low_s16(values.val[0]));
}

template <unsigned int stridex>
void accumulate_results(float *buffer, const float32x4x2_t &values);

template <>
void accumulate_results<1>(float *buffer, const float32x4x2_t &values)
{
    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
    vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
}

template <>
void accumulate_results<2>(float *buffer, const float32x4x2_t &values)
{
    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
}

template <>
void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
{
    vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
}

template <unsigned int stridex>
void accumulate_results(qint16_t *buffer, const qint16x8x2_t &values);

template <>
void accumulate_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
{
    vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
    vst1q_qs16(buffer + 8, vqaddq_qs16(vld1q_qs16(buffer + 8), values.val[1]));
}

template <>
void accumulate_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
{
    vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
}

template <>
void accumulate_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
{
    vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
}

template <unsigned int stridex>
int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration);

template <>
int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration)
{
    return num_elems_written_per_iteration;
}

template <>
int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration)
{
    return num_elems_written_per_iteration << 1;
}

template <>
int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration)
{
    return num_elems_written_per_iteration * 3;
}

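// get_input_num_elems_processed converts "output elements written per iteration" into
// the matching input advance: a stride-s convolution consumes s input columns per output
// column. E.g. if 4 outputs are written per iteration, the input pointers move forward
// by 4, 8 or 12 elements for stridex 1, 2 or 3 respectively (hence the << 1 for stride 2).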
template <typename T1, typename T2, unsigned int stridex>
class convolver_3x3
{
public:
    static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
    {
        ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
        const int          input_stride_x  = input->info()->strides_in_bytes().x();
        const int          input_stride_y  = input->info()->strides_in_bytes().y();
        const int          input_stride_z  = input->info()->strides_in_bytes().z();
        const int          output_stride_y = output->info()->strides_in_bytes().y();
        const int          output_stride_z = output->info()->strides_in_bytes().z();
        const int          kernel_stride_x = weights->info()->strides_in_bytes().x();
        const int          kernel_stride_y = weights->info()->strides_in_bytes().y();
        const int          kernel_stride_z = weights->info()->strides_in_bytes().z();
        const int          kernel_stride_w = weights->info()->strides_in_bytes()[3];
        const int          output_w        = output->info()->dimension(0);
        const int          output_h        = output->info()->dimension(1);
        const int          num_planes_z    = window.z().end() - window.z().start();
        const int          delta_input     = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
        const int          kernel_depth    = weights->info()->dimension(Window::DimZ);
        const unsigned int conv_stride_y   = std::get<1>(conv_info.stride());
        const unsigned int conv_pad_x      = std::get<0>(conv_info.pad());
        const unsigned int conv_pad_y      = std::get<1>(conv_info.pad());
        const int          fixed_point_position = input->info()->fixed_point_position();

        // setup output window for the iterator
        Window window_out = window;
        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
        window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));

        // setup input window for the iterator
        Window window_in = window;
        // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

        Window window_k = calculate_max_window(*weights->info(), Steps(1u));

        Iterator out(output, window_out);
        Iterator in(input, window_in);
        Iterator k(weights, window_k);

        const uint8_t *k_ptr = k.ptr();

        execute_window_loop(window_out, [&](const Coordinates & id)
        {
            const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
            uint8_t       *out_ptr   = out.ptr();
            int            ih        = 0;
            int            oh        = 0;
            /*
                Each thread executing this kernel computes one or more planes of the output volume.

                Let's say the 3rd dimension of the output volume is 32; the first thread will compute the output for Z = [0,7], the second thread will compute the output for Z = [8,15],
                the third thread [16,23] and the fourth thread [24,31].

                The algorithm's outer loop iterates over Z, P, Y, X, where P is the depth/3rd dimension of each kernel. This order is not arbitrary: its main benefit is that we set up
                the NEON registers containing the kernel's values only once and then compute each XY using the preloaded registers, as opposed to doing this for every XY value.

                The algorithm does not require allocating any additional memory and computes the results directly in-place in two stages:
                    1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
                    2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1.
            */
            for(int oz = 0; oz < num_planes_z; ++oz)
            {
                const int zoffset    = id.z() + oz;
                uint8_t  *p_out_base = out_ptr + oz * output_stride_z;
                // Step 1
                {
                    const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto vk_r0    = load_matrix_row(ptr_k_r0);
                    const auto vk_r1    = load_matrix_row(ptr_k_r1);
                    const auto vk_r2    = load_matrix_row(ptr_k_r2);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
                        auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
                        auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
                        auto p_out  = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                            in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
                        {
                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
                            store_results<stridex>(p_out, vres);
                        }
                    }
                }
                // Step 2
                for(int p = 1; p < kernel_depth; ++p)
                {
                    const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
                    const uint8_t *input_base = input_ptr + p * input_stride_z;
                    const auto     ptr_k_r0   = reinterpret_cast<const T1 *>(ptr_k_base);
                    const auto     ptr_k_r1   = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);
                    const auto     ptr_k_r2   = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);
                    const auto     vk_r0      = load_matrix_row(ptr_k_r0);
                    const auto     vk_r1      = load_matrix_row(ptr_k_r1);
                    const auto     vk_r2      = load_matrix_row(ptr_k_r2);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y);
                        auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y);
                        auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y);
                        auto p_out  = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                            in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
                        {
                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
                            accumulate_results<stridex>(p_out, vres);
                        }
                    }
                }
            }
        },
        in, out);
    }
};

Pablo Tello06da39d2017-08-10 15:10:40 +01001148template <typename T1, typename T2, unsigned int stridex>
1149class convolver_5x5
1150{
1151public:
1152 static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
1153 const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
1154 {
1155 ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
1156 const int input_stride_x = input->info()->strides_in_bytes().x();
1157 const int input_stride_y = input->info()->strides_in_bytes().y();
1158 const int input_stride_z = input->info()->strides_in_bytes().z();
1159 const int output_stride_y = output->info()->strides_in_bytes().y();
1160 const int output_stride_z = output->info()->strides_in_bytes().z();
1161 const int kernel_stride_x = weights->info()->strides_in_bytes().x();
1162 const int kernel_stride_y = weights->info()->strides_in_bytes().y();
1163 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
1164 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
1165 const int output_w = output->info()->dimension(0);
1166 const int output_h = output->info()->dimension(1);
1167 const int num_planes_z = window.z().end() - window.z().start();
1168 const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
1169 const int kernel_depth = weights->info()->dimension(Window::DimZ);
1170 const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
1171 const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
1172 const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
1173 const int fixed_point_position = input->info()->fixed_point_position();
1174
1175 // setup output window for the iterator
1176 Window window_out = window;
1177 window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
1178 window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
1179 window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
1180
1181 // setup input window for the iterator
1182 Window window_in = window;
1183 // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
1184 window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
1185 window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
1186 window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
1187
1188 Window window_k = calculate_max_window(*weights->info(), Steps(1u));
1189
1190 Iterator out(output, window_out);
1191 Iterator in(input, window_in);
1192 Iterator k(weights, window_k);
1193
1194 const uint8_t *k_ptr = k.ptr();
1195
1196 execute_window_loop(window_out, [&](const Coordinates & id)
1197 {
1198 const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
1199 uint8_t *out_ptr = out.ptr();
1200 int ih = 0;
1201 int oh = 0;
            for(int oz = 0; oz < num_planes_z; ++oz)
            {
                const int zoffset   = id.z() + oz;
                uint8_t *p_out_base = out_ptr + oz * output_stride_z;
                // Step 1: convolve the first input plane with its five kernel rows and
                // store the result, initialising this output plane.
                {
                    const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        auto in_0  = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
                        auto in_1  = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
                        auto in_2  = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
                        auto in_3  = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);
                        auto in_4  = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);
                        auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                            in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
                        {
                            auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
                            store_results<stridex>(p_out, vres);
                        }
                    }
                }
                // Step 2: convolve the remaining kernel_depth - 1 input planes and
                // accumulate onto the partial results written in Step 1.
                for(int p = 1; p < kernel_depth; ++p)
                {
                    const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);

                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        auto in_0  = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
                        auto in_1  = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
                        auto in_2  = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
                        auto in_3  = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);
                        auto in_4  = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);
                        auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                            in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
                        {
                            auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
                            accumulate_results<stridex>(p_out, vres);
                        }
                    }
                }
            }
        },
        in, out);
    }
};

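// The convolve_* helpers below turn the runtime stride from PadStrideInfo into a
// compile-time template argument, so each convolver_* instantiation gets inner
// loops and vector load/store helpers specialised for stride 1, 2 or 3.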
template <typename T1, typename T2>
inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    switch(conv_stride_x)
    {
        case 1:
            convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 2:
            convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 3:
            convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        default:
            ARM_COMPUTE_ERROR("Not implemented");
    }
}

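// Specialisation for F32 1x1 convolution: small input tensors are routed to
// convolver_w1x1_i8x8_f32, an optimised path (gated by run_optim_small_tensor)
// for which configure() selects 8 elements written per iteration instead of 4.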
template <>
inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                                       const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    if(run_optim_small_tensor(input))
    {
        switch(conv_stride_x)
        {
            case 1:
                convolver_w1x1_i8x8_f32<1>::convolve(window, input, weights, output, conv_info);
                break;
            case 2:
                convolver_w1x1_i8x8_f32<2>::convolve(window, input, weights, output, conv_info);
                break;
            case 3:
                convolver_w1x1_i8x8_f32<3>::convolve(window, input, weights, output, conv_info);
                break;
            default:
                ARM_COMPUTE_ERROR("Not implemented");
        }
    }
    else
    {
        switch(conv_stride_x)
        {
            case 1:
                convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
                break;
            case 2:
                convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
                break;
            case 3:
                convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
                break;
            default:
                ARM_COMPUTE_ERROR("Not implemented");
        }
    }
}

template <typename T1, typename T2>
inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    switch(conv_stride_x)
    {
        case 1:
            convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 2:
            convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 3:
            convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        default:
            ARM_COMPUTE_ERROR("Not implemented");
    }
}

template <typename T1, typename T2>
inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    switch(conv_stride_x)
    {
        case 1:
            convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 2:
            convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 3:
            convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        default:
            ARM_COMPUTE_ERROR("Not implemented");
    }
}

} // namespace

NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
    : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0),
      _num_elems_written_per_iteration(0)
{
}

BorderSize NEDirectConvolutionLayerKernel::border_size() const
{
    return _border_size;
}

void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::QS16, DataType::F32);
    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
                             "Pad > 0 not supported for 1x1 weights");
    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
                             "Pad > 1 not supported for 3x3 weights");
    ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2),
                             "Pad > 2 not supported for 5x5 weights");

    ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
    ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
    ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);

    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    const unsigned int conv_pad_x    = std::get<0>(conv_info.pad());
    const unsigned int conv_pad_y    = std::get<1>(conv_info.pad());

    _input       = input;
    _weights     = weights;
    _output      = output;
    _conv_info   = conv_info;
    _kernel_size = weights->info()->dimension(0);
    _border_size = BorderSize(conv_pad_y, conv_pad_x);
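    // Note: this initial border only reflects the user padding; its right and
    // bottom components are enlarged further down, once the per-iteration
    // element counts are known.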

    const unsigned int kernel_size = weights->info()->dimension(0);

    // Get convolved dimensions
    unsigned int output_width  = 0;
    unsigned int output_height = 0;
    std::tie(output_width, output_height) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
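    // (For reference, each output dimension is roughly (input + 2 * pad - kernel_size) / stride + 1;
    // scaled_dimensions() applies the exact rounding.)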

    TensorShape output_shape = input->info()->tensor_shape();
    output_shape.set(0, output_width);
    output_shape.set(1, output_height);
    output_shape.set(2, weights->info()->dimension(3));

    DataType data_type = input->info()->data_type();

    if(is_data_type_fixed_point(data_type))
    {
        // Promote the data type in case of fixed point: partial results are
        // accumulated at higher precision (QS8 -> QS16, QS16 -> QS32)
        data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
    }

    // Output auto initialization if not yet initialized
    auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position());

    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, output->info()->data_type());

    switch(_kernel_size)
    {
        case 1:
        {
            switch(input->info()->data_type())
            {
#ifdef ARM_COMPUTE_AARCH64_V8_2
                case DataType::F16:
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
                case DataType::QS8:
                case DataType::QS16:
                    _num_elems_written_per_iteration = 8;
                    break;
                case DataType::F32:
                    if(run_optim_small_tensor(input))
                    {
                        _num_elems_written_per_iteration = 8;
                    }
                    else
                    {
                        _num_elems_written_per_iteration = 4;
                    }
                    break;
                default:
                    ARM_COMPUTE_ERROR("Data type not supported.");
                    break;
            }
            _num_weight_elems_read_per_row = kernel_size;
            _num_elems_read_per_iteration  = conv_stride_x * _num_elems_written_per_iteration;
            break;
        }
        case 3:
        case 5:
        {
            switch(input->info()->data_type())
            {
                case DataType::F32:
                    _num_weight_elems_read_per_row   = 4 + _kernel_size - 1;
                    _num_elems_read_per_iteration    = 12;
                    _num_elems_written_per_iteration = 16 >> conv_stride_x;
                    break;
#ifdef ARM_COMPUTE_AARCH64_V8_2
                case DataType::F16:
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
                case DataType::QS8:
                case DataType::QS16:
                    _num_weight_elems_read_per_row   = 8 + _kernel_size - 1;
                    _num_elems_read_per_iteration    = 24;
                    _num_elems_written_per_iteration = 32 >> conv_stride_x;
                    break;
                default:
                    ARM_COMPUTE_ERROR("Data type not supported.");
                    break;
            }
        }
        break;
        default:
        {
            ARM_COMPUTE_ERROR("Not implemented");
            break;
        }
    }

    // Calculate the right and bottom border, i.e. how far past the input the
    // vectorised loops may read when processing the last elements of a row/column
    const unsigned int conv_stride_y = std::get<1>(_conv_info.stride());
    const int          input_width   = input->info()->dimension(0);
    const int          input_height  = input->info()->dimension(1);
    const int          upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width;
    const int          upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height;
    _border_size.right  = std::max(upper_bound_w, static_cast<int>(_kernel_size));
    _border_size.bottom = std::max(upper_bound_h, static_cast<int>(_kernel_size));
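    // Declare the exact regions read (input, weights) and written (output) per
    // iteration so that update_window_and_padding() can grow the tensors'
    // paddings to match.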
    Window win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration));
    AccessWindowStatic     input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
    AccessWindowStatic     weights_access(weights->info(), 0, 0, _num_weight_elems_read_per_row, _kernel_size);
    AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration);
    update_window_and_padding(win, input_access, weights_access, output_access);
    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape()));

    INEKernel::configure(win);
}

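// Typical driving sequence for this kernel, as a minimal sketch only: the tensor
// names and the split dimension passed to the scheduler are illustrative
// assumptions, not taken from this file.
//
//   NEDirectConvolutionLayerKernel kernel;
//   kernel.configure(&src, &weights, &dst, PadStrideInfo(1 /* stride_x */, 1 /* stride_y */, 0 /* pad_x */, 0 /* pad_y */));
//   NEScheduler::get().schedule(&kernel, Window::DimZ); // split dimension is an assumption
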
void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

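    // Dispatch mirrors configure(): first on kernel size, then on the input data
    // type; unsupported combinations fall through to the error macros below.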
    const int kernel_size = _weights->info()->dimension(0);

    switch(kernel_size)
    {
        case 1:
        {
            switch(_input->info()->data_type())
            {
                case DataType::QS8:
                    convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
                case DataType::QS16:
                    convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
                case DataType::F32:
                    convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
#ifdef ARM_COMPUTE_AARCH64_V8_2
                case DataType::F16:
                    convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
                default:
                    ARM_COMPUTE_ERROR("Data type not supported");
                    break;
            }
            break;
        }
        case 3:
        {
            switch(_input->info()->data_type())
            {
                case DataType::QS8:
                    convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
                case DataType::F32:
                    convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
#ifdef ARM_COMPUTE_AARCH64_V8_2
                case DataType::F16:
                    convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
#endif /* ARM_COMPUTE_AARCH64_V8_2 */
                default:
                    ARM_COMPUTE_ERROR("Data type not supported");
                    break;
            }
            break;
        }
        case 5:
        {
            switch(_input->info()->data_type())
            {
                case DataType::F32:
                    convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
                default:
                    ARM_COMPUTE_ERROR("Data type not supported");
                    break;
            }
            break;
        }

        default:
        {
            ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
            break;
        }
    }
}