/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h"
#include "arm_compute/core/NEON/kernels/convolution/NEDirectConvolutionDetail.h"

#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/NEON/NEFixedPoint.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"

#include <algorithm>
#include <arm_neon.h>

using namespace arm_compute;
using namespace arm_compute::detail;

namespace
{
template <unsigned int stridex>
qint16x8_t internal_vld1q(const qint16_t *in);

template <>
qint16x8_t internal_vld1q<1>(const qint16_t *in)
{
    return vld1q_qs16(in);
}

template <>
qint16x8_t internal_vld1q<2>(const qint16_t *in)
{
    const int16x8x2_t tmp = vld2q_s16(in);
    return tmp.val[0];
}

template <>
qint16x8_t internal_vld1q<3>(const qint16_t *in)
{
    const int16x8x3_t tmp = vld3q_s16(in);
    return tmp.val[0];
}
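
// The strided loads above implement subsampling with NEON de-interleaving
// loads: vld2q/vld3q split consecutive elements across vectors, so keeping
// val[0] yields every 2nd (or 3rd) element, i.e. exactly the inputs a
// stride-2 (or stride-3) convolution consumes for consecutive outputs.
// Illustrative sketch (hypothetical values, not part of the kernel):
//   const int16_t in[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
//   const int16x8x2_t d  = vld2q_s16(in);
//   // d.val[0] == { 0, 2, 4, 6, 8, 10, 12, 14 } -> the stride-2 samples
//   // d.val[1] == { 1, 3, 5, 7, 9, 11, 13, 15 }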

inline qint16x8_t internal_vdupq_n(qint16_t v)
{
    return vdupq_n_qs16(v);
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template <unsigned int stridex>
float16x8_t internal_vld1q(const float16_t *in);

template <>
float16x8_t internal_vld1q<1>(const float16_t *in)
{
    return vld1q_f16(in);
}

template <>
float16x8_t internal_vld1q<2>(const float16_t *in)
{
    const float16x8x2_t tmp = vld2q_f16(in);
    return tmp.val[0];
}

template <>
float16x8_t internal_vld1q<3>(const float16_t *in)
{
    const float16x8x3_t tmp = vld3q_f16(in);
    return tmp.val[0];
}

inline float16x8_t internal_vdupq_n(float16_t v)
{
    return vdupq_n_f16(v);
}

inline void internal_vst1q(float16_t *p, const float16x8_t &v)
{
    vst1q_f16(p, v);
}

float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    return vmulq_f16(x, y);
}

inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    return vaddq_f16(x, vmulq_f16(y, z));
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

template <unsigned int stridex>
float32x4_t internal_vld1q(const float *in);

template <>
float32x4_t internal_vld1q<1>(const float *in)
{
    return vld1q_f32(in);
}

template <>
float32x4_t internal_vld1q<2>(const float *in)
{
    const float32x4x2_t tmp = vld2q_f32(in);
    return tmp.val[0];
}

template <>
float32x4_t internal_vld1q<3>(const float *in)
{
    const float32x4x3_t tmp = vld3q_f32(in);
    return tmp.val[0];
}

inline float32x4_t internal_vdupq_n(float v)
{
    return vdupq_n_f32(v);
}

inline void internal_vst1q(float *p, const float32x4_t &v)
{
    vst1q_f32(p, v);
}

float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    return vmulq_f32(x, y);
}

inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    return vmlaq_f32(x, y, z);
}

template <unsigned int stridex>
qint8x8_t internal_vld1q(const qint8_t *in);

template <>
qint8x8_t internal_vld1q<1>(const qint8_t *in)
{
    return vld1_qs8(in);
}

template <>
qint8x8_t internal_vld1q<2>(const qint8_t *in)
{
    const qint8x8x2_t tmp = vld2_s8(in);
    return tmp.val[0];
}

template <>
qint8x8_t internal_vld1q<3>(const qint8_t *in)
{
    const qint8x8x3_t tmp = vld3_s8(in);
    return tmp.val[0];
}

inline qint8x8_t internal_vdupq_n(qint8_t v)
{
    return vdup_n_qs8(v);
}

inline qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position)
{
    return vmull_qs8(x, y, fixed_point_position);
}

inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position)
{
    return vqmlal_qs8(x, y, z, fixed_point_position);
}

inline void internal_vst1q(qint16_t *p, const qint16x8_t &v)
{
    vst1q_qs16(p, v);
}

inline void internal_vst1q(int32_t *p, const qint32x4x2_t &v)
{
    vst1q_s32(p, v.val[0]);
    vst1q_s32(p + 4, v.val[1]);
}

template <unsigned int stridex>
qint32x4x2_t internal_vld1q(const qint32_t *in);

template <>
qint32x4x2_t internal_vld1q<1>(const qint32_t *in)
{
    const qint32x4x2_t r =
    {
        {
            vld1q_s32(in),
            vld1q_s32(in + 4)
        }
    };
    return r;
}

inline qint32x4x2_t internal_vmull(const qint16x8_t &x, const qint16x8_t &y, int fixed_point_position)
{
    const qint32x4x2_t r =
    {
        {
            vmull_qs16(vget_low_s16(x), vget_low_s16(y), fixed_point_position),
            vmull_qs16(vget_high_s16(x), vget_high_s16(y), fixed_point_position),
        }
    };
    return r;
}

inline qint32x4x2_t internal_vmlal(const qint32x4x2_t &x, const qint16x8_t &y, const qint16x8_t &z, int fixed_point_position)
{
    const qint32x4x2_t r =
    {
        {
            vqmlal_qs16(x.val[0], vget_low_s16(y), vget_low_s16(z), fixed_point_position),
            vqmlal_qs16(x.val[1], vget_high_s16(y), vget_high_s16(z), fixed_point_position)
        }
    };
    return r;
}
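
// Fixed-point note: fixed_point_position is the number of fractional bits of
// the QS8/QS16 values. Worked example of the intended arithmetic (a sketch,
// assuming the NEFixedPoint multipliers rescale the raw product by
// fixed_point_position, with saturation):
//   QS8, fixed_point_position = 4: 0.5 is stored as 8, 0.25 as 4
//   raw product : 8 * 4 = 32 (carries 2 * 4 = 8 fractional bits)
//   rescaled    : 32 >> 4 = 2 -> 2 / 2^4 = 0.125 == 0.5 * 0.25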

constexpr int small_tensor_size_optim = 8;
inline bool run_optim_small_tensor_info(const ITensorInfo *t)
{
    return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim;
}

inline bool run_optim_small_tensor(const ITensor *t)
{
    return run_optim_small_tensor_info(t->info());
}

// Optimized convolver for 1x1 kernels, used only where the input width and height are both <= 8
// For a large Z, as in Input=7x7x832, this implementation is faster than the general code because it doesn't need to
// store intermediate results in memory. Temporary results are stored in NEON registers directly and then written to the output buffer.
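// Sketch of the idea (illustrative only): with stride 1 an output row is at
// most 8 floats wide, so it fits in the two float32x4_t accumulators
// accum0[oh] and accum1[oh]; the whole plane can live largely in NEON
// registers while the kernel_depth loop runs, and memory is touched once per
// plane at the end:
//   for p in [0, kernel_depth): accum[oh] += k(p, z) * in(p, row oh)
//   for oh in [0, output_h):    store accum0[oh], accum1[oh] to the output row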
template <unsigned int stridex>
class convolver_w1x1_i8x8_f32
{
public:
    static void convolve(const Window &window, const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
    {
        ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimX) > small_tensor_size_optim);
        ARM_COMPUTE_ERROR_ON(input->info()->dimension(Window::DimY) > small_tensor_size_optim);

        const int input_stride_y = input->info()->strides_in_bytes().y();
        const int input_stride_z = input->info()->strides_in_bytes().z();
        const int output_stride_y = output->info()->strides_in_bytes().y();
        const int output_stride_z = output->info()->strides_in_bytes().z();
        const int kernel_stride_z = weights->info()->strides_in_bytes().z();
        const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
        const int output_h = output->info()->dimension(1);
        const int range_z = window.z().end() - window.z().start();
        const int kernel_depth = weights->info()->dimension(Window::DimZ);
        const unsigned int conv_stride_y = std::get<1>(conv_info.stride());

        // setup output window for the iterator
        Window window_out = window;
        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
        window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));

        // setup input window for the iterator
        Window window_in = window;
        // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

        Window window_k = calculate_max_window(*weights->info(), Steps(1u));
        Iterator out(output, window_out);
        Iterator in(input, window_in);
        Iterator k(weights, window_k);

        const uint8_t *k_ptr = k.ptr();

        execute_window_loop(window_out, [&](const Coordinates & id)
        {
            const uint8_t *input_ptr = in.ptr();
            uint8_t *out_ptr = out.ptr();
            int ih = 0;
            int oh = 0;
            float32x4_t accum0[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
            float32x4_t accum1[small_tensor_size_optim] = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
            for(int oz = 0; oz < range_z; ++oz)
            {
                accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
                accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
                auto p_out_base = out_ptr + oz * output_stride_z;
                for(int p = 0; p < kernel_depth; ++p)
                {
                    const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
                    const auto vk0 = internal_vdupq_n(*k_val);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        const int offset_xy = ih * input_stride_y;
                        auto in_val = reinterpret_cast<const float *>(input_ptr + p * input_stride_z + offset_xy);
                        auto v_in0 = internal_vld1q<stridex>(in_val);
                        auto v_in1 = internal_vld1q<stridex>(in_val + 4);
                        accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
                        accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
                    }
                }
                for(oh = 0; oh < output_h; ++oh)
                {
                    auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y);
                    vst1q_f32(p_out, accum0[oh]);
                    vst1q_f32(p_out + 4, accum1[oh]);
                }
            }
        },
        in, out);
    }
};

template <typename T1, typename T2, unsigned int stridex>
class convolver_1x1
{
public:
    static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
    {
        const int input_stride_y = input->info()->strides_in_bytes().y();
        const int input_stride_z = input->info()->strides_in_bytes().z();
        const int output_stride_y = output->info()->strides_in_bytes().y();
        const int output_stride_z = output->info()->strides_in_bytes().z();
        const int kernel_stride_z = weights->info()->strides_in_bytes().z();
        const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
        const int output_w = output->info()->dimension(0);
        const int output_h = output->info()->dimension(1);
        const int range_z = window.z().end() - window.z().start();
        const int kernel_depth = weights->info()->dimension(Window::DimZ);
        const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
        const int fixed_point_position = input->info()->fixed_point_position();

        // setup output window for the iterator
        Window window_out = window;
        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
        window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));

        // setup input window for the iterator
        Window window_in = window;
        // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

        Window window_k = calculate_max_window(*weights->info(), Steps(1u));
        Iterator out(output, window_out);
        Iterator in(input, window_in);
        Iterator k(weights, window_k);

        const uint8_t *k_ptr = k.ptr();

        execute_window_loop(window_out, [&](const Coordinates & id)
        {
            /*
             For a detailed explanation of how the algorithm works, refer to template <> class convolver_3x3<1>
             */
            const uint8_t *input_ptr = in.ptr();
            uint8_t *out_ptr = out.ptr();
            int ih = 0;
            int oh = 0;
            for(int oz = 0; oz < range_z; ++oz)
            {
                auto p_out_base = out_ptr + oz * output_stride_z;
                // Step 1
                {
                    const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
                    const auto vk = internal_vdupq_n(*k_val);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        const int offset_xy = ih * input_stride_y;
                        auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
                        auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
                        {
                            internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val), fixed_point_position));
                        }
                    }
                }

                // Step 2
                for(int p = 1; p < kernel_depth; ++p)
                {
                    const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
                    const auto vk = internal_vdupq_n(*k_val);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        const int offset_xy = ih * input_stride_y;
                        auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
                        auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
                        {
                            internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val), fixed_point_position));
                        }
                    }
                }
            }
        },
        in, out);
    }
};

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

template <unsigned int stridex>
void accumulate_results(float16_t *buffer, const float16x8x2_t &values);

template <>
void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values)
{
    vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
    vst1q_f16(buffer + 8, vaddq_f16(vld1q_f16(buffer + 8), values.val[1]));
}

template <>
void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values)
{
    vst1q_f16(buffer, vaddq_f16(vld1q_f16(buffer), values.val[0]));
}

template <>
void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
{
    vst1_f16(buffer, vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0])));
}

#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

template <unsigned int stridex>
float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                           const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position);

inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
{
    const float32x4x3_t m00 =
    {
        {
            vld1q_dup_f32(m0),
            vld1q_dup_f32(m1),
            vld1q_dup_f32(m2)
        }
    };
    return m00;
}

inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4)
{
    const float32x4x2_t m00 =
    {
        {
            vld1q_dup_f32(m3),
            vld1q_dup_f32(m4)
        }
    };
    return m00;
}

inline float32x4x3_t load_input(const float *const in)
{
    const float32x4x3_t vin =
    {
        {
            vld1q_f32(in),
            vld1q_f32(in + 4),
            vld1q_f32(in + 8)
        }
    };
    return vin;
}
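
// load_input reads 12 consecutive floats (three float32x4_t). A minimal
// counting argument for why 12 (a sketch, not extra functionality): with
// stride 1 the 5x5 row convolution below produces 8 outputs per iteration,
// and output i needs inputs [i, i + 4], so outputs [0, 7] together need
// inputs [0, 11] -- i.e. 8 + 5 - 1 = 12 elements. The vextq_f32 calls in
// convolve_5x5<1> then form the shifted views of this 12-element window.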

template <>
inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    const float32x4x3_t vin0 = load_input(in_0);
    const float32x4x3_t vin1 = load_input(in_1);
    const float32x4x3_t vin2 = load_input(in_2);
    const float32x4x3_t vin3 = load_input(in_3);
    const float32x4x3_t vin4 = load_input(in_4);
    const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0);
    const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0);
    const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1);
    const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1);
    const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2);
    const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2);
    const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3);
    const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3);
    const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4);
    const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4);

    float32x4x2_t out =
    {
        {
            vmulq_f32(vin0.val[0], m00.val[0]),
            vmulq_f32(vin0.val[1], m00.val[0])
        }
    };

    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);

    out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);

    out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);

    out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);

    out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);

    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);

    out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);

    out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);

    out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);

    out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);

    return out;
}

template <>
inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
{
    ARM_COMPUTE_UNUSED(fixed_point_position);
    float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
    return out;
}

template <>
inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4, int fixed_point_position)
{
    float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4, fixed_point_position);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
    return out;
}
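
// The stride-2 and stride-3 variants reuse the stride-1 result and then
// compact the valid lanes into the front of out.val[0]:
//   stride 2: keep outputs {0, 2, 4, 6} -> out.val[0] = {o0, o2, o4, o6}
//   stride 3: keep outputs {0, 3}       -> out.val[0] lanes 0 and 1 = {o0, o3}
// The matching store_results<stridex>/accumulate_results<stridex> overloads
// then store only that many elements per iteration.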

template <unsigned int stridex>
void accumulate_results(float *buffer, const float32x4x2_t &values);

template <>
void accumulate_results<1>(float *buffer, const float32x4x2_t &values)
{
    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
    vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
}

template <>
void accumulate_results<2>(float *buffer, const float32x4x2_t &values)
{
    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
}

template <>
void accumulate_results<3>(float *buffer, const float32x4x2_t &values)
{
    vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
}

template <unsigned int stridex>
void accumulate_results(qint16_t *buffer, const qint16x8x2_t &values);

template <>
void accumulate_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
{
    vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
    vst1q_qs16(buffer + 8, vqaddq_qs16(vld1q_qs16(buffer + 8), values.val[1]));
}

template <>
void accumulate_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
{
    vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0]));
}

template <>
void accumulate_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
{
    vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0])));
}
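
// The accumulate_results<stridex> overloads mirror the per-iteration store
// widths chosen in validate_and_configure_window below: for F32
// (float32x4x2_t) that is 16 >> stridex outputs, i.e. 8/4/2 for stride 1/2/3,
// and for QS8/QS16 (qint16x8x2_t) it is 32 >> stridex, i.e. 16/8/4. Each
// overload loads the partial result back, adds, and stores only the valid lanes.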

template <typename T1, typename T2, unsigned int stridex>
class convolver_3x3
{
public:
    static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
    {
        ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
        const int input_stride_x = input->info()->strides_in_bytes().x();
        const int input_stride_y = input->info()->strides_in_bytes().y();
        const int input_stride_z = input->info()->strides_in_bytes().z();
        const int output_stride_y = output->info()->strides_in_bytes().y();
        const int output_stride_z = output->info()->strides_in_bytes().z();
        const int kernel_stride_x = weights->info()->strides_in_bytes().x();
        const int kernel_stride_y = weights->info()->strides_in_bytes().y();
        const int kernel_stride_z = weights->info()->strides_in_bytes().z();
        const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
        const int output_w = output->info()->dimension(0);
        const int output_h = output->info()->dimension(1);
        const int num_planes_z = window.z().end() - window.z().start();
        const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
        const int kernel_depth = weights->info()->dimension(Window::DimZ);
        const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
        const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
        const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
        const int fixed_point_position = input->info()->fixed_point_position();

        // setup output window for the iterator
        Window window_out = window;
        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
        window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));

        // setup input window for the iterator
        Window window_in = window;
        // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

        Window window_k = calculate_max_window(*weights->info(), Steps(1u));

        Iterator out(output, window_out);
        Iterator in(input, window_in);
        Iterator k(weights, window_k);

        const uint8_t *k_ptr = k.ptr();

        execute_window_loop(window_out, [&](const Coordinates & id)
        {
            const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
            uint8_t *out_ptr = out.ptr();
            int ih = 0;
            int oh = 0;
            /*
             Each thread executing this kernel computes one or more planes of the output volume.

             Let's say the 3rd dimension of the output volume is 32: the first thread will compute the output for Z = [0,7], the second thread will compute the output for Z = [8,15],
             the third thread Z = [16,23] and the fourth thread Z = [24,31].

             The algorithm's outer loop iterates over Z, P, Y, X, where P is the depth/3rd dimension of each kernel. This order is not arbitrary: the main benefit of it
             is that we set up the NEON registers containing the kernel's values only once, and then compute each XY using the preloaded registers, as opposed to doing this for every XY value.

             The algorithm does not require allocating any additional memory and computes the results directly in-place in two stages:
             1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
             2) Convolve the remaining planes and accumulate the results in the output plane which was initialized in step 1.
             */
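            // In effect, for each output plane z computed by this thread:
            //   out(x, y, z) = sum over p in [0, kernel_depth) of
            //                  conv3x3(in(., ., p), kernel(., ., p, z))(x, y)
            // Step 1 performs the p == 0 term with a plain store (store_results);
            // step 2 adds the remaining terms with read-modify-write stores
            // (accumulate_results).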
            for(int oz = 0; oz < num_planes_z; ++oz)
            {
                const int zoffset = id.z() + oz;
                uint8_t *p_out_base = out_ptr + oz * output_stride_z;
                // Step 1
                {
                    const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto vk_r0 = load_matrix_row(ptr_k_r0);
                    const auto vk_r1 = load_matrix_row(ptr_k_r1);
                    const auto vk_r2 = load_matrix_row(ptr_k_r2);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
                        auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
                        auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
                        auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                            in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
                        {
                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
                            store_results<stridex>(p_out, vres);
                        }
                    }
                }
                // Step 2
                for(int p = 1; p < kernel_depth; ++p)
                {
                    const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
                    const uint8_t *input_base = input_ptr + p * input_stride_z;
                    const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base);
                    const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);
                    const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);
                    const auto vk_r0 = load_matrix_row(ptr_k_r0);
                    const auto vk_r1 = load_matrix_row(ptr_k_r1);
                    const auto vk_r2 = load_matrix_row(ptr_k_r2);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y);
                        auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y);
                        auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y);
                        auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                            in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
                        {
                            auto vres = convolve_3x3<stridex>(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position);
                            accumulate_results<stridex>(p_out, vres);
                        }
                    }
                }
            }
        },
        in, out);
    }
};

template <typename T1, typename T2, unsigned int stridex>
class convolver_5x5
{
public:
    static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
    {
        ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
        const int input_stride_x = input->info()->strides_in_bytes().x();
        const int input_stride_y = input->info()->strides_in_bytes().y();
        const int input_stride_z = input->info()->strides_in_bytes().z();
        const int output_stride_y = output->info()->strides_in_bytes().y();
        const int output_stride_z = output->info()->strides_in_bytes().z();
        const int kernel_stride_x = weights->info()->strides_in_bytes().x();
        const int kernel_stride_y = weights->info()->strides_in_bytes().y();
        const int kernel_stride_z = weights->info()->strides_in_bytes().z();
        const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
        const int output_w = output->info()->dimension(0);
        const int output_h = output->info()->dimension(1);
        const int num_planes_z = window.z().end() - window.z().start();
        const int delta_input = get_input_num_elems_processed<stridex>(num_elems_written_per_iteration);
        const int kernel_depth = weights->info()->dimension(Window::DimZ);
        const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
        const unsigned int conv_pad_x = std::get<0>(conv_info.pad());
        const unsigned int conv_pad_y = std::get<1>(conv_info.pad());
        const int fixed_point_position = input->info()->fixed_point_position();

        // setup output window for the iterator
        Window window_out = window;
        window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));
        window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));
        window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));

        // setup input window for the iterator
        Window window_in = window;
        // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
        window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
        window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

        Window window_k = calculate_max_window(*weights->info(), Steps(1u));

        Iterator out(output, window_out);
        Iterator in(input, window_in);
        Iterator k(weights, window_k);

        const uint8_t *k_ptr = k.ptr();

        execute_window_loop(window_out, [&](const Coordinates & id)
        {
            const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y;
            uint8_t *out_ptr = out.ptr();
            int ih = 0;
            int oh = 0;
            for(int oz = 0; oz < num_planes_z; ++oz)
            {
                const int zoffset = id.z() + oz;
                uint8_t *p_out_base = out_ptr + oz * output_stride_z;
                // Step 1
                {
                    const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        auto in_0 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
                        auto in_1 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
                        auto in_2 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
                        auto in_3 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);
                        auto in_4 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);
                        auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                            in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
                        {
                            auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
                            store_results<stridex>(p_out, vres);
                        }
                    }
                }
                // Step 2
                for(int p = 1; p < kernel_depth; ++p)
                {
                    const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
                    const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);

                    for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
                    {
                        auto in_0 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
                        auto in_1 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
                        auto in_2 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
                        auto in_3 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);
                        auto in_4 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);
                        auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
                        for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
                            in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
                        {
                            auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4, fixed_point_position);
                            accumulate_results<stridex>(p_out, vres);
                        }
                    }
                }
            }
        },
        in, out);
    }
};

template <typename T1, typename T2>
inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    switch(conv_stride_x)
    {
        case 1:
            convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 2:
            convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 3:
            convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        default:
            ARM_COMPUTE_ERROR("Not implemented");
    }
}

template <>
inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                                       const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    if(run_optim_small_tensor(input))
    {
        switch(conv_stride_x)
        {
            case 1:
                convolver_w1x1_i8x8_f32<1>::convolve(window, input, weights, output, conv_info);
                break;
            case 2:
                convolver_w1x1_i8x8_f32<2>::convolve(window, input, weights, output, conv_info);
                break;
            case 3:
                convolver_w1x1_i8x8_f32<3>::convolve(window, input, weights, output, conv_info);
                break;
            default:
                ARM_COMPUTE_ERROR("Not implemented");
        }
    }
    else
    {
        switch(conv_stride_x)
        {
            case 1:
                convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
                break;
            case 2:
                convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
                break;
            case 3:
                convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
                break;
            default:
                ARM_COMPUTE_ERROR("Not implemented");
        }
    }
}

template <typename T1, typename T2>
inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    switch(conv_stride_x)
    {
        case 1:
            convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 2:
            convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 3:
            convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        default:
            ARM_COMPUTE_ERROR("Not implemented");
    }
}

template <typename T1, typename T2>
inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
                         const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    switch(conv_stride_x)
    {
        case 1:
            convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 2:
            convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        case 3:
            convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
            break;
        default:
            ARM_COMPUTE_ERROR("Not implemented");
    }
}

inline TensorShape get_convolved_dimensions(const ITensorInfo *input, const ITensorInfo *weights, const int kernel_size, const PadStrideInfo &conv_info)
{
    unsigned int output_width = 0;
    unsigned int output_height = 0;
    std::tie(output_width, output_height) = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_size, kernel_size, conv_info);

    TensorShape output_shape = input->tensor_shape();
    output_shape.set(0, output_width);
    output_shape.set(1, output_height);
    output_shape.set(2, weights->dimension(3));

    return output_shape;
}
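
// Output-size arithmetic: scaled_dimensions applies the standard convolution
// formula; assuming its default FLOOR rounding (an assumption about the
// helper, not verified here) this amounts to
//   out = (in + pad_left + pad_right - kernel) / stride + 1
// e.g. a 7x7 input with a 3x3 kernel, stride 1 and pad 1 keeps 7x7:
//   (7 + 1 + 1 - 3) / 1 + 1 = 7
// Dimension 2 then becomes the number of kernels, weights->dimension(3).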

Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())),
                                    "Pad > 0 not supported for 1x1 weights");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1),
                                    "Pad > 1 not supported for 3x3 weights");
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) == 5 && (std::get<0>(conv_info.pad()) > 2 || std::get<1>(conv_info.pad()) > 2),
                                    "Pad > 2 not supported for 5x5 weights");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) != input->dimension(2));
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->dimension(1));
    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);

    // Checks performed when output is configured
    if(output->total_size() != 0)
    {
        TensorShape output_shape = get_convolved_dimensions(input, weights, weights->dimension(0), conv_info);

        DataType data_type = input->data_type();
        if(is_data_type_fixed_point(data_type))
        {
            // Promote data type in case of fixed point
            data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
        }

        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
        ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type);
    }

    return Status{};
}

std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
                                                        unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
{
    // Calculate right and bottom border
    unsigned int kernel_size = weights->dimension(0);
    const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
    const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
    const int input_width = input->dimension(0);
    const int input_height = input->dimension(1);

    switch(kernel_size)
    {
        case 1:
        {
            switch(input->data_type())
            {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                case DataType::QS8:
                case DataType::QS16:
                    num_elems_written_per_iteration = 8;
                    break;
                case DataType::F32:
                    if(run_optim_small_tensor_info(input))
                    {
                        num_elems_written_per_iteration = 8;
                    }
                    else
                    {
                        num_elems_written_per_iteration = 4;
                    }
                    break;
                default:
                    ARM_COMPUTE_ERROR("Data type not supported.");
                    break;
            }
            num_weight_elems_read_per_row = kernel_size;
            num_elems_read_per_iteration  = conv_stride_x * num_elems_written_per_iteration;
            break;
        }
        case 3:
        case 5:
        {
            switch(input->data_type())
            {
                case DataType::F32:
                    num_weight_elems_read_per_row   = 4 + kernel_size - 1;
                    num_elems_read_per_iteration    = 12;
                    num_elems_written_per_iteration = 16 >> conv_stride_x;
                    break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                case DataType::QS8:
                case DataType::QS16:
                    num_weight_elems_read_per_row   = 8 + kernel_size - 1;
                    num_elems_read_per_iteration    = 24;
                    num_elems_written_per_iteration = 32 >> conv_stride_x;
                    break;
                default:
                    ARM_COMPUTE_ERROR("Data type not supported.");
                    break;
            }
        }
        break;
        default:
        {
            ARM_COMPUTE_ERROR("Not implemented");
            break;
        }
    }

    // Calculate border
    int upper_bound_w = ceil_to_multiple(((output->dimension(0) - 1) * conv_stride_x + kernel_size), num_elems_read_per_iteration) - conv_info.pad_left() - conv_info.pad_right() - input_width;
    int upper_bound_h = ((output->dimension(1) - 1) * conv_stride_y - conv_info.pad_top() - conv_info.pad_bottom() + kernel_size) - input_height;

    const unsigned int conv_pad_left   = std::max(upper_bound_w - static_cast<int>(conv_info.pad_right()), static_cast<int>(kernel_size) / 2);
    const unsigned int conv_pad_top    = std::max(upper_bound_h - static_cast<int>(conv_info.pad_bottom()), static_cast<int>(kernel_size) / 2);
    const unsigned int conv_pad_right  = std::max(upper_bound_w - static_cast<int>(conv_info.pad_left()), static_cast<int>(kernel_size) / 2);
    const unsigned int conv_pad_bottom = std::max(upper_bound_h - static_cast<int>(conv_info.pad_top()), static_cast<int>(kernel_size) / 2);

    border_size.right  = conv_pad_right;
    border_size.bottom = conv_pad_bottom;
    border_size.left   = conv_pad_left;
    border_size.top    = conv_pad_top;

    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));
    AccessWindowStatic input_access(input, -conv_pad_left, -conv_pad_top, input_width + conv_pad_right, input_height + conv_pad_bottom);
    AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
    bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}
} // namespace

NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
    : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0),
      _num_elems_written_per_iteration(0)
{
}

BorderSize NEDirectConvolutionLayerKernel::border_size() const
{
    return _border_size;
}

void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

    _input       = input;
    _weights     = weights;
    _output      = output;
    _conv_info   = conv_info;
    _kernel_size = weights->info()->dimension(0);

    const unsigned int conv_pad_left   = conv_info.pad_left();
    const unsigned int conv_pad_top    = conv_info.pad_top();
    const unsigned int conv_pad_right  = conv_info.pad_right();
    const unsigned int conv_pad_bottom = conv_info.pad_bottom();
    _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);

    // Get convolved dimensions
    TensorShape output_shape = get_convolved_dimensions(input->info(), weights->info(), _kernel_size, conv_info);

    DataType data_type = input->info()->data_type();

    if(is_data_type_fixed_point(data_type))
    {
        // Promote data type in case of fixed point
        data_type = ((data_type == DataType::QS8) ? DataType::QS16 : DataType::QS32);
    }

    // Output auto-initialization if not yet initialized
    auto_init_if_empty(*output->info(), output_shape, 1, data_type, input->info()->fixed_point_position());

    // Perform validation step
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info));

    // Configure kernel window
    auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, _num_weight_elems_read_per_row,
                                                    _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
    INEKernel::configure(win_config.second);
}
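
// Typical call sequence for this kernel (an illustrative sketch only; tensor
// allocation/filling is omitted, and the scheduling call assumes the usual
// NEScheduler API rather than anything specific to this file):
//   NEDirectConvolutionLayerKernel kernel;
//   kernel.configure(&src, &weights, &dst, PadStrideInfo(1, 1, 1, 1)); // stride 1, pad 1
//   NEScheduler::get().schedule(&kernel, Window::DimZ);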

Status NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    unsigned int num_weight_elems_read_per_row   = 0;
    unsigned int num_elems_read_per_iteration    = 0;
    unsigned int num_elems_written_per_iteration = 0;
    BorderSize   border_size(conv_info.pad().first, conv_info.pad().second);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info));
    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
                                                              weights->clone().get(),
                                                              output->clone().get(),
                                                              conv_info,
                                                              num_weight_elems_read_per_row,
                                                              num_elems_read_per_iteration,
                                                              num_elems_written_per_iteration,
                                                              border_size)
                                .first);

    return Status{};
}

void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

    const int kernel_size = _weights->info()->dimension(0);

    switch(kernel_size)
    {
        case 1:
        {
            switch(_input->info()->data_type())
            {
                case DataType::QS8:
                    convolve_1x1<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
                case DataType::QS16:
                    convolve_1x1<qint16_t, qint32_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
                case DataType::F32:
                    convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                    convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                default:
                    ARM_COMPUTE_ERROR("Data type not supported");
                    break;
            }
            break;
        }
        case 3:
        {
            switch(_input->info()->data_type())
            {
                case DataType::QS8:
                    convolve_3x3<qint8_t, qint16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
                case DataType::F32:
                    convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                    convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                default:
                    ARM_COMPUTE_ERROR("Data type not supported");
                    break;
            }
            break;
        }
        case 5:
        {
            switch(_input->info()->data_type())
            {
                case DataType::F32:
                    convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
                    break;
                default:
                    ARM_COMPUTE_ERROR("Data type not supported");
                    break;
            }
            break;
        }

        default:
        {
            ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
            break;
        }
    }
}