Blame - src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp - ml/ComputeLibrary

2018-07-04 09:34:00 +0100

[diff] [blame]

242

const int input_stride_x = input->info()->strides_in_bytes().x();

243

const int input_stride_y = input->info()->strides_in_bytes().y();

244

const int input_stride_z = input->info()->strides_in_bytes().z();

245

const int output_stride_y = output->info()->strides_in_bytes().y();

246

const int output_stride_z = output->info()->strides_in_bytes().z();

247

const int kernel_stride_z = weights->info()->strides_in_bytes().z();

248

const int kernel_stride_w = weights->info()->strides_in_bytes()[3];

249

const int output_w = output->info()->dimension(0);

250

const int output_h = output->info()->dimension(1);

251

const int range_z = window.z().end() - window.z().start();

252

const int kernel_depth = weights->info()->dimension(Window::DimZ);

253

const unsigned int conv_stride_y = std::get<1>(conv_info.stride());

254

const unsigned int conv_pad_left = conv_info.pad_left();

255

const unsigned int conv_pad_top = conv_info.pad_top();

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

256

257

// setup output window for the iterator

258

Window window_out = window;

259

window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));

260

window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));

261

window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));

262

263

// setup input window for the iterator

264

Window window_in = window;

265

// we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0

266

window_in.set(Window::DimX, Window::Dimension(0, 0, 0));

267

window_in.set(Window::DimY, Window::Dimension(0, 0, 0));

268

window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

269

Pablo Tello

f87cc7f

2017-07-26 10:28:40 +0100

[diff] [blame]

270

Window window_k = calculate_max_window(*weights->info(), Steps(1u));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

271

Iterator out(output, window_out);

272

Iterator in(input, window_in);

273

Iterator k(weights, window_k);

274

275

const uint8_t *k_ptr = k.ptr();

276

277

execute_window_loop(window_out, [&](const Coordinates & id)

278

{

279

/*

280

For a detailed explanation on how the algorithm works refer to template <> class convolver_3x3<1>

281

*/

Georgios Pinitas

2018-02-19 13:58:22 +0000

[diff] [blame]

282

const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

283

uint8_t *out_ptr = out.ptr();

284

int ih = 0;

285

int oh = 0;

286

for(int oz = 0; oz < range_z; ++oz)

287

{

288

auto p_out_base = out_ptr + oz * output_stride_z;

289

// Step 1

290

{

291

const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);

292

const auto vk = internal_vdupq_n(*k_val);

293

for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)

294

{

295

const int offset_xy = ih * input_stride_y;

296

auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));

297

auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);

298

for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)

299

{

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

300

internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val)));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

301

}

302

}

303

}

Pablo Tello

c09314a

2017-09-21 13:59:14 +0100

[diff] [blame]

304

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

305

// Step 2

306

for(int p = 1; p < kernel_depth; ++p)

307

{

308

const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);

309

const auto vk = internal_vdupq_n(*k_val);

310

for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)

311

{

312

const int offset_xy = ih * input_stride_y;

313

auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);

314

auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);

315

for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)

316

{

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

317

internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val)));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

}

}

}

}

},

in, out);

}

};

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

327

template <unsigned int stridex>

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

328

float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

329

const float *m0, const float *m1, const float *m2, const float *m3, const float *m4);

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

330

331

/** Broadcasts three kernel weights, each into all lanes of its own vector.
 *
 * @param[in] m0 Pointer to the weight broadcast into val[0].
 * @param[in] m1 Pointer to the weight broadcast into val[1].
 * @param[in] m2 Pointer to the weight broadcast into val[2].
 *
 * @return The three broadcast vectors.
 */
inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
{
    float32x4x3_t taps;
    taps.val[0] = vld1q_dup_f32(m0);
    taps.val[1] = vld1q_dup_f32(m1);
    taps.val[2] = vld1q_dup_f32(m2);
    return taps;
}

/** Broadcasts two kernel weights, each into all lanes of its own vector.
 *
 * @param[in] m3 Pointer to the weight broadcast into val[0].
 * @param[in] m4 Pointer to the weight broadcast into val[1].
 *
 * @return The two broadcast vectors.
 */
inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4)
{
    float32x4x2_t taps;
    taps.val[0] = vld1q_dup_f32(m3);
    taps.val[1] = vld1q_dup_f32(m4);
    return taps;
}

/** Loads twelve consecutive floats as three 4-lane vectors.
 *
 * @param[in] in Pointer to the first of the twelve values; in[0..11] are read.
 *
 * @return val[0] = in[0..3], val[1] = in[4..7], val[2] = in[8..11].
 */
inline float32x4x3_t load_input(const float *const in)
{
    float32x4x3_t row;
    row.val[0] = vld1q_f32(in);
    row.val[1] = vld1q_f32(in + 4);
    row.val[2] = vld1q_f32(in + 8);
    return row;
}

/** 5x5 convolution producing eight horizontally adjacent results.
 *
 * Twelve values are read from each input row and five weights from each
 * kernel row. Lane i of val[0] holds the result for window offset i and lane i
 * of val[1] the result for window offset i + 4, where the result at offset x is
 * sum over r, k in [0,4] of in_r[x + k] * m_r[k].
 *
 * @param[in] in_0 Pointer to input row 0 (12 floats are read).
 * @param[in] in_1 Pointer to input row 1 (12 floats are read).
 * @param[in] in_2 Pointer to input row 2 (12 floats are read).
 * @param[in] in_3 Pointer to input row 3 (12 floats are read).
 * @param[in] in_4 Pointer to input row 4 (12 floats are read).
 * @param[in] m0   Pointer to the five weights of kernel row 0.
 * @param[in] m1   Pointer to the five weights of kernel row 1.
 * @param[in] m2   Pointer to the five weights of kernel row 2.
 * @param[in] m3   Pointer to the five weights of kernel row 3.
 * @param[in] m4   Pointer to the five weights of kernel row 4.
 *
 * @return The eight accumulated results.
 */
template <>
inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
{
    // Input rows: three vectors (12 values) each
    const float32x4x3_t row0 = load_input(in_0);
    const float32x4x3_t row1 = load_input(in_1);
    const float32x4x3_t row2 = load_input(in_2);
    const float32x4x3_t row3 = load_input(in_3);
    const float32x4x3_t row4 = load_input(in_4);

    // Kernel rows: weights 0..2 in the "hi" triple, weights 3..4 in the "lo" pair
    const float32x4x3_t k0_hi = load_matrix_hi(m0, 1 + m0, 2 + m0);
    const float32x4x2_t k0_lo = load_matrix_lo(3 + m0, 4 + m0);
    const float32x4x3_t k1_hi = load_matrix_hi(m1, 1 + m1, 2 + m1);
    const float32x4x2_t k1_lo = load_matrix_lo(3 + m1, 4 + m1);
    const float32x4x3_t k2_hi = load_matrix_hi(m2, 1 + m2, 2 + m2);
    const float32x4x2_t k2_lo = load_matrix_lo(3 + m2, 4 + m2);
    const float32x4x3_t k3_hi = load_matrix_hi(m3, 1 + m3, 2 + m3);
    const float32x4x2_t k3_lo = load_matrix_lo(3 + m3, 4 + m3);
    const float32x4x3_t k4_hi = load_matrix_hi(m4, 1 + m4, 2 + m4);
    const float32x4x2_t k4_lo = load_matrix_lo(3 + m4, 4 + m4);

    // Adds the contribution of weights 1..4 of one kernel row to acc.
    // lo/hi are two consecutive input vectors; vextq_f32 builds the shifted windows.
    const auto add_taps_1_to_4 = [](float32x4_t acc, const float32x4_t &lo, const float32x4_t &hi,
                                    const float32x4x3_t &k_hi, const float32x4x2_t &k_lo) -> float32x4_t
    {
        acc = vmlaq_f32(acc, vextq_f32(lo, hi, 1), k_hi.val[1]);
        acc = vmlaq_f32(acc, vextq_f32(lo, hi, 2), k_hi.val[2]);
        acc = vmlaq_f32(acc, vextq_f32(lo, hi, 3), k_lo.val[0]);
        return vmlaq_f32(acc, hi, k_lo.val[1]);
    };

    // Adds the contribution of all five weights of one kernel row to acc
    const auto add_row = [&add_taps_1_to_4](float32x4_t acc, const float32x4_t &lo, const float32x4_t &hi,
                                            const float32x4x3_t &k_hi, const float32x4x2_t &k_lo) -> float32x4_t
    {
        return add_taps_1_to_4(vmlaq_f32(acc, lo, k_hi.val[0]), lo, hi, k_hi, k_lo);
    };

    float32x4x2_t out;

    // Results 0..3: the first term is a plain multiply, everything else accumulates onto it
    out.val[0] = add_taps_1_to_4(vmulq_f32(row0.val[0], k0_hi.val[0]), row0.val[0], row0.val[1], k0_hi, k0_lo);
    out.val[0] = add_row(out.val[0], row1.val[0], row1.val[1], k1_hi, k1_lo);
    out.val[0] = add_row(out.val[0], row2.val[0], row2.val[1], k2_hi, k2_lo);
    out.val[0] = add_row(out.val[0], row3.val[0], row3.val[1], k3_hi, k3_lo);
    out.val[0] = add_row(out.val[0], row4.val[0], row4.val[1], k4_hi, k4_lo);

    // Results 4..7: same scheme, shifted by one input vector
    out.val[1] = add_taps_1_to_4(vmulq_f32(row0.val[1], k0_hi.val[0]), row0.val[1], row0.val[2], k0_hi, k0_lo);
    out.val[1] = add_row(out.val[1], row1.val[1], row1.val[2], k1_hi, k1_lo);
    out.val[1] = add_row(out.val[1], row2.val[1], row2.val[2], k2_hi, k2_lo);
    out.val[1] = add_row(out.val[1], row3.val[1], row3.val[2], k3_hi, k3_lo);
    out.val[1] = add_row(out.val[1], row4.val[1], row4.val[2], k4_hi, k4_lo);

    return out;
}

/** 5x5 convolution for a horizontal stride of 2.
 *
 * Runs the stride-1 convolution and then gathers every second result into
 * val[0], so that val[0] = { r0, r2, r4, r6 } where r0..r7 are the eight
 * stride-1 results. val[1] is returned exactly as the stride-1 version produced it.
 *
 * @param[in] in_0 Pointer to input row 0.
 * @param[in] in_1 Pointer to input row 1.
 * @param[in] in_2 Pointer to input row 2.
 * @param[in] in_3 Pointer to input row 3.
 * @param[in] in_4 Pointer to input row 4.
 * @param[in] m0   Pointer to kernel row 0.
 * @param[in] m1   Pointer to kernel row 1.
 * @param[in] m2   Pointer to kernel row 2.
 * @param[in] m3   Pointer to kernel row 3.
 * @param[in] m4   Pointer to kernel row 4.
 *
 * @return The repacked results.
 */
template <>
inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
{
    float32x4x2_t res = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);

    // Pick the even-indexed stride-1 results (lane 0 of val[0] is already in place)
    const float r2 = vgetq_lane_f32(res.val[0], 2);
    const float r4 = vgetq_lane_f32(res.val[1], 0);
    const float r6 = vgetq_lane_f32(res.val[1], 2);

    res.val[0] = vsetq_lane_f32(r2, res.val[0], 1);
    res.val[0] = vsetq_lane_f32(r4, res.val[0], 2);
    res.val[0] = vsetq_lane_f32(r6, res.val[0], 3);

    return res;
}

/** 5x5 convolution for a horizontal stride of 3.
 *
 * Runs the stride-1 convolution and copies the result in lane 3 of val[0]
 * into lane 1, so lanes 0 and 1 of val[0] hold stride-1 results 0 and 3.
 * All other lanes are returned as the stride-1 version produced them.
 *
 * @param[in] in_0 Pointer to input row 0.
 * @param[in] in_1 Pointer to input row 1.
 * @param[in] in_2 Pointer to input row 2.
 * @param[in] in_3 Pointer to input row 3.
 * @param[in] in_4 Pointer to input row 4.
 * @param[in] m0   Pointer to kernel row 0.
 * @param[in] m1   Pointer to kernel row 1.
 * @param[in] m2   Pointer to kernel row 2.
 * @param[in] m3   Pointer to kernel row 3.
 * @param[in] m4   Pointer to kernel row 4.
 *
 * @return The repacked results.
 */
template <>
inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
{
    float32x4x2_t res = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);

    const float r3 = vgetq_lane_f32(res.val[0], 3);
    res.val[0]     = vsetq_lane_f32(r3, res.val[0], 1);

    return res;
}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

478

template <typename T1, typename T2, unsigned int stridex>

class convolver_3x3

{

public:

static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,

483

const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)

484

{

485

ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

486

const int input_stride_x = input->info()->strides_in_bytes().x();

487

const int input_stride_y = input->info()->strides_in_bytes().y();

488

const int input_stride_z = input->info()->strides_in_bytes().z();

489

const int output_stride_y = output->info()->strides_in_bytes().y();

490

const int output_stride_z = output->info()->strides_in_bytes().z();

491

const int kernel_stride_x = weights->info()->strides_in_bytes().x();

492

const int kernel_stride_y = weights->info()->strides_in_bytes().y();

493

const int kernel_stride_z = weights->info()->strides_in_bytes().z();

494

const int kernel_stride_w = weights->info()->strides_in_bytes()[3];

495

const int output_w = output->info()->dimension(0);

496

const int output_h = output->info()->dimension(1);

497

const int num_planes_z = window.z().end() - window.z().start();

Michele Di Giorgio

13ec5f0

2020-01-02 12:11:13 +0000

[diff] [blame]

498

const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

499

const int kernel_depth = weights->info()->dimension(Window::DimZ);

500

const unsigned int conv_stride_y = std::get<1>(conv_info.stride());

501

const unsigned int conv_pad_left = conv_info.pad_left();

502

const unsigned int conv_pad_top = conv_info.pad_top();

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

503

504

// setup output window for the iterator

505

Window window_out = window;

506

window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));

507

window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));

508

window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));

509

510

// setup input window for the iterator

511

Window window_in = window;

512

// we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0

513

window_in.set(Window::DimX, Window::Dimension(0, 0, 0));

514

window_in.set(Window::DimY, Window::Dimension(0, 0, 0));

515

window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

516

517

Window window_k = calculate_max_window(*weights->info(), Steps(1u));

518

519

Iterator out(output, window_out);

520

Iterator in(input, window_in);

521

Iterator k(weights, window_k);

522

523

const uint8_t *k_ptr = k.ptr();

524

525

execute_window_loop(window_out, [&](const Coordinates & id)

526

{

Georgios Pinitas

2018-02-19 13:58:22 +0000

[diff] [blame]

527

const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

528

uint8_t *out_ptr = out.ptr();

int ih = 0;

int oh = 0;

/*

Each thread executing this kernel computes one or more output's volume planes.

533

534

Let's say the 3rd dimension of the output volume is 32, the first thread will compute the output for Z = [0,7], the second thread will compute the output for Z = [8,15],

535

the third thread [16,24] and the fourth thread [25,31].

536

537

The algorithm outer loop iterates over Z, P, Y, X where P is the depth/3rd dimension of each kernel. This order is not arbitrary, the main benefit of this

Anthony Barbier

e500747

2017-10-27 15:01:44 +0100

[diff] [blame]

538

is that we setup the neon registers containing the kernel's values only once and then compute each XY using the preloaded registers as opposed as doing this for every XY value.

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

539

540

The algorithm does not require allocating any additional memory amd computes the results directly in-place in two stages:

541

1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.

542

2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1.

543

*/

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

544

for(int oz = 0; oz < num_planes_z; ++oz)

545

{

Pablo Tello

0d17614

2017-07-06 16:43:14 +0100

[diff] [blame]

546

const int zoffset = id.z() + oz;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

547

uint8_t *p_out_base = out_ptr + oz * output_stride_z;

548

// Step 1

549

{

Pablo Tello

0d17614

2017-07-06 16:43:14 +0100

[diff] [blame]

550

const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);

551

const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);

552

const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

553

const auto vk_r0 = load_matrix_row(ptr_k_r0);

554

const auto vk_r1 = load_matrix_row(ptr_k_r1);

555

const auto vk_r2 = load_matrix_row(ptr_k_r2);

556

for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)

557

{

558

auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);

559

auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);

560

auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);

561

auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);

562

for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,

563

in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)

564

{

Georgios Pinitas

a26e166

2020-03-04 15:31:25 +0000

[diff] [blame]

565

convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

}

}

}

// Step 2

for(int p = 1; p < kernel_depth; ++p)

571

{

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

572

const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;

573

const uint8_t *input_base = input_ptr + p * input_stride_z;

574

const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base);

575

const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);

576

const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);

577

const auto vk_r0 = load_matrix_row(ptr_k_r0);

578

const auto vk_r1 = load_matrix_row(ptr_k_r1);

579

const auto vk_r2 = load_matrix_row(ptr_k_r2);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

580

for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)

581

{

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

582

auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y);

583

auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y);

584

auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

585

auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);

586

for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,

587

in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)

588

{

Georgios Pinitas

a26e166

2020-03-04 15:31:25 +0000

[diff] [blame]

589

convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

}

}

}

}

},

in, out);

}

};

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

599

template <typename T1, typename T2, unsigned int stridex>

class convolver_5x5

{

public:

static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,

604

const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)

605

{

606

ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

607

const int input_stride_x = input->info()->strides_in_bytes().x();

608

const int input_stride_y = input->info()->strides_in_bytes().y();

609

const int input_stride_z = input->info()->strides_in_bytes().z();

610

const int output_stride_y = output->info()->strides_in_bytes().y();

611

const int output_stride_z = output->info()->strides_in_bytes().z();

612

const int kernel_stride_x = weights->info()->strides_in_bytes().x();

613

const int kernel_stride_y = weights->info()->strides_in_bytes().y();

614

const int kernel_stride_z = weights->info()->strides_in_bytes().z();

615

const int kernel_stride_w = weights->info()->strides_in_bytes()[3];

616

const int output_w = output->info()->dimension(0);

617

const int output_h = output->info()->dimension(1);

618

const int num_planes_z = window.z().end() - window.z().start();

Michele Di Giorgio

13ec5f0

2020-01-02 12:11:13 +0000

[diff] [blame]

619

const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

620

const int kernel_depth = weights->info()->dimension(Window::DimZ);

621

const unsigned int conv_stride_y = std::get<1>(conv_info.stride());

622

const unsigned int conv_pad_left = conv_info.pad_left();

623

const unsigned int conv_pad_top = conv_info.pad_top();

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

624

625

// setup output window for the iterator

626

Window window_out = window;

627

window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX)));

628

window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY)));

629

window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));

630

631

// setup input window for the iterator

632

Window window_in = window;

633

// we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0

634

window_in.set(Window::DimX, Window::Dimension(0, 0, 0));

635

window_in.set(Window::DimY, Window::Dimension(0, 0, 0));

636

window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));

637

638

Window window_k = calculate_max_window(*weights->info(), Steps(1u));

639

640

Iterator out(output, window_out);

641

Iterator in(input, window_in);

642

Iterator k(weights, window_k);

643

644

const uint8_t *k_ptr = k.ptr();

645

646

execute_window_loop(window_out, [&](const Coordinates & id)

647

{

Georgios Pinitas

2018-02-19 13:58:22 +0000

[diff] [blame]

648

const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

649

uint8_t *out_ptr = out.ptr();

650

int ih = 0;

651

int oh = 0;

652

for(int oz = 0; oz < num_planes_z; ++oz)

653

{

654

const int zoffset = id.z() + oz;

655

uint8_t *p_out_base = out_ptr + oz * output_stride_z;

656

// Step 1

657

{

658

const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);

659

const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);

660

const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);

661

const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);

662

const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);

663

for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)

664

{

665

auto in_0 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);

666

auto in_1 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);

667

auto in_2 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);

668

auto in_3 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);

669

auto in_4 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);

670

auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);

671

for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,

672

in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)

673

{

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

674

auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

675

store_results<stridex>(p_out, vres);

}

}

}

// Step 2

for(int p = 1; p < kernel_depth; ++p)

681

{

682

const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);

683

const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);

684

const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);

685

const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);

686

const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);

687

688

for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)

689

{

690

auto in_0 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);

691

auto in_1 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);

692

auto in_2 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);

693

auto in_3 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);

694

auto in_4 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);

695

auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);

696

for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,

697

in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)

698

{

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

699

auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

700

accumulate_results<stridex>(p_out, vres);

}

}

}

}

},

in, out);

}

};

Gian Marco Iodice

2019-06-13 15:58:32 +0100

[diff] [blame]

710

float vreduce(const float32x4_t &v)

711

{

712

auto v0 = wrapper::vgethigh(v);

713

auto v1 = wrapper::vgetlow(v);

714

auto v_out = wrapper::vadd(v0, v1);

715

716

float a = wrapper::vgetlane(v_out, 0);

717

float b = wrapper::vgetlane(v_out, 1);

return a + b;

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

721

template <typename T1, typename T2>

722

inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,

723

const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)

724

{

725

const unsigned int conv_stride_x = std::get<0>(conv_info.stride());

726

switch(conv_stride_x)

727

{

728

case 1:

729

convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

730

break;

731

case 2:

732

convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

733

break;

734

case 3:

735

convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

736

break;

737

default:

738

ARM_COMPUTE_ERROR("Not implemented");

}

}

Pablo Tello

2017-09-21 13:59:14 +0100

[diff] [blame]

742

template <>

743

inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,

744

const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)

745

{

746

const unsigned int conv_stride_x = std::get<0>(conv_info.stride());

747

if(run_optim_small_tensor(input))

748

{

749

switch(conv_stride_x)

750

{

751

case 1:

752

convolver_w1x1_i8x8_f32<1>::convolve(window, input, weights, output, conv_info);

753

break;

754

case 2:

755

convolver_w1x1_i8x8_f32<2>::convolve(window, input, weights, output, conv_info);

756

break;

757

case 3:

758

convolver_w1x1_i8x8_f32<3>::convolve(window, input, weights, output, conv_info);

759

break;

760

default:

761

ARM_COMPUTE_ERROR("Not implemented");

}

}

else

{

switch(conv_stride_x)

767

{

768

case 1:

769

convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

770

break;

771

case 2:

772

convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

773

break;

774

case 3:

775

convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

776

break;

777

default:

778

ARM_COMPUTE_ERROR("Not implemented");

}

}

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

783

template <typename T1, typename T2>

784

inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,

785

const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)

786

{

787

const unsigned int conv_stride_x = std::get<0>(conv_info.stride());

788

switch(conv_stride_x)

789

{

790

case 1:

791

convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

792

break;

793

case 2:

794

convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

795

break;

796

case 3:

797

convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

798

break;

799

default:

800

ARM_COMPUTE_ERROR("Not implemented");

801

}

802

}

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

803

804

template <typename T1, typename T2>

805

inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,

806

const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)

807

{

808

const unsigned int conv_stride_x = std::get<0>(conv_info.stride());

809

switch(conv_stride_x)

810

{

811

case 1:

812

convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

813

break;

814

case 2:

815

convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

816

break;

817

case 3:

818

convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);

819

break;

820

default:

821

ARM_COMPUTE_ERROR("Not implemented");

}

}

Michalis Spyrou

2017-11-30 14:25:57 +0000

[diff] [blame]

825

/** Validate the tensor descriptors and convolution parameters for the direct convolution kernel.
 *
 * The checks run in this order and the first failing one is returned as an error Status:
 *  -# none of the tensor infos is null, and the input data layout is known;
 *  -# the input is a single-channel F16 or F32 tensor (F16 only where the CPU macro allows it)
 *     and the weights have the same data type;
 *  -# the horizontal stride is not larger than 3;
 *  -# weights and input agree on the channel dimension, the weights are square in width/height
 *     and have at most 4 dimensions;
 *  -# NHWC is only accepted with F32, and F16 is only accepted for kernel widths up to 3;
 *  -# if the output already has a non-zero total size, its shape must equal the shape computed by
 *     compute_deep_convolution_shape() and its data type must equal the input data type.
 *
 * @note The conditions passed to the error macros are kept as single expressions on purpose:
 *       the macros may embed the condition text in the reported message.
 *
 * @param[in] input     Source tensor info.
 * @param[in] weights   Weights tensor info.
 * @param[in] output    Destination tensor info; may still be unconfigured (total size 0).
 * @param[in] conv_info Padding and stride information.
 *
 * @return An empty Status on success, otherwise the Status produced by the first failing check.
 */
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    // Basic pointer, layout and data type requirements
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);

    // Resolve which tensor dimension holds width / height / channels for this layout
    const DataLayout data_layout = input->data_layout();
    const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
    const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);

    // Convolution geometry requirements
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != input->dimension(channel_idx));
    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
    ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (input->data_type() == DataType::F16));

    // An output with zero total size has not been configured yet: nothing more to compare
    if(output->total_size() == 0)
    {
        return Status{};
    }

    // The output is configured: it has to match the shape and type the kernel would produce
    TensorShape    output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);
    const DataType data_type    = input->data_type();

    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
    ARM_COMPUTE_RETURN_ERROR_ON(output->data_type() != data_type);

    return Status{};
}

/** Compute the execution window and, for NCHW, the per-iteration element counts and border.
 *
 * NHWC: the window is the maximum window over the output with default steps; none of the
 * reference output parameters is written and no padding is requested.
 *
 * NCHW: depending on the kernel width (1, 3 or 5) and the input data type the function fills
 * @p num_weight_elems_read_per_row, @p num_elems_read_per_iteration and
 * @p num_elems_written_per_iteration, derives the right border from how far the vectorised reads
 * run past the input width, writes @p border_size, and registers the input / weights / output
 * access windows through update_window_and_padding().
 * Unsupported kernel widths or data types raise ARM_COMPUTE_ERROR.
 *
 * @param[in,out] input                           Source tensor info.
 * @param[in,out] weights                         Weights tensor info.
 * @param[in,out] output                          Destination tensor info.
 * @param[in]     conv_info                       Padding and stride information.
 * @param[out]    num_weight_elems_read_per_row   Weight elements read per kernel row (NCHW only).
 * @param[out]    num_elems_read_per_iteration    Input elements read per iteration (NCHW only).
 * @param[out]    num_elems_written_per_iteration Output elements written per iteration (NCHW only).
 * @param[out]    border_size                     Border required around the input (NCHW only).
 *
 * @return A pair made of a Status ("Insufficient Padding!" runtime error when
 *         update_window_and_padding() reports a change, empty otherwise) and the window.
 */
std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
                                                        unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
{
    ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);

    const DataLayout data_layout = input->data_layout();
    const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);

    // Geometry shared by the computations below
    const unsigned int kernel_width = weights->dimension(width_idx);
    const int          stride_x     = std::get<0>(conv_info.stride());
    const int          stride_y     = std::get<1>(conv_info.stride());
    const int          input_width  = input->dimension(width_idx);

    if(data_layout != DataLayout::NCHW)
    {
        // NHWC: plain maximum window over the output, no padding is requested
        return std::make_pair(Status{}, calculate_max_window(*output, Steps()));
    }

    // NCHW: pick the per-iteration element counts for the kernel width / data type pair
    switch(kernel_width)
    {
        case 1:
        {
            switch(input->data_type())
            {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                    num_elems_written_per_iteration = 8;
                    break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                case DataType::F32:
                    num_elems_written_per_iteration = run_optim_small_tensor_info(input) ? 8 : 4;
                    break;
                default:
                    ARM_COMPUTE_ERROR("Data type not supported.");
                    break;
            }
            num_weight_elems_read_per_row = kernel_width;
            num_elems_read_per_iteration  = stride_x * num_elems_written_per_iteration;
            break;
        }
        case 3:
        {
            switch(input->data_type())
            {
                case DataType::F32:
                    num_weight_elems_read_per_row   = 4 + kernel_width - 1;
                    num_elems_read_per_iteration    = 12;
                    num_elems_written_per_iteration = 16 >> stride_x;
                    break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                    num_weight_elems_read_per_row   = 8 + kernel_width - 1;
                    num_elems_read_per_iteration    = 24;
                    num_elems_written_per_iteration = 32 >> stride_x;
                    break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                default:
                    ARM_COMPUTE_ERROR("Data type not supported.");
                    break;
            }
            break;
        }
        case 5:
        {
            switch(input->data_type())
            {
                case DataType::F32:
                    num_weight_elems_read_per_row   = 4 + kernel_width - 1;
                    num_elems_read_per_iteration    = 12;
                    num_elems_written_per_iteration = 16 >> stride_x;
                    break;
                default:
                    ARM_COMPUTE_ERROR("Data type not supported.");
                    break;
            }
            break;
        }
        default:
        {
            ARM_COMPUTE_ERROR("Not implemented");
            break;
        }
    }

    // Right pad: how far the rounded-up reads extend past the input width.
    // The mixed signed/unsigned expressions are intentionally left exactly as they were.
    const int start_x       = kernel_width / 2 - static_cast<int>(conv_info.pad_left());
    const int end_x         = ceil_to_multiple(static_cast<int>(output->dimension(0)), num_elems_written_per_iteration) * stride_x;
    const int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;

    // Border: left / top / bottom come from conv_info, right from the overshoot (never negative)
    const unsigned int pad_left = conv_info.pad_left();
    const unsigned int pad_top  = conv_info.pad_top();

    border_size.left   = pad_left;
    border_size.top    = pad_top;
    border_size.right  = std::max(upper_bound_w, 0);
    border_size.bottom = conv_info.pad_bottom();

    // Window over the output, stepping by the number of elements written per iteration
    Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration));

    AccessWindowRectangle input_access(input, -pad_left, -pad_top,
                                       num_elems_read_per_iteration, kernel_width,
                                       stride_x, stride_y);
    AccessWindowStatic     weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_width);
    AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);

    const bool window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
    output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));

    Status err = window_changed ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
    return std::make_pair(err, win);
}

Manuel Bottini

2020-09-15 13:03:34 +0100

[diff] [blame]

985

986

/** Tell whether neither tensor has any padding on the left or right side.
 *
 * @param[in] input   Source tensor info.
 * @param[in] weights Weights tensor info.
 *
 * @return true only if the left and right padding of both @p input and @p weights are zero.
 */
bool have_zero_x_internal_padding(ITensorInfo *input, ITensorInfo *weights)
{
    const auto no_left_right_padding = [](ITensorInfo *info)
    {
        const auto pad = info->padding();
        return pad.left == 0 && pad.right == 0;
    };

    return no_left_right_padding(input) && no_left_right_padding(weights);
}

990

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

991

} // namespace

992

Manuel Bottini

2020-09-15 13:03:34 +0100

[diff] [blame]

993

template <typename T>

994

void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(const Window &window)

995

{

996

// This function assumes that input and weights have not padding in channel

997

998

// Declare useful types

999

using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;

1000

using vector_type = typename vtype::type;

1001

using tag_type = typename vtype::tag_type;

1002

1003

// Scalar quantities

1004

const int element_size = _input->info()->element_size();

1005

const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size;

1006

const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size;

1007

const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size;

1008

const int input_dim_w = _input->info()->dimension(1);

1009

const int input_dim_h = _input->info()->dimension(2);

1010

1011

const int output_stride_c = _output->info()->strides_in_bytes().x();

1012

1013

const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size;

1014

const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size;

1015

const int kernel_dim_w = _weights->info()->dimension(1);

1016

const int kernel_dim_h = _weights->info()->dimension(2);

1017

1018

const int conv_pad_top = _conv_info.pad_top();

1019

const int conv_pad_left = _conv_info.pad_left();

1020

const int conv_stride_w = std::get<0>(_conv_info.stride());

1021

const int conv_stride_h = std::get<1>(_conv_info.stride());

1022

1023

// Setup input window for the output iterator

1024

Window window_out = window;

1025

window_out.set(Window::DimX, Window::Dimension(0, 1, 1));

1026

1027

// Setup input window for the weights iterator

1028

Window window_w = calculate_max_window(*_weights->info(), Steps());

1029

window_w.set(Window::DimX, Window::Dimension(0, 1, 1));

1030

window_w.set(Window::DimY, Window::Dimension(0, 1, 1));

1031

window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));

1032

1033

Iterator out(_output, window_out);

1034

Iterator wei(_weights, window_w);

1035

1036

constexpr int num_elems_read_per_iteration = 16 / sizeof(T);

1037

/*

1038

* This implementation parallelize the full WC plane of input and weights by

1039

* treating them as series of elements. So for example, a 3x3 weights and

1040

* floating point vector operations of 4 elements per time, the first 3

1041

* channel elements of the first row would be taken and additionally the first

1042

* element of the second row. The 9 elements in each single WC weight plane

1043

* would require 2 4-element vector operations and a last single element operation.

1044

*

1045

* This works since when we create the input vector to multiply with the weights,

1046

* the exact required elements are loaded in the same order. Therefore the

1047

* multiplication works on the correct input/weight elements.

1048

*/

1049

execute_window_loop(window_out, [&](const Coordinates & id)

1050

{

1051

/*

1052

* In here we create theoretical indexes which then we validate for both

1053

* inputs and weights.

1054

* As a reminder, this loop take each output point in NHW, C is treated

1055

* in the weights loop.

1056

*/

1057

// We are computing the theoretical starting input starting points

1058

const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;

1059

const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;

1060

const int in_w_end_t = in_w_start_t + kernel_dim_w;

1061

const int in_h_end_t = in_h_start_t + kernel_dim_h;

1062

1063

// We are computing the valid initial and ending input points by checking the borders

1064

const int in_w_start = std::max(in_w_start_t, 0);

1065

const int in_h_start = std::max(in_h_start_t, 0);

1066

const int in_w_end = std::min(in_w_end_t, input_dim_w);

1067

const int in_h_end = std::min(in_h_end_t, input_dim_h);

1068

1069

// We use the input points to select the valid weight points to use

1070

const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;

1071

const int index_h_start = in_h_start - in_h_start_t;

1072

const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;

1073

const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);

1074

1075

execute_window_loop(window_w, [&](const Coordinates & id_w)

1076

{

1077

/*

1078

* This is the loop in the weights, and it goes along N (the batches)

1079

* As a reminder, the batches of the weights are translated into the

1080

* channels of the output

1081

*/

1082

const T *in_ptr_row = reinterpret_cast<const T *>(_input->buffer() + _input->info()->offset_first_element_in_bytes())

1083

+ id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;

1084

const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;

1085

uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;

1086

1087

T out_temp = static_cast<T>(0);

1088

for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)

1089

{

1090

const T *in_ptr_mover = in_ptr_row;

1091

int index_wc = index_wc_start;

1092

vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());

1093

for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)

1094

{

1095

const auto src_vec = wrapper::vloadq(in_ptr_mover);

1096

const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);

1097

out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);

1098

}

1099

out_temp += vreduce(out_temp_vec);

1100

for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)

1101

{

1102

const auto src_val = *(in_ptr_mover);

1103

const auto w_val = *(weights_ptr_row + index_wc);

1104

out_temp += src_val * w_val;

1105

}

1106

}

1107

*(reinterpret_cast<T *>(out_ptr)) = out_temp;

},

wei);

},

out);

}

template <typename T>

1115

void NEDirectConvolutionLayerKernel::convolve_nhwc(const Window &window)

1116

{

1117

// Declare useful types

1118

using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;

1119

using vector_type = typename vtype::type;

1120

using tag_type = typename vtype::tag_type;

1121

1122

// Scalar quantities

1123

const int element_size = _input->info()->element_size();

1124

const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size;

1125

const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size;

1126

const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size;

1127

const int input_dim_w = _input->info()->dimension(1);

1128

const int input_dim_h = _input->info()->dimension(2);

1129

1130

const int output_stride_c = _output->info()->strides_in_bytes().x();

1131

1132

const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size;

1133

const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size;

1134

const int kernel_dim_w = _weights->info()->dimension(1);

1135

const int kernel_dim_h = _weights->info()->dimension(2);

1136

1137

const int conv_pad_top = _conv_info.pad_top();

1138

const int conv_pad_left = _conv_info.pad_left();

1139

const int conv_stride_w = std::get<0>(_conv_info.stride());

1140

const int conv_stride_h = std::get<1>(_conv_info.stride());

1141

1142

// Setup input window for the output iterator

1143

Window window_out = window;

1144

window_out.set(Window::DimX, Window::Dimension(0, 1, 1));

1145

1146

// Setup input window for the weights iterator

1147

Window window_w = calculate_max_window(*_weights->info(), Steps());

1148

window_w.set(Window::DimX, Window::Dimension(0, 1, 1));

1149

window_w.set(Window::DimY, Window::Dimension(0, 1, 1));

1150

window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));

1151

1152

Iterator out(_output, window_out);

1153

Iterator wei(_weights, window_w);

1154

1155

constexpr int num_elems_read_per_iteration = 16 / sizeof(T);

1156

1157

execute_window_loop(window_out, [&](const Coordinates & id)

1158

{

1159

// We are computing the theoretical starting input starting points

1160

const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;

1161

const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;

1162

const int in_w_end_t = in_w_start_t + kernel_dim_w;

1163

const int in_h_end_t = in_h_start_t + kernel_dim_h;

1164

1165

// We are computing the valid initial and ending input points by checking the borders

1166

const int in_w_start = std::max(in_w_start_t, 0);

1167

const int in_h_start = std::max(in_h_start_t, 0);

1168

const int in_w_end = std::min(in_w_end_t, input_dim_w);

1169

const int in_h_end = std::min(in_h_end_t, input_dim_h);

1170

1171

// We use the input points to select the valid weight points to use

1172

const int wei_w_start = in_w_start - in_w_start_t;

1173

const int wei_h_start = in_h_start - in_h_start_t;

1174

const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);

1175

const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);

1176

1177

const int index_c_end = _weights->info()->dimension(0);

1178

const T *const in_ptr_start = reinterpret_cast<const T *>(_input->buffer() + _input->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;

1179

1180

execute_window_loop(window_w, [&](const Coordinates & id_w)

1181

{

1182

const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());

1183

uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;

1184

1185

T out_temp = static_cast<T>(0);

1186

for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)

1187

{

1188

const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;

1189

const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;

1190

for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)

1191

{

1192

const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;

1193

const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;

1194

int index_c = 0;

1195

vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());

1196

for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)

1197

{

1198

const auto src_vec = wrapper::vloadq(in_ptr_mover);

1199

const auto w_vec = wrapper::vloadq(weights_ptr_mover);

1200

out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);

1201

}

1202

out_temp += vreduce(out_temp_vec);

1203

for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)

1204

{

1205

const auto src_val = *(in_ptr_mover);

1206

const auto w_val = *(weights_ptr_mover);

1207

out_temp += src_val * w_val;

}

}

}

*(reinterpret_cast<T *>(out_ptr)) = out_temp;

},

wei);

},

out);

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1218

/** Default constructor.
 *
 * Leaves the kernel unconfigured: tensor pointers are null, every size and element count is zero,
 * and the convolution info and data layout are value-initialised. configure() fills them in.
 */
NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
    : _input(nullptr),
      _weights(nullptr),
      _output(nullptr),
      _conv_info(),
      _border_size(0),
      _kernel_size(0),
      _num_weight_elems_read_per_row(0),
      _num_elems_read_per_iteration(0),
      _num_elems_written_per_iteration(0),
      _data_layout()
{
}

/** Return the border currently stored in the kernel.
 *
 * The member is zero-initialised by the constructor and assigned in configure().
 *
 * @return A copy of the stored border size.
 */
BorderSize NEDirectConvolutionLayerKernel::border_size() const

{

return _border_size;

}

void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)

1230

{

Michalis Spyrou

2017-11-30 14:25:57 +0000

[diff] [blame]

1231

ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

_input = input;

_weights = weights;

_output = output;

_conv_info = conv_info;

Manuel Bottini

2021-03-23 11:50:34 +0000

[diff] [blame^]

1237

_data_layout = _input->info()->data_layout();

1238

_kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));

Michalis Spyrou

621965e

2018-01-08 17:11:26 +0000

[diff] [blame]

1239

1240

const unsigned int conv_pad_left = conv_info.pad_left();

1241

const unsigned int conv_pad_top = conv_info.pad_top();

1242

const unsigned int conv_pad_right = conv_info.pad_right();

1243

const unsigned int conv_pad_bottom = conv_info.pad_bottom();

Manuel Bottini

2021-03-23 11:50:34 +0000

[diff] [blame^]

1244

if(_data_layout == DataLayout::NCHW)

Manuel Bottini

2020-09-15 13:03:34 +0100

[diff] [blame]

1245

{

1246

_border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);

}

else

{

_border_size = BorderSize(0);

1251

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1252

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

1253

// Get convolved dimensions

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1254

TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input->info(), *weights->info(), conv_info);

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

1255

1256

DataType data_type = input->info()->data_type();

1257

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

1258

// Output auto inizialitation if not yet initialized

Vidhya Sudhan Loganathan

2018-07-04 09:34:00 +0100

[diff] [blame]

1259

auto_init_if_empty(*output->info(), output_shape, 1, data_type);

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

1260

Michalis Spyrou

2017-11-30 14:25:57 +0000

[diff] [blame]

1261

// Perform validation step

1262

ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), output->info(), conv_info));

Gian Marco Iodice

2017-08-08 10:53:00 +0100

[diff] [blame]

1263

Michalis Spyrou

2017-11-30 14:25:57 +0000

[diff] [blame]

1264

// Configure kernel window

1265

auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, _num_weight_elems_read_per_row,

Georgios Pinitas

0223a78

2017-12-12 11:44:44 +0000

[diff] [blame]

1266

_num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);

Michalis Spyrou

2017-11-30 14:25:57 +0000

[diff] [blame]

1267

ARM_COMPUTE_ERROR_THROW_ON(win_config.first);

1268

INEKernel::configure(win_config.second);

1269

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1270

Michalis Spyrou

2017-11-30 14:25:57 +0000

[diff] [blame]

1271

/** Check whether the kernel could be configured with the given tensor infos.
 *
 * First runs validate_arguments(). Only if that succeeds, it runs validate_and_configure_window()
 * on clones of the three tensor infos, so the caller's objects are not modified. The element
 * counts and border produced by the window configuration are discarded.
 *
 * @param[in] input     Source tensor info.
 * @param[in] weights   Weights tensor info.
 * @param[in] output    Destination tensor info.
 * @param[in] conv_info Padding and stride information.
 *
 * @return An empty Status on success, otherwise the first error reported by either step.
 */
Status NEDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    // Outputs of the window configuration; required by its signature but not used here
    unsigned int num_weight_elems_read_per_row   = 0;
    unsigned int num_elems_read_per_iteration    = 0;
    unsigned int num_elems_written_per_iteration = 0;
    BorderSize   border_size                     = {};

    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, output, conv_info));

    // Clone only after the arguments are known to be valid (in particular, non-null)
    const auto input_clone   = input->clone();
    const auto weights_clone = weights->clone();
    const auto output_clone  = output->clone();

    const auto window_result = validate_and_configure_window(input_clone.get(),
                                                             weights_clone.get(),
                                                             output_clone.get(),
                                                             conv_info,
                                                             num_weight_elems_read_per_row,
                                                             num_elems_read_per_iteration,
                                                             num_elems_written_per_iteration,
                                                             border_size);
    ARM_COMPUTE_RETURN_ON_ERROR(window_result.first);

    return Status{};
}

1290

Moritz Pflanzer

c186b57

2017-09-07 09:48:04 +0100

[diff] [blame]

1291

void NEDirectConvolutionLayerKernel::run(const Window &window, const ThreadInfo &info)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1292

{

Moritz Pflanzer

c186b57

2017-09-07 09:48:04 +0100

[diff] [blame]

1293

ARM_COMPUTE_UNUSED(info);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1294

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);

1295

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);

1296

ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);

1297

Manuel Bottini

2021-03-23 11:50:34 +0000

[diff] [blame^]

1298

const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1299

Manuel Bottini

2021-03-23 11:50:34 +0000

[diff] [blame^]

1300

if(_data_layout == DataLayout::NCHW)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1301

{

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1302

switch(kernel_size)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1303

{

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1304

case 1:

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1305

{

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1306

switch(_input->info()->data_type())

1307

{

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1308

case DataType::F32:

1309

convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);

1310

break;

Ioan-Cristian Szabo

2017-11-13 13:34:08 +0000

[diff] [blame]

1311

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1312

case DataType::F16:

1313

convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);

1314

break;

Ioan-Cristian Szabo

2017-11-13 13:34:08 +0000

[diff] [blame]

1315

#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1316

default:

1317

ARM_COMPUTE_ERROR("Data type not supported");

1318

break;

1319

}

1320

break;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1321

}

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1322

case 3:

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1323

{

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1324

switch(_input->info()->data_type())

1325

{

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1326

case DataType::F32:

1327

convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);

1328

break;

Ioan-Cristian Szabo

2017-11-13 13:34:08 +0000

[diff] [blame]

1329

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1330

case DataType::F16:

1331

convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);

1332

break;

Ioan-Cristian Szabo

2017-11-13 13:34:08 +0000

[diff] [blame]

1333

#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1334

default:

1335

ARM_COMPUTE_ERROR("Data type not supported");

1336

break;

1337

}

1338

break;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1339

}

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1340

case 5:

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

1341

{

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1342

switch(_input->info()->data_type())

1343

{

1344

case DataType::F32:

1345

convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);

1346

break;

1347

default:

1348

ARM_COMPUTE_ERROR("Data type not supported");

1349

break;

1350

}

1351

break;

Pablo Tello

2017-08-10 15:10:40 +0100

[diff] [blame]

1352

}

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1353

default:

1354

{

1355

ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");

break;

}

}

}

else

{

switch(_input->info()->data_type())

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

1363

{

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1364

case DataType::F32:

Gian Marco Iodice

2019-06-13 15:58:32 +0100

[diff] [blame]

1365

{

Manuel Bottini

2020-09-15 13:03:34 +0100

[diff] [blame]

1366

if(have_zero_x_internal_padding(_input->info(), _weights->info()))

Gian Marco Iodice

2019-06-13 15:58:32 +0100

[diff] [blame]

1367

{

Manuel Bottini

2020-09-15 13:03:34 +0100

[diff] [blame]

1368

convolve_nhwc_optimized<float>(window);

Gian Marco Iodice

2019-06-13 15:58:32 +0100

[diff] [blame]

1369

}

1370

else

1371

{

Manuel Bottini

2020-09-15 13:03:34 +0100

[diff] [blame]

1372

convolve_nhwc<float>(window);

Gian Marco Iodice

2019-06-13 15:58:32 +0100

[diff] [blame]

1373

}

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1374

break;

Gian Marco Iodice

2019-06-13 15:58:32 +0100

[diff] [blame]

1375

}

Giorgio Arena

2018-03-16 14:02:34 +0000

[diff] [blame]

1376

default:

1377

ARM_COMPUTE_ERROR("Data type not supported");

1378

break;

Anthony Barbier