/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEArithmeticSubtractionKernel.h"

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEAsymm.h"
#include "src/core/NEON/NESymm.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

namespace arm_compute
{
namespace
{
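// quantize() is overloaded through SFINAE so that sub_quantized<T> below selects the
// matching requantization routine for its element type at compile time:
// int8_t maps to quantize_qasymm8_signed(), uint8_t to quantize_qasymm8().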
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
quantize(float val, const QuantizationInfo &info)
{
    return quantize_qasymm8_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
quantize(float val, const QuantizationInfo &info)
{
    return quantize_qasymm8(val, info);
}

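// Element-wise subtraction of two inputs of the same data type T, with optional
// broadcast of a single value along the X dimension for either input. Per element:
// out[i] = is_sat ? saturating_sub(in1[i], in2[i]) : in1[i] - in2[i].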
template <typename T>
void sub_same(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    /** NEON vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(T);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
    Iterator output(out, window);

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const T    broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                auto       res             = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
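                // res holds broadcast - non_broadcast; when input2 is the broadcast
                // operand that equals in2 - in1, so the result is negated below to
                // recover in1 - in2.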
                if(is_broadcast_input_2)
                {
                    res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
                }
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                auto       res             = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
                if(is_broadcast_input_2)
                {
                    res = static_cast<T>(-1) * res;
                }

                *(output_ptr + x) = res;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto val1 = wrapper::vloadq(input1_ptr + x);
                const auto val2 = wrapper::vloadq(input2_ptr + x);
                const auto res  = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto val1   = *(input1_ptr + x);
                const auto val2   = *(input2_ptr + x);
                *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
            }
        },
        input1, input2, output);
    }
}

template <typename T>
void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    ARM_COMPUTE_UNUSED(is_sat);
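    // The subtraction is performed in float: both operands are dequantized as
    // (q - offset) * scale, subtracted, then requantized with the output scale/offset.
    // is_sat is ignored because requantization saturates by construction
    // (vqmovn/quantize clamp to the output type's range).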

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();

    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
    const float32x4_t voffseto   = vdupq_n_f32(oq_info.offset);

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
        const float32x4_t             vscale1              = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
        const float32x4_t             vscale2              = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
        const int32x4_t               voffset1             = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
        const int32x4_t               voffset2             = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(static_cast<T>(broadcast_value), wrapper::traits::vector_128_tag{});

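            // Widen each group of four 8-bit lanes to 32 bits (8 -> 16 -> 32 via two
            // vmovl steps), then dequantize: f = (q - offset) * scale.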
            const float32x4x4_t bf =
            {
                {
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
                }
            };

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto a = wrapper::vloadq(non_broadcast_input_ptr + x);

                const float32x4x4_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                    }
                };

                const int32x4x4_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else  //__aarch64__
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const auto pa = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
                const auto pb = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
                const float bfs   = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
                *(output_ptr + x) = quantize<T>(is_broadcast_input_2 ? afs - bfs : bfs - afs, out->info()->quantization_info());
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        const float32x4_t vscale1  = vdupq_n_f32(iq1_info.scale);
        const float32x4_t vscale2  = vdupq_n_f32(iq2_info.scale);
        const int32x4_t   voffset1 = vdupq_n_s32(iq1_info.offset);
        const int32x4_t   voffset2 = vdupq_n_s32(iq2_info.offset);

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto a = wrapper::vloadq(input1_ptr + x);
                const auto b = wrapper::vloadq(input2_ptr + x);

                const float32x4x4_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                    }
                };

                const float32x4x4_t bf =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
                    }
                };

                const int32x4x4_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else  //__aarch64__
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const auto pa = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
                const auto pb = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
                const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;

                *(output_ptr + x) = quantize<T>((afs - bfs), out->info()->quantization_info());
            }
        },
        input1, input2, output);
    }
}

void sub_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    ARM_COMPUTE_UNUSED(is_sat);

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 8;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();

    const float32x4_t vscale1    = vdupq_n_f32(iq1_info.scale);
    const float32x4_t vscale2    = vdupq_n_f32(iq2_info.scale);
    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
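    // QSYMM16 is symmetric (zero offset), so dequantization is simply q * scale and
    // requantization is r * (1 / output_scale).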

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());

            const int16_t   broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
            const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);

            const float32x4x2_t bf =
            {
                {
                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
                }
            };
            const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int16x8_t     a  = vld1q_s16(non_broadcast_input_ptr + x);
                const float32x4x2_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
                    }
                };

                const int32x4x2_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#else  //__aarch64__
                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
                vst1q_s16(output_ptr + x, pa);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
                *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info);
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int16x8_t a = vld1q_s16(input1_ptr + x);
                const int16x8_t b = vld1q_s16(input2_ptr + x);

                const float32x4x2_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
                    }
                };

                const float32x4x2_t bf =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
                    }
                };

                const int32x4x2_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#else  //__aarch64__
                        vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
                vst1q_s16(output_ptr + x, pa);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
                const float bfs   = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
                *(output_ptr + x) = quantize_qsymm16((afs - bfs), out->info()->quantization_info());
            }
        },
        input1, input2, output);
    }
}

void sub_S16_U8_S16_impl(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat, bool is_swapped)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 8;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        if(!is_sat)
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = wrapper::vloadq(input1_ptr + x);
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                const auto res  = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto res    = is_swapped ? static_cast<int16_t>(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast<int16_t>(*(input2_ptr + x));
                *(output_ptr + x) = res;
            }
        }
        else
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = wrapper::vloadq(input1_ptr + x);
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                const auto res  = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto res    = is_swapped ? wrapper::sub_sat(static_cast<int16_t>(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
                *(output_ptr + x) = res;
            }
        }
    },
    input1, input2, output);
}

void sub_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    sub_S16_U8_S16_impl(in1, in2, out, window, is_sat, false);
}

void sub_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    // Swap arguments
    sub_S16_U8_S16_impl(in2, in1, out, window, is_sat, true);
}

void sub_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
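    // Both U8 inputs are widened to S16 (zero-extend via vmovl) before subtracting,
    // producing an S16 output.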
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 8;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        if(!is_sat)
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) - static_cast<int16_t>(*(input2_ptr + x));
            }
        }
        else
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                *(output_ptr + x) = wrapper::sub_sat(static_cast<int16_t>(*(input1_ptr + x)),
                                                     static_cast<int16_t>(*(input2_ptr + x)));
            }
        }
    },
    input1, input2, output);
}

inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
{
    ARM_COMPUTE_UNUSED(policy);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
                                                         DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
                                                         DataType::F32);

    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
        && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
        && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED)
        && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16)
        && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16)
        && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8)
        && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16)
        && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32)
        && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32)
        && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16),
        "You called subtract with the wrong image formats");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        (input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && policy == ConvertPolicy::WRAP)
        || (input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && policy == ConvertPolicy::WRAP)
        || (input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && policy == ConvertPolicy::WRAP),
        "Convert policy cannot be WRAP if datatype is quantized");

    // Validate in case of configured output
    if(output.total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
            && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
            && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED)
            && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && output.data_type() == DataType::QSYMM16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S32 && input2.data_type() == DataType::S32 && output.data_type() == DataType::S32)
            && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
            "You called subtract with the wrong image formats");

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                        "Wrong shape for output");
    }
    return Status{};
}
} // namespace

NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
    : _func(nullptr), _policy(ConvertPolicy::WRAP)
{
}

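// Illustrative configure-time usage (a sketch, not part of the original file; the
// tensor shapes and data types shown are assumptions):
//
//   TensorInfo src0(TensorShape(16U), 1, DataType::F32);
//   TensorInfo src1(TensorShape(16U), 1, DataType::F32);
//   TensorInfo dst(TensorShape(16U), 1, DataType::F32);
//   NEArithmeticSubtractionKernel kernel;
//   kernel.configure(&src0, &src1, &dst, ConvertPolicy::SATURATE);
//   // At run time the actual tensors are supplied through an ITensorPack keyed by
//   // TensorType::ACL_SRC_0 / ACL_SRC_1 / ACL_DST (see run_op() below).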
void NEArithmeticSubtractionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy));

    _policy = policy;

    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    set_shape_if_empty(*output, out_shape);

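    // Select the kernel function from the input/output data-type combination;
    // unsupported combinations have already been rejected by validate_arguments().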
    switch(input1->data_type())
    {
        case DataType::U8:
            if(input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
            {
                _func = &sub_same<uint8_t>;
            }
            else if(input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
            {
                _func = &sub_U8_U8_S16;
            }
            else
            {
                _func = &sub_U8_S16_S16;
            }
            break;
        case DataType::QASYMM8:
            _func = &sub_quantized<uint8_t>;
            set_data_type_if_unknown(*output, DataType::QASYMM8);
            break;
        case DataType::QASYMM8_SIGNED:
            _func = &sub_quantized<int8_t>;
            set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
            break;
        case DataType::S16:
            if(input2->data_type() == DataType::U8)
            {
                _func = &sub_S16_U8_S16;
            }
            else
            {
                _func = &sub_same<int16_t>;
            }
            set_format_if_unknown(*output, Format::S16);
            break;
        case DataType::QSYMM16:
            _func = &sub_QSYMM16_QSYMM16_QSYMM16;
            set_data_type_if_unknown(*output, DataType::QSYMM16);
            break;
        case DataType::S32:
            _func = &sub_same<int32_t>;
            set_format_if_unknown(*output, Format::S32);
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = &sub_same<float16_t>;
            set_format_if_unknown(*output, Format::F16);
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func = &sub_same<float>;
            set_format_if_unknown(*output, Format::F32);
            break;
        default:
            _func = nullptr;
    }

    // NEArithmeticSubtractionKernel doesn't need padding so update_window_and_padding() can be skipped
    Coordinates coord;
    coord.set_num_dimensions(output->num_dimensions());
    output->set_valid_region(valid_region);
    Window win = calculate_max_window(valid_region, Steps());

    INEKernel::configure(win);
}

Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));

    return Status{};
}

void NEArithmeticSubtractionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    // Dispatch kernel
    (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC_0),
             tensors.get_const_tensor(TensorType::ACL_SRC_1),
             tensors.get_tensor(TensorType::ACL_DST),
             window,
             (_policy == ConvertPolicy::SATURATE));
}
} // namespace arm_compute