/*
 * Copyright (c) 2016-2020 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"

#include "arm_compute/core/CPP/Validate.h"
#include "arm_compute/core/NEON/NEAsymm.h"
#include "arm_compute/core/NEON/NESymm.h"
#include "arm_compute/core/NEON/wrapper/wrapper.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"

namespace arm_compute
{
namespace
{
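// quantize<T>() helpers: std::enable_if selects the QASYMM8_SIGNED overload for
// int8_t and the QASYMM8 overload for uint8_t, so the templated kernels below can
// requantize scalar leftover elements with one call, e.g. quantize<uint8_t>(fval, qinfo).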
template <typename T>
inline typename std::enable_if<std::is_same<T, int8_t>::value, int8_t>::type
quantize(float val, const QuantizationInfo &info)
{
    return quantize_qasymm8_signed(val, info);
}

template <typename T>
inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8_t>::type
quantize(float val, const QuantizationInfo &info)
{
    return quantize_qasymm8(val, info);
}

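/** Element-wise subtraction for two tensors of the same data type.
 *
 * Vectorizes over the x dimension (16 bytes, i.e. 16 / sizeof(T) elements per
 * iteration) and handles x-broadcasting when one input has a window step of 0:
 * the broadcast scalar is duplicated into a vector, subtracted against the
 * non-broadcast input, and the result is negated when input2 is the broadcast
 * side so that the output is always in1 - in2. @p is_sat selects saturating
 * (vqsub) over wrapping (vsub) arithmetic.
 */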
template <typename T>
void sub_same(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    /** NEON vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    constexpr int window_step_x         = 16 / sizeof(T);
    const auto    window_start_x        = static_cast<int>(window.x().start());
    const auto    window_end_x          = static_cast<int>(window.x().end());
    const bool    is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    Iterator input1(in1, window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()));
    Iterator input2(in2, window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()));
    Iterator output(out, window);

    if(is_broadcast_across_x)
    {
        const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window         non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const T    broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
                auto       res             = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
                if(is_broadcast_input_2)
                {
                    res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
                }
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
                auto       res             = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
                if(is_broadcast_input_2)
                {
                    res = static_cast<T>(-1) * res;
                }

                *(output_ptr + x) = res;
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto val1 = wrapper::vloadq(input1_ptr + x);
                const auto val2 = wrapper::vloadq(input2_ptr + x);
                const auto res  = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto val1   = *(input1_ptr + x);
                const auto val2   = *(input2_ptr + x);
                *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
            }
        },
        input1, input2, output);
    }
}

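/** Element-wise subtraction for QASYMM8/QASYMM8_SIGNED tensors.
 *
 * Works in float: each input is dequantized as (q - offset) * scale, the
 * difference is taken, then requantized as q_out = (a_f - b_f) / oq_scale + oq_offset
 * (the vmlaq_f32 below fuses the multiply-add). On AArch64 vcvtnq_s32_f32 rounds
 * to nearest; the AArch32 fallback vcvtq_s32_f32 truncates. Results are narrowed
 * back to 8 bits with saturation, so @p is_sat is unused here.
 */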
template <typename T>
void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    ARM_COMPUTE_UNUSED(is_sat);

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 16;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();

    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
    const float32x4_t voffseto   = vdupq_n_f32(oq_info.offset);

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
        const float32x4_t             vscale1              = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
        const float32x4_t             vscale2              = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
        const int32x4_t               voffset1             = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
        const int32x4_t               voffset2             = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());

            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
            const auto broadcast_value_vec = wrapper::vdup_n(static_cast<T>(broadcast_value), wrapper::traits::vector_128_tag{});

            const float32x4x4_t bf =
            {
                {
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
                    vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
                }
            };

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto a = wrapper::vloadq(non_broadcast_input_ptr + x);

                const float32x4x4_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                    }
                };

                const int32x4x4_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else //__aarch64__
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const auto pa = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
                const auto pb = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
                const float bfs   = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
                *(output_ptr + x) = quantize<T>(is_broadcast_input_2 ? afs - bfs : bfs - afs, out->info()->quantization_info());
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        const float32x4_t vscale1  = vdupq_n_f32(iq1_info.scale);
        const float32x4_t vscale2  = vdupq_n_f32(iq2_info.scale);
        const int32x4_t   voffset1 = vdupq_n_s32(iq1_info.offset);
        const int32x4_t   voffset2 = vdupq_n_s32(iq2_info.offset);

        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<T *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto a = wrapper::vloadq(input1_ptr + x);
                const auto b = wrapper::vloadq(input2_ptr + x);

                const float32x4x4_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(a))))), voffset1)), vscale1),
                    }
                };

                const float32x4x4_t bf =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgetlow(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(b))))), voffset2)), vscale2),
                    }
                };

                const int32x4x4_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtnq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else //__aarch64__
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
                        vcvtq_s32_f32(vmlaq_f32(voffseto, vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const auto pa = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
                const auto pb = wrapper::vqmov<T>(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
                wrapper::vstore(output_ptr + x, wrapper::vcombine(pa, pb));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs = static_cast<int32_t>((*(input1_ptr + x)) - iq1_info.offset) * iq1_info.scale;
                const float bfs = static_cast<int32_t>((*(input2_ptr + x)) - iq2_info.offset) * iq2_info.scale;

                *(output_ptr + x) = quantize<T>((afs - bfs), out->info()->quantization_info());
            }
        },
        input1, input2, output);
    }
}

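/** Element-wise subtraction for QSYMM16 tensors.
 *
 * QSYMM16 is symmetric (zero offset), so dequantization is a plain multiply by
 * the scale. Eight int16_t elements per iteration are widened to two
 * float32x4_t lanes, subtracted in float, rescaled by 1 / oq_scale and
 * narrowed back with saturation (vqmovn_s32); @p is_sat is therefore unused.
 */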
void sub_QSYMM16_QSYMM16_QSYMM16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    ARM_COMPUTE_UNUSED(is_sat);

    // Create input windows
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    Window win = window;
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const int  window_step_x         = 8;
    const auto window_start_x        = static_cast<int>(window.x().start());
    const auto window_end_x          = static_cast<int>(window.x().end());
    const bool is_broadcast_across_x = (input1_win.x().step() == 0) || (input2_win.x().step() == 0);

    const UniformQuantizationInfo iq1_info = in1->info()->quantization_info().uniform();
    const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
    const UniformQuantizationInfo oq_info  = out->info()->quantization_info().uniform();

    const float32x4_t vscale1    = vdupq_n_f32(iq1_info.scale);
    const float32x4_t vscale2    = vdupq_n_f32(iq2_info.scale);
    const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);

    if(is_broadcast_across_x)
    {
        const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
        Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
        Window                        non_broadcast_win    = !is_broadcast_input_2 ? input2_win : input1_win;
        const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
        const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
        const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
        // Use the quantization info of the side that actually broadcasts: the function-scope
        // vscale1/vscale2 are tied to in1/in2 and would be swapped whenever in1 is the broadcast input
        const float32x4_t vscale_broadcast     = vdupq_n_f32(broadcast_qinfo.scale);
        const float32x4_t vscale_non_broadcast = vdupq_n_f32(non_broadcast_qinfo.scale);

        // Clear X Dimension on execution window as we handle manually
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_input(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
            const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());

            const int16_t   broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
            const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);

            const float32x4x2_t bf =
            {
                {
                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale_broadcast),
                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale_broadcast),
                }
            };
            const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int16x8_t     a = vld1q_s16(non_broadcast_input_ptr + x);
                const float32x4x2_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale_non_broadcast),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale_non_broadcast),
                    }
                };

                // Keep the in1 - in2 ordering used by every other kernel in this file:
                // when in2 is the broadcast input, the non-broadcast values (af) are the minuend
                const int32x4x2_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[0], bf.val[0]) : vsubq_f32(bf.val[0], af.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[1], bf.val[1]) : vsubq_f32(bf.val[1], af.val[1]), invvscaleo)),
#else //__aarch64__
                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[0], bf.val[0]) : vsubq_f32(bf.val[0], af.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(af.val[1], bf.val[1]) : vsubq_f32(bf.val[1], af.val[1]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
                vst1q_s16(output_ptr + x, pa);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
                *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (afs - bfs) : (bfs - afs), oq_info);
            }
        },
        broadcast_input, non_broadcast_input, output);
    }
    else
    {
        // Clear X Dimension on execution window as we handle manually
        input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator input1(in1, input1_win);
        Iterator input2(in2, input2_win);
        Iterator output(out, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
            const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const int16x8_t a = vld1q_s16(input1_ptr + x);
                const int16x8_t b = vld1q_s16(input2_ptr + x);

                const float32x4x2_t af =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
                    }
                };

                const float32x4x2_t bf =
                {
                    {
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
                        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
                    }
                };

                const int32x4x2_t rf =
                {
                    {
#ifdef __aarch64__
                        vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#else //__aarch64__
                        vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                        vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
#endif //__aarch64__
                    }
                };

                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
                vst1q_s16(output_ptr + x, pa);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const float afs   = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
                const float bfs   = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
                *(output_ptr + x) = quantize_qsymm16((afs - bfs), out->info()->quantization_info());
            }
        },
        input1, input2, output);
    }
}

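/** Subtraction of an S16 and a U8 input with an S16 output.
 *
 * The U8 input is widened to S16 (vmovl + reinterpret) before subtracting.
 * @p is_swapped flips the operand order so this single implementation also
 * serves U8 - S16; @p is_sat switches between the wrapping and saturating paths.
 */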
void sub_S16_U8_S16_impl(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat, bool is_swapped)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 8;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        if(!is_sat)
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = wrapper::vloadq(input1_ptr + x);
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                const auto res  = is_swapped ? wrapper::vsub(vin2, vin1) : wrapper::vsub(vin1, vin2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto res    = is_swapped ? static_cast<int16_t>(*(input2_ptr + x)) - *(input1_ptr + x) : *(input1_ptr + x) - static_cast<int16_t>(*(input2_ptr + x));
                *(output_ptr + x) = res;
            }
        }
        else
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = wrapper::vloadq(input1_ptr + x);
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                const auto res  = is_swapped ? wrapper::vqsub(vin2, vin1) : wrapper::vqsub(vin1, vin2);
                wrapper::vstore(output_ptr + x, res);
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                const auto res    = is_swapped ? wrapper::sub_sat(static_cast<int16_t>(*(input2_ptr + x)), *(input1_ptr + x)) : wrapper::sub_sat(*(input1_ptr + x), static_cast<int16_t>(*(input2_ptr + x)));
                *(output_ptr + x) = res;
            }
        }
    },
    input1, input2, output);
}

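// Thin wrappers over sub_S16_U8_S16_impl: the U8 - S16 variant swaps the operands
// and sets is_swapped so the implementation can restore the subtraction order.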
void sub_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    sub_S16_U8_S16_impl(in1, in2, out, window, is_sat, false);
}

void sub_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    // Swap arguments
    sub_S16_U8_S16_impl(in2, in1, out, window, is_sat, true);
}

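/** Subtraction of two U8 inputs with an S16 output.
 *
 * Both inputs are widened to S16 before subtracting; since the difference of
 * two 8-bit values always fits in 16 bits, the saturating path cannot actually
 * saturate and is kept for consistency with the other variants.
 */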
void sub_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, bool is_sat)
{
    // Create input windows
    Window win        = window;
    Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
    Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape());

    // Clear X Dimension on execution window as we handle manually
    win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
    input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input1(in1, input1_win);
    Iterator input2(in2, input2_win);
    Iterator output(out, win);

    const int  window_step_x  = 8;
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

        if(!is_sat)
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                wrapper::vstore(output_ptr + x, wrapper::vsub(vin1, vin2));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                *(output_ptr + x) = static_cast<int16_t>(*(input1_ptr + x)) - static_cast<int16_t>(*(input2_ptr + x));
            }
        }
        else
        {
            // Compute S elements per iteration
            int x = window_start_x;
            for(; x <= (window_end_x - window_step_x); x += window_step_x)
            {
                const auto vin1 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input1_ptr + x)));
                const auto vin2 = vreinterpretq_s16_u16(wrapper::vmovl(wrapper::vload(input2_ptr + x)));
                wrapper::vstore(output_ptr + x, wrapper::vqsub(vin1, vin2));
            }

            // Compute left-over elements
            for(; x < window_end_x; ++x)
            {
                *(output_ptr + x) = wrapper::sub_sat(static_cast<int16_t>(*(input1_ptr + x)),
                                                     static_cast<int16_t>(*(input2_ptr + x)));
            }
        }
    },
    input1, input2, output);
}

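/** Validate an input/input/output combination against the supported set.
 *
 * Checks data-type support for each tensor, broadcast compatibility of the two
 * input shapes, the WRAP-policy restriction for quantized types (the quantized
 * kernels always saturate), and, if the output is already configured, that its
 * data type and shape are consistent with the inputs.
 */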
inline Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ConvertPolicy policy)
{
    ARM_COMPUTE_UNUSED(policy);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input1);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::F16, DataType::F32);

    const TensorShape out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8)
        && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
        && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED)
        && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16)
        && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16)
        && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8)
        && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16)
        && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32)
        && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16),
        "You called subtract with the wrong image formats");

    // The quantized kernels always saturate, so WRAP is not a valid policy for them.
    // Each type pair is grouped with ||: chaining the pairs with && could never be true.
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
        ((input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8)
         || (input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED)
         || (input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16))
        && policy == ConvertPolicy::WRAP,
        "Convert policy cannot be WRAP if datatype is QASYMM8, QASYMM8_SIGNED or QSYMM16");

    // Validate in case of configured output
    if(output.total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
            !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::U8)
            && !(input1.data_type() == DataType::QASYMM8 && input2.data_type() == DataType::QASYMM8 && output.data_type() == DataType::QASYMM8)
            && !(input1.data_type() == DataType::QASYMM8_SIGNED && input2.data_type() == DataType::QASYMM8_SIGNED && output.data_type() == DataType::QASYMM8_SIGNED)
            && !(input1.data_type() == DataType::QSYMM16 && input2.data_type() == DataType::QSYMM16 && output.data_type() == DataType::QSYMM16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::U8 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::U8 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::S16 && input2.data_type() == DataType::S16 && output.data_type() == DataType::S16)
            && !(input1.data_type() == DataType::F32 && input2.data_type() == DataType::F32 && output.data_type() == DataType::F32)
            && !(input1.data_type() == DataType::F16 && input2.data_type() == DataType::F16 && output.data_type() == DataType::F16),
            "You called subtract with the wrong image formats");

        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
                                        "Wrong shape for output");
    }
    return Status{};
}
} // namespace

NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel()
    : _func(nullptr), _policy(ConvertPolicy::WRAP)
{
}

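// configure() maps the (input1, input2, output) data-type triple onto one of the
// kernel functions above, auto-initializes the output's shape and data type when
// still unknown, and builds an execution window over the broadcast output shape.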
void NEArithmeticSubtractionKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output, policy));

    _policy = policy;

    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
    const TensorShape &out_shape    = broadcast_pair.first;
    const ValidRegion &valid_region = broadcast_pair.second;

    // Auto initialize output if not initialized
    set_shape_if_empty(*output, out_shape);

    switch(input1->data_type())
    {
        case DataType::U8:
            if(input2->data_type() == DataType::U8 && output->data_type() == DataType::U8)
            {
                _func = &sub_same<uint8_t>;
            }
            else if(input2->data_type() == DataType::U8 && output->data_type() == DataType::S16)
            {
                _func = &sub_U8_U8_S16;
            }
            else
            {
                _func = &sub_U8_S16_S16;
            }
            break;
        case DataType::QASYMM8:
            _func = &sub_quantized<uint8_t>;
            set_data_type_if_unknown(*output, DataType::QASYMM8);
            break;
        case DataType::QASYMM8_SIGNED:
            _func = &sub_quantized<int8_t>;
            set_data_type_if_unknown(*output, DataType::QASYMM8_SIGNED);
            break;
        case DataType::S16:
            if(input2->data_type() == DataType::U8)
            {
                _func = &sub_S16_U8_S16;
            }
            else
            {
                _func = &sub_same<int16_t>;
            }
            set_format_if_unknown(*output, Format::S16);
            break;
        case DataType::QSYMM16:
            _func = &sub_QSYMM16_QSYMM16_QSYMM16;
            set_data_type_if_unknown(*output, DataType::QSYMM16);
            break;
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = &sub_same<float16_t>;
            set_format_if_unknown(*output, Format::F16);
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            _func = &sub_same<float>;
            set_format_if_unknown(*output, Format::F32);
            break;
        default:
            _func = nullptr;
    }

    // NEArithmeticSubtractionKernel doesn't need padding so update_window_and_padding() can be skipped
    Coordinates coord;
    coord.set_num_dimensions(output->num_dimensions());
    output->set_valid_region(valid_region);
    Window win = calculate_max_window(valid_region, Steps());

    INEKernel::configure(win);
}

Status NEArithmeticSubtractionKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, policy));

    return Status{};
}

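// run_op() is called by the scheduler with a (possibly split) window: it fetches
// the tensors from the input/output maps and dispatches the function pointer
// selected in configure(), translating the convert policy into the is_sat flag.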
void NEArithmeticSubtractionKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    // Dispatch kernel
    (*_func)(inputs.at(TensorType::ACL_SRC_0), inputs.at(TensorType::ACL_SRC_1), outputs.at(TensorType::ACL_DST), window, (_policy == ConvertPolicy::SATURATE));
}
} // namespace arm_compute