Blame - src/cpu/kernels/CpuTransposeKernel.cpp - ml/ComputeLibrary

2017-09-04 18:44:23 +0100

[diff] [blame]

204

}

205

206

void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window)

207

{

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

208

const int window_step_x = 4;

209

const int window_step_y = 4;

210

const int window_start_x = window.x().start();

211

const int window_end_x = window.x().end();

212

const int window_start_y = window.y().start();

213

const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));

214

const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;

215

const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];

216

const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];

217

218

// Check if we need a left-over loop for the y dimension

219

bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);

220

221

Window window_in(window);

222

window_in.set(Window::DimX, Window::Dimension(0, 1, 1));

223

if(left_over_loop_y)

224

{

225

// Check if window_end_y_multiple_of is greater than window_start_y

226

if(window_end_y_multiple_of > window_start_y)

227

{

228

window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));

}

else

{

window_in.set(Window::DimY, Window::Dimension(0, 0, 1));

}

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

236

Window window_out(window);

237

window_out.set(Window::DimX, Window::Dimension(0, 0, 0));

238

window_out.set(Window::DimY, Window::Dimension(0, 0, 0));

239

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

240

Iterator output(out, window_out);

241

Michele Di Giorgio

33f41fa

2021-03-09 14:09:08 +0000

[diff] [blame]

242

// Run the SIMD path if and only if the input is not a row-vector

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

243

if(in->info()->dimension(1) != 1)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

244

{

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

245

Iterator input(in, window_in);

246

execute_window_loop(window_in, [&](const Coordinates & id)

247

{

248

// Compute 4x4 elements per iteration

249

int x = window_start_x;

250

for(; x <= (window_end_x - window_step_x); x += window_step_x)

251

{

252

const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);

253

const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);

254

const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);

255

const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

256

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

257

// Transpose 2x2

258

const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);

259

const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

260

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

261

// Transpose 4x4

262

const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));

263

const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

264

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

265

// Compute destination address

266

const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

267

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

268

vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));

269

vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));

270

vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));

271

vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));

272

}

273

274

// Compute left-over elements (1x4)

275

for(; x < window_end_x; ++x)

276

{

277

const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);

278

const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);

279

const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);

280

const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);

281

282

uint16x4_t result = vdup_n_u16(0);

283

result = vset_lane_u16(val0, result, 0);

284

result = vset_lane_u16(val1, result, 1);

285

result = vset_lane_u16(val2, result, 2);

286

result = vset_lane_u16(val3, result, 3);

287

288

// Compute destination address

289

const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;

290

291

vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);

}

},

input, output);

}

if(left_over_loop_y)

{

window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));

300

window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));

301

302

Iterator input(in, window_in);

303

Iterator output(out, window_out);

304

305

// Compute left-over elements along the y dimension (1x1)

306

execute_window_loop(window_in, [&](const Coordinates & id)

307

{

308

const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));

309

310

// Compute destination address

311

const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;

312

313

*(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;

314

},

315

input, output);

316

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

317

}

318

Ethan Doe

a07c01b

2023-04-14 17:24:33 +0000

[diff] [blame]

319

#ifdef __aarch64__

320

/** Load eight consecutive 32-bit values starting at @p ptr as a pair of 4-lane vectors.
 *
 * @param[in] ptr Address of the first of the eight values to read.
 *
 * @return Elements [0, 4) in val[0] and elements [4, 8) in val[1].
 */
inline uint32x4x2_t vld1q_u32_x2_(const uint32_t *ptr)
{
    // gcc-7 doesn't support vld1q_u32_x2 instruction, so the pair is assembled from two plain loads
    const uint32x4_t lower_half = vld1q_u32(ptr);
    const uint32x4_t upper_half = vld1q_u32(ptr + 4);
    return {lower_half, upper_half};
}

325

326

inline void vst1q_u32_x2_(const uint32_t *ptr, const uint32x4x2_t &val)

327

{

328

// gcc-7 doesn't support vst1q_u32_x2 instruction

329

vst1q_u32(const_cast<uint32_t *>(ptr), val.val[0]);

330

vst1q_u32(const_cast<uint32_t *>(ptr + 4), val.val[1]);

331

}

332

333

void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)

334

{

335

constexpr int window_step_x = 8;

336

constexpr int window_step_y = 8;

337

const int window_start_x = window.x().start();

338

const int window_end_x = window.x().end();

339

const int window_start_y = window.y().start();

340

const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));

341

const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;

342

const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];

343

const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];

344

345

// Check if we need a left-over loop for the y dimension

346

bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);

347

348

Window window_in(window);

349

window_in.set(Window::DimX, Window::Dimension(0, 1, 1));

350

if(left_over_loop_y)

351

{

352

// Check if window_end_y_multiple_of is greater than window_start_y

353

if(window_end_y_multiple_of > window_start_y)

354

{

355

window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));

}

else

{

window_in.set(Window::DimY, Window::Dimension(0, 0, 1));

}

}

Window window_out(window);

364

window_out.set(Window::DimX, Window::Dimension(0, 0, 0));

365

window_out.set(Window::DimY, Window::Dimension(0, 0, 0));

366

367

Iterator output(out, window_out);

368

369

// Run the SIMD path if and only if the input is not a row-vector

370

if(in->info()->dimension(1) != 1)

371

{

372

Iterator input(in, window_in);

373

execute_window_loop(window_in, [&](const Coordinates & id)

374

{

375

// Compute 8x8 elements per iteration

376

int x = window_start_x;

377

for(; x <= (window_end_x - window_step_x); x += window_step_x)

378

{

379

// Load

380

const uint32x4x2_t row0 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);

381

const uint32x4x2_t row1 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);

382

const uint32x4x2_t row2 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);

383

const uint32x4x2_t row3 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);

384

const uint32x4x2_t row4 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);

385

const uint32x4x2_t row5 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);

386

const uint32x4x2_t row6 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);

387

const uint32x4x2_t row7 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);

388

389

// Transpose 2x4

390

const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), vtrn2q_u32(row0.val[0], row1.val[0])};

391

const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), vtrn2q_u32(row0.val[1], row1.val[1])};

392

const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), vtrn2q_u32(row2.val[0], row3.val[0])};

393

const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), vtrn2q_u32(row2.val[1], row3.val[1])};

394

const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), vtrn2q_u32(row4.val[0], row5.val[0])};

395

const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), vtrn2q_u32(row4.val[1], row5.val[1])};

396

const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), vtrn2q_u32(row6.val[0], row7.val[0])};

397

const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), vtrn2q_u32(row6.val[1], row7.val[1])};

398

399

// Transpose 2x2

400

const uint64x2x2_t k0_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))};

401

const uint64x2x2_t k1_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))};

402

const uint64x2x2_t k2_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))};

403

const uint64x2x2_t k3_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))};

404

const uint64x2x2_t k4_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))};

405

const uint64x2x2_t k5_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))};

406

const uint64x2x2_t k6_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))};

407

const uint64x2x2_t k7_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))};

408

409

// Swap blocks

410

const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), vreinterpretq_u32_u64(k4_u64.val[0])};

411

const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), vreinterpretq_u32_u64(k5_u64.val[0])};

412

const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), vreinterpretq_u32_u64(k4_u64.val[1])};

413

const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), vreinterpretq_u32_u64(k5_u64.val[1])};

414

const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), vreinterpretq_u32_u64(k6_u64.val[0])};

415

const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), vreinterpretq_u32_u64(k7_u64.val[0])};

416

const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), vreinterpretq_u32_u64(k6_u64.val[1])};

417

const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), vreinterpretq_u32_u64(k7_u64.val[1])};

418

419

// Compute destination address

420

const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;

421

422

// Store

423

vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), col0);

424

vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), col1);

425

vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), col2);

426

vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), col3);

427

vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), col4);

428

vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), col5);

429

vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), col6);

430

vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), col7);

431

}

432

433

// Compute left-over elements (8x1)

434

for(; x < window_end_x; ++x)

435

{

436

const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);

437

const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);

438

const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);

439

const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);

440

const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);

441

const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);

442

const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);

443

const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);

444

445

uint32x4_t result0 = vdupq_n_u32(0);

446

uint32x4_t result1 = vdupq_n_u32(0);

447

result0 = vsetq_lane_u32(val0, result0, 0);

448

result0 = vsetq_lane_u32(val1, result0, 1);

449

result0 = vsetq_lane_u32(val2, result0, 2);

450

result0 = vsetq_lane_u32(val3, result0, 3);

451

result1 = vsetq_lane_u32(val4, result1, 0);

452

result1 = vsetq_lane_u32(val5, result1, 1);

453

result1 = vsetq_lane_u32(val6, result1, 2);

454

result1 = vsetq_lane_u32(val7, result1, 3);

455

456

// Compute destination address

457

const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;

458

459

vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1});

}

},

input, output);

}

if(left_over_loop_y)

{

window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));

468

window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));

469

470

Iterator input(in, window_in);

471

Iterator output(out, window_out);

472

473

// Compute left-over elements along the y dimension (1x1)

474

execute_window_loop(window_in, [&](const Coordinates & id)

475

{

476

const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));

477

478

// Compute destination address

479

const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;

480

481

*(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;

},

input, output);

}

}

#else // __aarch64__

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

487

void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)

488

{

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

489

const int window_step_x = 4;

490

const int window_step_y = 4;

491

const int window_start_x = window.x().start();

492

const int window_end_x = window.x().end();

493

const int window_start_y = window.y().start();

494

const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));

495

const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;

496

const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];

497

const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];

498

499

// Check if we need a left-over loop for the y dimension

500

bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);

501

502

Window window_in(window);

503

window_in.set(Window::DimX, Window::Dimension(0, 1, 1));

504

if(left_over_loop_y)

505

{

506

// Check if window_end_y_multiple_of is greater than window_start_y

507

if(window_end_y_multiple_of > window_start_y)

508

{

509

window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));

}

else

{

window_in.set(Window::DimY, Window::Dimension(0, 0, 1));

}

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

517

Window window_out(window);

518

window_out.set(Window::DimX, Window::Dimension(0, 0, 0));

519

window_out.set(Window::DimY, Window::Dimension(0, 0, 0));

520

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

521

Iterator output(out, window_out);

522

Michele Di Giorgio

33f41fa

2021-03-09 14:09:08 +0000

[diff] [blame]

523

// Run the SIMD path if and only if the input is not a row-vector

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

524

if(in->info()->dimension(1) != 1)

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

525

{

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

526

Iterator input(in, window_in);

527

execute_window_loop(window_in, [&](const Coordinates & id)

528

{

529

// Compute 4x4 elements per iteration

530

int x = window_start_x;

531

for(; x <= (window_end_x - window_step_x); x += window_step_x)

532

{

533

const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);

534

const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);

535

const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);

536

const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

537

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

538

// Transpose 2x2

539

const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));

540

const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));

541

const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));

542

const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

543

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

544

// Compute destination address

545

const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

546

Gian Marco

2017-12-07 10:09:07 +0000

[diff] [blame]

547

// Swap block 01 with block 10 and store

548

vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));

549

vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));

550

vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));

551

vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));

552

}

553

554

// Compute left-over elements (1x4)

555

for(; x < window_end_x; ++x)

556

{

557

const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);

558

const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);

559

const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);

560

const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);

561

562

uint32x4_t result = vdupq_n_u32(0);

563

result = vsetq_lane_u32(val0, result, 0);

564

result = vsetq_lane_u32(val1, result, 1);

565

result = vsetq_lane_u32(val2, result, 2);

566

result = vsetq_lane_u32(val3, result, 3);

567

568

// Compute destination address

569

const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;

570

571

vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);

}

},

input, output);

}

if(left_over_loop_y)

{

window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));

580

window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));

581

582

Iterator input(in, window_in);

583

Iterator output(out, window_out);

584

585

// Compute left-over elements along the y dimension (1x1)

586

execute_window_loop(window_in, [&](const Coordinates & id)

587

{

588

const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));

589

590

// Compute destination address

591

const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;

592

593

*(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;

594

},

595

input, output);

596

}

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

597

}

Ethan Doe

a07c01b

2023-04-14 17:24:33 +0000

[diff] [blame]

598

#endif // __aarch64__

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

599

} // namespace

600

Teresa Charlin

2021-03-04 15:24:45 +0000

[diff] [blame]

601

void CpuTransposeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)

Gian Marco

7c435f2

2017-12-05 16:17:23 +0000

[diff] [blame]

602

{

Teresa Charlin

2021-03-04 15:24:45 +0000

[diff] [blame]

603

ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);

Gian Marco

7c435f2

2017-12-05 16:17:23 +0000

[diff] [blame]

604

Teresa Charlin

2021-03-04 15:24:45 +0000

[diff] [blame]

605

// Destination auto inizialitation if not yet initialized

606

const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);

607

auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

608

Teresa Charlin

2021-03-04 15:24:45 +0000

[diff] [blame]

609

// Perform validation step

610

ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst));

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

611

Michalis Spyrou

0b1452d

2020-02-27 16:20:19 +0000

[diff] [blame]

612

// Note: This kernel performs 16 elements per iteration.

613

// However, since we use a left-over for loop on both dimensions (X and Y), we cannot have any read or write out of memory

614

// For this reason num_elems_processed_per_iteration_x is set to 1

615

const unsigned int num_elems_processed_per_iteration_x = 1;

Teresa Charlin

2021-03-04 15:24:45 +0000

[diff] [blame]

616

const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size());

Michalis Spyrou

0b1452d

2020-02-27 16:20:19 +0000

[diff] [blame]

617

618

// Configure kernel window

Teresa Charlin

2021-03-04 15:24:45 +0000

[diff] [blame]

619

Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));

Michalis Spyrou

0b1452d

2020-02-27 16:20:19 +0000

[diff] [blame]

620

Teresa Charlin

2021-03-04 15:24:45 +0000

[diff] [blame]

621

// The CpuTranspose doesn't need padding so update_window_and_padding() can be skipped

622

Coordinates coord;

623

coord.set_num_dimensions(dst->num_dimensions());

624

dst->set_valid_region(ValidRegion(coord, dst->tensor_shape()));

625

626

ICpuKernel::configure(win);

Anthony Barbier

2017-09-04 18:44:23 +0100

[diff] [blame]

627

}

628

Teresa Charlin

2021-03-04 15:24:45 +0000

[diff] [blame]

629

/** Check whether the kernel can be configured with the given tensor infos.
 *
 * @param[in] src Source tensor info. Must not be null, must have a known data type and an element
 *                size of 1, 2 or 4 bytes.
 * @param[in] dst Destination tensor info. When it is null or still empty (total size 0) only
 *                @p src is checked; otherwise its shape must equal the transposed shape of @p src
 *                and its quantization info and data type must match @p src.
 *
 * @return An empty Status on success, otherwise the error reported by the failing check.
 */
Status CpuTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
    //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);

    // Error if input is not 8 bit, 16bit or 32bit
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->element_size() != 1 && src->element_size() != 2 && src->element_size() != 4,
                                    "Element size not supported");

    // Validate configured destination.
    // dst is only dereferenced when it is non-null: a null destination is treated like one that
    // has not been configured yet instead of crashing on dst->total_size().
    if(dst != nullptr && dst->total_size() != 0)
    {
        const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);

        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
    }

    return Status{};
}

/** Execute the transpose on the given sub-window.
 *
 * Fetches the source (ACL_SRC) and destination (ACL_DST) tensors from @p tensors and dispatches
 * to the transpose routine that matches the source element size.
 *
 * @param[in,out] tensors Pack holding the source and destination tensors.
 * @param[in]     window  Sub-window of the configured kernel window to process.
 * @param[in]     info    Thread information (unused).
 */
void CpuTransposeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);

    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
    auto       dst = tensors.get_tensor(TensorType::ACL_DST);

    // Pick the implementation from the element width in bytes
    const auto bytes_per_element = src->info()->element_size();
    if(bytes_per_element == 1)
    {
        transpose_8bit_elements(src, dst, window);
    }
    else if(bytes_per_element == 2)
    {
        transpose_16bit_elements(src, dst, window);
    }
    else if(bytes_per_element == 4)
    {
        transpose_32bit_elements(src, dst, window);
    }
    else
    {
        ARM_COMPUTE_ERROR("Element size not supported");
    }
}

Teresa Charlin