Blame - src/core/CL/cl_kernels/winograd_output_transform.cl - ml/ComputeLibrary

2018-07-04 17:03:33 +0100

[diff] [blame]

354

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

355

*

356

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

357

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

358

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

359

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

360

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

361

*

362

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

363

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

364

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

365

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

366

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

367

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

368

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

369

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

370

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

371

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

372

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

373

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

374

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

375

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

376

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

377

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

378

* @param[in] dst_size Size of the destination tensor, minus the last padding

379

*/

380

__kernel void winograd_output_transform_4x4_3x3_nhwc(

381

TENSOR3D_DECLARATION(src),

382

TENSOR3D_DECLARATION(dst),

383

#if defined(HAS_BIAS)

384

VECTOR_DECLARATION(bias),

385

#endif // defined(HAS_BIAS)

386

int dst_size)

387

{

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

388

// Each thread stores a 4x4/4x1 or 1x4 tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

389

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

390

391

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

392

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

393

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

394

float d00 = *((__global float *)(src_addr + 0 * src_stride_z));

395

float d01 = *((__global float *)(src_addr + 1 * src_stride_z));

396

float d02 = *((__global float *)(src_addr + 2 * src_stride_z));

397

float d03 = *((__global float *)(src_addr + 3 * src_stride_z));

398

float d04 = *((__global float *)(src_addr + 4 * src_stride_z));

399

float d05 = *((__global float *)(src_addr + 5 * src_stride_z));

400

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

401

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

402

// Compute out00, out01, out02 and out03

403

float out00 = d00 + d01 + d02 + d03 + d04;

404

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

405

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

406

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

407

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

408

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

409

float d10 = *((__global float *)(src_addr + 6 * src_stride_z));

410

float d11 = *((__global float *)(src_addr + 7 * src_stride_z));

411

float d12 = *((__global float *)(src_addr + 8 * src_stride_z));

412

float d13 = *((__global float *)(src_addr + 9 * src_stride_z));

413

float d14 = *((__global float *)(src_addr + 10 * src_stride_z));

414

float d15 = *((__global float *)(src_addr + 11 * src_stride_z));

415

416

float d20 = *((__global float *)(src_addr + 12 * src_stride_z));

417

float d21 = *((__global float *)(src_addr + 13 * src_stride_z));

418

float d22 = *((__global float *)(src_addr + 14 * src_stride_z));

419

float d23 = *((__global float *)(src_addr + 15 * src_stride_z));

420

float d24 = *((__global float *)(src_addr + 16 * src_stride_z));

421

float d25 = *((__global float *)(src_addr + 17 * src_stride_z));

422

423

float d30 = *((__global float *)(src_addr + 18 * src_stride_z));

424

float d31 = *((__global float *)(src_addr + 19 * src_stride_z));

425

float d32 = *((__global float *)(src_addr + 20 * src_stride_z));

426

float d33 = *((__global float *)(src_addr + 21 * src_stride_z));

427

float d34 = *((__global float *)(src_addr + 22 * src_stride_z));

428

float d35 = *((__global float *)(src_addr + 23 * src_stride_z));

429

430

float d40 = *((__global float *)(src_addr + 24 * src_stride_z));

431

float d41 = *((__global float *)(src_addr + 25 * src_stride_z));

432

float d42 = *((__global float *)(src_addr + 26 * src_stride_z));

433

float d43 = *((__global float *)(src_addr + 27 * src_stride_z));

434

float d44 = *((__global float *)(src_addr + 28 * src_stride_z));

435

float d45 = *((__global float *)(src_addr + 29 * src_stride_z));

436

437

float d50 = *((__global float *)(src_addr + 30 * src_stride_z));

438

float d51 = *((__global float *)(src_addr + 31 * src_stride_z));

439

float d52 = *((__global float *)(src_addr + 32 * src_stride_z));

440

float d53 = *((__global float *)(src_addr + 33 * src_stride_z));

441

float d54 = *((__global float *)(src_addr + 34 * src_stride_z));

442

float d55 = *((__global float *)(src_addr + 35 * src_stride_z));

443

444

// Compute out00, out01, out02 and out03

445

float out00 = d01 + d21 + d41 + d11 + d31;

446

float out01 = d01 + d21 + d41 + d11 + d31;

447

float out02 = d01 + d21 + d41 + d11 + d31;

448

float out03 = d01 + d21 + d41 + d11 + d31;

449

450

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

451

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

452

453

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

454

out01 += k1 - d02 - d12 - d22 - d32 - d42;

455

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

456

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

457

458

// Compute out10, out11, out12 and out13

459

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

460

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

461

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

462

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

463

464

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

465

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

466

467

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

468

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

469

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

470

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

471

472

// Compute out20, out21, out22 and out23

473

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

474

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

475

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

476

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

477

478

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

479

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

480

481

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

482

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

483

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

484

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

485

486

// Compute out30, out31, out32 and out33

487

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

488

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

489

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

490

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

491

492

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

493

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

494

495

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

496

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

497

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

498

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

499

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

500

501

int y_in = get_global_id(1);

502

int x_out = get_global_id(0);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

503

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

504

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

505

506

#if defined(HAS_BIAS)

507

// Add bias

508

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

509

Giorgio Arena

ea55f91

2018-07-12 15:41:35 +0100

[diff] [blame]

510

float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

516

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

out10 += (float)b;

out11 += (float)b;

out12 += (float)b;

out13 += (float)b;

out20 += (float)b;

out21 += (float)b;

out22 += (float)b;

out23 += (float)b;

out30 += (float)b;

out31 += (float)b;

out32 += (float)b;

out33 += (float)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

531

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

532

533

#endif // defined(HAS_BIAS)

534

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

535

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

536

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);

537

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

538

539

// Store the 1x4 output tile

540

*((__global float *)(dst_ptr + offset.s0)) = out00;

541

*((__global float *)(dst_ptr + offset.s1)) = out01;

542

*((__global float *)(dst_ptr + offset.s2)) = out02;

543

*((__global float *)(dst_ptr + offset.s3)) = out03;

544

#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

545

// Store the 4x1 output tile

Giorgio Arena

2018-07-18 11:45:30 +0100

[diff] [blame]

546

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;

547

int mult_y = min(dst_size - offset, 1);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

548

Giorgio Arena

2018-07-18 11:45:30 +0100

[diff] [blame]

549

*((__global float *)(dst_ptr + mult_y * 0 * dst_stride_y + offset)) = out00;

550

*((__global float *)(dst_ptr + mult_y * 1 * dst_stride_y + offset)) = out01;

551

*((__global float *)(dst_ptr + mult_y * 2 * dst_stride_y + offset)) = out02;

552

*((__global float *)(dst_ptr + mult_y * 3 * dst_stride_y + offset)) = out03;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

553

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

554

// Get output address

555

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);

556

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

557

int4 mult_y = min(dst_size - offset, 1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

558

559

// Store the 4x4 output tile

560

*((__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out00;

561

*((__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out01;

562

*((__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out02;

563

*((__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out03;

564

*((__global float *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out10;

565

*((__global float *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out11;

566

*((__global float *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out12;

567

*((__global float *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out13;

568

*((__global float *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out20;

569

*((__global float *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out21;

570

*((__global float *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out22;

571

*((__global float *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out23;

572

*((__global float *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out30;

573

*((__global float *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;

574

*((__global float *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;

575

*((__global float *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame]

576

577

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

578

}

579

580

#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \

581

({ \

582

comm_fact.s0 = d1 + d2; \

583

comm_fact.s1 = d3 + d4; \

584

comm_fact.s2 = d5 + d6; \

585

\

586

col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \

587

col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \

588

\

589

comm_fact.s0 = d1 - d2; \

590

comm_fact.s1 = d3 - d4; \

591

comm_fact.s2 = d5 - d6; \

592

\

593

col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \

594

col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \

595

})

596

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

597

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

598

*

599

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

600

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

601

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

602

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

603

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

604

*

605

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

606

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

607

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

608

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

609

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

610

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

611

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

612

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

613

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

614

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

615

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

616

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

617

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

618

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

619

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

620

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

621

*/

622

__kernel void winograd_output_transform_4x4_5x5_nchw(

623

TENSOR3D_DECLARATION(src),

624

TENSOR3D_DECLARATION(dst)

625

#if defined(HAS_BIAS)

626

,

627

VECTOR_DECLARATION(bias)

628

#endif // defined(HAS_BIAS)

629

)

630

{

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

631

// Each thread stores a 4x4/4x1 or 1x4 tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

632

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

633

634

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

635

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

636

// Compute output address

637

int y_in = get_global_id(1);

638

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

639

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

640

int z_out = get_global_id(0);

641

642

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;

643

644

// Load the values across the channels to compose the input tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

645

float d00 = *((__global float *)(src_addr + 0 * src_stride_z));

646

float d01 = *((__global float *)(src_addr + 1 * src_stride_z));

647

float d02 = *((__global float *)(src_addr + 2 * src_stride_z));

648

float d03 = *((__global float *)(src_addr + 3 * src_stride_z));

649

float d04 = *((__global float *)(src_addr + 4 * src_stride_z));

650

float d05 = *((__global float *)(src_addr + 5 * src_stride_z));

651

float d06 = *((__global float *)(src_addr + 6 * src_stride_z));

652

float d07 = *((__global float *)(src_addr + 7 * src_stride_z));

653

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

654

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

655

// Compute out00, out01, out02 and out03

656

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

657

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

658

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

659

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

660

661

#if defined(HAS_BIAS)

662

// Add bias

663

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

664

665

float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

#endif // defined(HAS_BIAS)

672

673

// Store the output tile

674

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

675

*((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;

676

*((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;

677

*((__global float *)(dst_addr + 2 * dst_stride_y)) = out02;

678

*((__global float *)(dst_addr + 3 * dst_stride_y)) = out03;

679

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

680

vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr));

681

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

682

683

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-18 11:45:30 +0100

[diff] [blame]

684

float d10 = *((__global float *)(src_addr + 8 * src_stride_z));

685

float d11 = *((__global float *)(src_addr + 9 * src_stride_z));

686

float d12 = *((__global float *)(src_addr + 10 * src_stride_z));

687

float d13 = *((__global float *)(src_addr + 11 * src_stride_z));

688

float d14 = *((__global float *)(src_addr + 12 * src_stride_z));

689

float d15 = *((__global float *)(src_addr + 13 * src_stride_z));

690

float d16 = *((__global float *)(src_addr + 14 * src_stride_z));

691

float d17 = *((__global float *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

692

693

float d20 = *((__global float *)(src_addr + 16 * src_stride_z));

694

float d21 = *((__global float *)(src_addr + 17 * src_stride_z));

695

float d22 = *((__global float *)(src_addr + 18 * src_stride_z));

696

float d23 = *((__global float *)(src_addr + 19 * src_stride_z));

697

float d24 = *((__global float *)(src_addr + 20 * src_stride_z));

698

float d25 = *((__global float *)(src_addr + 21 * src_stride_z));

699

float d26 = *((__global float *)(src_addr + 22 * src_stride_z));

700

float d27 = *((__global float *)(src_addr + 23 * src_stride_z));

701

702

float d30 = *((__global float *)(src_addr + 24 * src_stride_z));

703

float d31 = *((__global float *)(src_addr + 25 * src_stride_z));

704

float d32 = *((__global float *)(src_addr + 26 * src_stride_z));

705

float d33 = *((__global float *)(src_addr + 27 * src_stride_z));

706

float d34 = *((__global float *)(src_addr + 28 * src_stride_z));

707

float d35 = *((__global float *)(src_addr + 29 * src_stride_z));

708

float d36 = *((__global float *)(src_addr + 30 * src_stride_z));

709

float d37 = *((__global float *)(src_addr + 31 * src_stride_z));

710

711

float d40 = *((__global float *)(src_addr + 32 * src_stride_z));

712

float d41 = *((__global float *)(src_addr + 33 * src_stride_z));

713

float d42 = *((__global float *)(src_addr + 34 * src_stride_z));

714

float d43 = *((__global float *)(src_addr + 35 * src_stride_z));

715

float d44 = *((__global float *)(src_addr + 36 * src_stride_z));

716

float d45 = *((__global float *)(src_addr + 37 * src_stride_z));

717

float d46 = *((__global float *)(src_addr + 38 * src_stride_z));

718

float d47 = *((__global float *)(src_addr + 39 * src_stride_z));

719

720

float d50 = *((__global float *)(src_addr + 40 * src_stride_z));

721

float d51 = *((__global float *)(src_addr + 41 * src_stride_z));

722

float d52 = *((__global float *)(src_addr + 42 * src_stride_z));

723

float d53 = *((__global float *)(src_addr + 43 * src_stride_z));

724

float d54 = *((__global float *)(src_addr + 44 * src_stride_z));

725

float d55 = *((__global float *)(src_addr + 45 * src_stride_z));

726

float d56 = *((__global float *)(src_addr + 46 * src_stride_z));

727

float d57 = *((__global float *)(src_addr + 47 * src_stride_z));

728

729

float d60 = *((__global float *)(src_addr + 48 * src_stride_z));

730

float d61 = *((__global float *)(src_addr + 49 * src_stride_z));

731

float d62 = *((__global float *)(src_addr + 50 * src_stride_z));

732

float d63 = *((__global float *)(src_addr + 51 * src_stride_z));

733

float d64 = *((__global float *)(src_addr + 52 * src_stride_z));

734

float d65 = *((__global float *)(src_addr + 53 * src_stride_z));

735

float d66 = *((__global float *)(src_addr + 54 * src_stride_z));

736

float d67 = *((__global float *)(src_addr + 55 * src_stride_z));

737

738

float d70 = *((__global float *)(src_addr + 56 * src_stride_z));

739

float d71 = *((__global float *)(src_addr + 57 * src_stride_z));

740

float d72 = *((__global float *)(src_addr + 58 * src_stride_z));

741

float d73 = *((__global float *)(src_addr + 59 * src_stride_z));

742

float d74 = *((__global float *)(src_addr + 60 * src_stride_z));

743

float d75 = *((__global float *)(src_addr + 61 * src_stride_z));

744

float d76 = *((__global float *)(src_addr + 62 * src_stride_z));

745

float d77 = *((__global float *)(src_addr + 63 * src_stride_z));

746

747

// Compute the 8x4 intermediate tensor

748

float4 comm_fact0, comm_fact1, comm_fact2;

749

float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

750

751

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

752

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

753

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

754

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

755

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

756

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

757

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

758

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

759

760

// Compute the 4x4 output tile

761

comm_fact0 = tmp_col1 + tmp_col2;

762

comm_fact1 = tmp_col3 + tmp_col4;

763

comm_fact2 = tmp_col5 + tmp_col6;

764

765

float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;

766

float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;

767

768

comm_fact0 = tmp_col1 - tmp_col2;

769

comm_fact1 = tmp_col3 - tmp_col4;

770

comm_fact2 = tmp_col5 - tmp_col6;

771

772

float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;

773

float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;

774

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

775

#if defined(HAS_BIAS)

776

// Add bias

777

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

778

779

float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));

780

781

out_col0 += (float4)b;

782

out_col1 += (float4)b;

783

out_col2 += (float4)b;

784

out_col3 += (float4)b;

785

#endif // defined(HAS_BIAS)

786

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

787

// Store the output tile

788

vstore4((float4)(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), 0, (__global float *)(dst_addr + 0 * dst_stride_y));

789

vstore4((float4)(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), 0, (__global float *)(dst_addr + 1 * dst_stride_y));

790

vstore4((float4)(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), 0, (__global float *)(dst_addr + 2 * dst_stride_y));

791

vstore4((float4)(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), 0, (__global float *)(dst_addr + 3 * dst_stride_y));

792

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

793

}

794

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

795

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

796

*

797

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

798

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

799

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

800

* @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

801

* @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

802

*

803

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

804

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

805

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

806

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

807

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

808

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

809

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

810

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

811

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

812

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

813

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

814

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

815

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

816

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

817

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

818

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

819

*/

820

__kernel void winograd_output_transform_4x4_5x5_nhwc(

821

TENSOR3D_DECLARATION(src),

822

TENSOR3D_DECLARATION(dst),

823

#if defined(HAS_BIAS)

824

VECTOR_DECLARATION(bias),

825

#endif // defined(HAS_BIAS)

826

int dst_size)

827

{

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

828

// Each thread stores a 4x4/4x1 or 1x4 tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

829

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

830

831

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

832

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

833

int y_in = get_global_id(1);

834

int x_out = get_global_id(0);

835

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

836

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

837

838

// Load the values across the channels to compose the input tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

839

float d00 = *((__global float *)(src_addr + 0 * src_stride_z));

840

float d01 = *((__global float *)(src_addr + 1 * src_stride_z));

841

float d02 = *((__global float *)(src_addr + 2 * src_stride_z));

842

float d03 = *((__global float *)(src_addr + 3 * src_stride_z));

843

float d04 = *((__global float *)(src_addr + 4 * src_stride_z));

844

float d05 = *((__global float *)(src_addr + 5 * src_stride_z));

845

float d06 = *((__global float *)(src_addr + 6 * src_stride_z));

846

float d07 = *((__global float *)(src_addr + 7 * src_stride_z));

847

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

848

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

849

// Compute out00, out01, out02 and out03

850

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

851

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

852

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

853

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

854

855

#if defined(HAS_BIAS)

856

// Add bias

857

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

858

Giorgio Arena

ea55f91

2018-07-12 15:41:35 +0100

[diff] [blame]

859

float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

#endif // defined(HAS_BIAS)

866

867

// Store the output tile

868

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

869

// Get output address

870

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);

871

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

872

873

*(__global float *)(dst_ptr + offset.s0) = out00;

874

*(__global float *)(dst_ptr + offset.s1) = out01;

875

*(__global float *)(dst_ptr + offset.s2) = out02;

876

*(__global float *)(dst_ptr + offset.s3) = out03;

877

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

878

// Get output address

879

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;

880

881

*(__global float *)(dst_ptr + 0 * dst_stride_y + offset) = out00;

882

*(__global float *)(dst_ptr + 1 * dst_stride_y + offset) = out01;

883

*(__global float *)(dst_ptr + 2 * dst_stride_y + offset) = out02;

884

*(__global float *)(dst_ptr + 3 * dst_stride_y + offset) = out03;

885

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

886

887

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

888

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

889

float d10 = *((__global float *)(src_addr + 8 * src_stride_z));

890

float d11 = *((__global float *)(src_addr + 9 * src_stride_z));

891

float d12 = *((__global float *)(src_addr + 10 * src_stride_z));

892

float d13 = *((__global float *)(src_addr + 11 * src_stride_z));

893

float d14 = *((__global float *)(src_addr + 12 * src_stride_z));

894

float d15 = *((__global float *)(src_addr + 13 * src_stride_z));

895

float d16 = *((__global float *)(src_addr + 14 * src_stride_z));

896

float d17 = *((__global float *)(src_addr + 15 * src_stride_z));

897

898

float d20 = *((__global float *)(src_addr + 16 * src_stride_z));

899

float d21 = *((__global float *)(src_addr + 17 * src_stride_z));

900

float d22 = *((__global float *)(src_addr + 18 * src_stride_z));

901

float d23 = *((__global float *)(src_addr + 19 * src_stride_z));

902

float d24 = *((__global float *)(src_addr + 20 * src_stride_z));

903

float d25 = *((__global float *)(src_addr + 21 * src_stride_z));

904

float d26 = *((__global float *)(src_addr + 22 * src_stride_z));

905

float d27 = *((__global float *)(src_addr + 23 * src_stride_z));

906

907

float d30 = *((__global float *)(src_addr + 24 * src_stride_z));

908

float d31 = *((__global float *)(src_addr + 25 * src_stride_z));

909

float d32 = *((__global float *)(src_addr + 26 * src_stride_z));

910

float d33 = *((__global float *)(src_addr + 27 * src_stride_z));

911

float d34 = *((__global float *)(src_addr + 28 * src_stride_z));

912

float d35 = *((__global float *)(src_addr + 29 * src_stride_z));

913

float d36 = *((__global float *)(src_addr + 30 * src_stride_z));

914

float d37 = *((__global float *)(src_addr + 31 * src_stride_z));

915

916

float d40 = *((__global float *)(src_addr + 32 * src_stride_z));

917

float d41 = *((__global float *)(src_addr + 33 * src_stride_z));

918

float d42 = *((__global float *)(src_addr + 34 * src_stride_z));

919

float d43 = *((__global float *)(src_addr + 35 * src_stride_z));

920

float d44 = *((__global float *)(src_addr + 36 * src_stride_z));

921

float d45 = *((__global float *)(src_addr + 37 * src_stride_z));

922

float d46 = *((__global float *)(src_addr + 38 * src_stride_z));

923

float d47 = *((__global float *)(src_addr + 39 * src_stride_z));

924

925

float d50 = *((__global float *)(src_addr + 40 * src_stride_z));

926

float d51 = *((__global float *)(src_addr + 41 * src_stride_z));

927

float d52 = *((__global float *)(src_addr + 42 * src_stride_z));

928

float d53 = *((__global float *)(src_addr + 43 * src_stride_z));

929

float d54 = *((__global float *)(src_addr + 44 * src_stride_z));

930

float d55 = *((__global float *)(src_addr + 45 * src_stride_z));

931

float d56 = *((__global float *)(src_addr + 46 * src_stride_z));

932

float d57 = *((__global float *)(src_addr + 47 * src_stride_z));

933

934

float d60 = *((__global float *)(src_addr + 48 * src_stride_z));

935

float d61 = *((__global float *)(src_addr + 49 * src_stride_z));

936

float d62 = *((__global float *)(src_addr + 50 * src_stride_z));

937

float d63 = *((__global float *)(src_addr + 51 * src_stride_z));

938

float d64 = *((__global float *)(src_addr + 52 * src_stride_z));

939

float d65 = *((__global float *)(src_addr + 53 * src_stride_z));

940

float d66 = *((__global float *)(src_addr + 54 * src_stride_z));

941

float d67 = *((__global float *)(src_addr + 55 * src_stride_z));

942

943

float d70 = *((__global float *)(src_addr + 56 * src_stride_z));

944

float d71 = *((__global float *)(src_addr + 57 * src_stride_z));

945

float d72 = *((__global float *)(src_addr + 58 * src_stride_z));

946

float d73 = *((__global float *)(src_addr + 59 * src_stride_z));

947

float d74 = *((__global float *)(src_addr + 60 * src_stride_z));

948

float d75 = *((__global float *)(src_addr + 61 * src_stride_z));

949

float d76 = *((__global float *)(src_addr + 62 * src_stride_z));

950

float d77 = *((__global float *)(src_addr + 63 * src_stride_z));

951

952

// Compute the 8x4 intermediate tensor

953

float4 comm_fact0, comm_fact1, comm_fact2;

954

float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

955

956

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

957

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

958

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

959

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

960

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

961

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

962

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

963

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

964

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

965

// Compute the output tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

966

comm_fact0 = tmp_col1 + tmp_col2;

967

comm_fact1 = tmp_col3 + tmp_col4;

968

comm_fact2 = tmp_col5 + tmp_col6;

969

970

float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;

971

float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;

972

973

comm_fact0 = tmp_col1 - tmp_col2;

974

comm_fact1 = tmp_col3 - tmp_col4;

975

comm_fact2 = tmp_col5 - tmp_col6;

976

977

float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;

978

float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;

979

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

980

#if defined(HAS_BIAS)

981

// Add bias

982

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

983

Giorgio Arena

ea55f91

2018-07-12 15:41:35 +0100

[diff] [blame]

984

float b = (float) * ((__global float *)(vector_offset(&bias, x_out)));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

985

986

out_col0 += (float4)b;

987

out_col1 += (float4)b;

988

out_col2 += (float4)b;

989

out_col3 += (float4)b;

990

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

991

// Get output address

992

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);

993

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

994

int4 mult_y = min(dst_size - offset, 1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

995

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

996

// Store the output tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

997

*(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0) = out_col0.s0;

998

*(__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0) = out_col1.s0;

999

*(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0) = out_col2.s0;

1000

*(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0) = out_col3.s0;

Giorgio Arena

2018-07-18 11:45:30 +0100

[diff] [blame]

1001

*(__global float *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1) = out_col0.s1;

1002

*(__global float *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1) = out_col1.s1;

1003

*(__global float *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1) = out_col2.s1;

1004

*(__global float *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1) = out_col3.s1;

1005

*(__global float *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2) = out_col0.s2;

1006

*(__global float *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2) = out_col1.s2;

1007

*(__global float *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2) = out_col2.s2;

1008

*(__global float *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2) = out_col3.s2;

1009

*(__global float *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3) = out_col0.s3;

1010

*(__global float *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3) = out_col1.s3;

1011

*(__global float *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3) = out_col2.s3;

1012

*(__global float *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3) = out_col3.s3;

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1013

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

1014

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1015

1016

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1017

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW

1018

*

1019

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1020

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1021

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1022

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1023

*

1024

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1025

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1026

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1027

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1028

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1029

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1030

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1031

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1032

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1033

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1034

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1035

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1036

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1037

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1038

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1039

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1040

*/

1041

__kernel void winograd_output_transform_2x1_3x1_nchw(

1042

TENSOR3D_DECLARATION(src),

1043

TENSOR3D_DECLARATION(dst)

1044

#if defined(HAS_BIAS)

1045

,

1046

VECTOR_DECLARATION(bias)

1047

#endif // defined(HAS_BIAS)

1048

)

1049

{

1050

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1066

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1072

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW

1077

*

1078

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1079

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1080

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1081

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1082

*

1083

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1084

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1085

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1086

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1087

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1088

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1089

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1090

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1091

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1092

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1093

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1094

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1095

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1096

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1097

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1098

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1099

*/

1100

__kernel void winograd_output_transform_4x1_3x1_nchw(

1101

TENSOR3D_DECLARATION(src),

1102

TENSOR3D_DECLARATION(dst)

1103

#if defined(HAS_BIAS)

1104

,

1105

VECTOR_DECLARATION(bias)

1106

#endif // defined(HAS_BIAS)

1107

)

1108

{

1109

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1125

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1131

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW

1136

*

1137

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1138

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1139

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1140

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1141

*

1142

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1143

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1144

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1145

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1146

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1147

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1148

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1149

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1150

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1151

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1152

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1153

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1154

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1155

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1156

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1157

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1158

*/

1159

__kernel void winograd_output_transform_4x1_5x1_nchw(

1160

TENSOR3D_DECLARATION(src),

1161

TENSOR3D_DECLARATION(dst)

1162

#if defined(HAS_BIAS)

1163

,

1164

VECTOR_DECLARATION(bias)

1165

#endif // defined(HAS_BIAS)

1166

)

1167

{

1168

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1184

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1190

#endif // defined(HAS_BIAS)

1191

);

1192

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1193

1194

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

1195

*

1196

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1197

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1198

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1199

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1200

*

1201

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1202

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1203

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1204

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1205

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1206

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1207

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1208

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1209

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1210

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1211

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1212

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1213

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1214

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1215

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1216

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1217

*/

1218

__kernel void winograd_output_transform_4x1_3x1_nhwc(

1219

TENSOR3D_DECLARATION(src),

1220

TENSOR3D_DECLARATION(dst),

1221

#if defined(HAS_BIAS)

1222

VECTOR_DECLARATION(bias),

1223

#endif // defined(HAS_BIAS)

1224

int dst_size)

1225

{

1226

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

1242

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1247

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC

1252

*

1253

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1254

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1255

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1256

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1257

*

1258

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1259

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1260

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1261

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1262

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1263

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1264

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1265

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1266

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1267

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1268

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1269

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1270

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1271

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1272

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1273

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1274

*/

1275

__kernel void winograd_output_transform_4x1_5x1_nhwc(

1276

TENSOR3D_DECLARATION(src),

1277

TENSOR3D_DECLARATION(dst),

1278

#if defined(HAS_BIAS)

1279

VECTOR_DECLARATION(bias),

1280

#endif // defined(HAS_BIAS)

1281

int dst_size)

1282

{

1283

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

1299

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1304

#endif // defined(HAS_BIAS)

1305

dst_size);

1306

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1307

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1308

1309

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1310

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW

1311

*

1312

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1313

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1314

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1315

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1316

*

1317

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1318

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1319

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1320

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1321

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1322

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1323

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1324

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1325

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1326

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1327

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1328

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1329

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1330

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1331

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1332

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1333

*/

1334

__kernel void winograd_output_transform_1x2_1x3_nchw(

1335

TENSOR3D_DECLARATION(src),

1336

TENSOR3D_DECLARATION(dst)

1337

#if defined(HAS_BIAS)

1338

,

1339

VECTOR_DECLARATION(bias)

1340

#endif // defined(HAS_BIAS)

1341

)

1342

{

1343

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1359

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1365

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW

1370

*

1371

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1372

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1373

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1374

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1375

*

1376

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1377

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1378

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1379

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1380

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1381

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1382

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1383

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1384

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1385

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1386

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1387

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1388

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1389

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1390

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1391

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1392

*/

1393

__kernel void winograd_output_transform_1x4_1x3_nchw(

1394

TENSOR3D_DECLARATION(src),

1395

TENSOR3D_DECLARATION(dst)

1396

#if defined(HAS_BIAS)

1397

,

1398

VECTOR_DECLARATION(bias)

1399

#endif // defined(HAS_BIAS)

1400

)

1401

{

1402

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1418

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1424

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW

1429

*

1430

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1431

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1432

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1433

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1434

*

1435

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1436

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1437

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1438

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1439

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1440

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1441

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1442

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1443

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1444

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1445

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1446

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1447

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1448

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1449

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1450

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1451

*/

1452

__kernel void winograd_output_transform_1x4_1x5_nchw(

1453

TENSOR3D_DECLARATION(src),

1454

TENSOR3D_DECLARATION(dst)

1455

#if defined(HAS_BIAS)

1456

,

1457

VECTOR_DECLARATION(bias)

1458

#endif // defined(HAS_BIAS)

1459

)

1460

{

1461

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1477

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1483

#endif // defined(HAS_BIAS)

1484

);

1485

}

Gian Marco Iodice

2018-07-06 12:59:28 +0100

[diff] [blame]

1486

1487

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

1488

*

1489

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1490

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1491

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1492

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1493

*

1494

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1495

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1496

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1497

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1498

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1499

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1500

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1501

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1502

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1503

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1504

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1505

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1506

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1507

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1508

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1509

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1510

*/

1511

__kernel void winograd_output_transform_1x4_1x3_nhwc(

1512

TENSOR3D_DECLARATION(src),

1513

TENSOR3D_DECLARATION(dst),

1514

#if defined(HAS_BIAS)

1515

VECTOR_DECLARATION(bias),

1516

#endif // defined(HAS_BIAS)

1517

int dst_size)

1518

{

1519

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

1535

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1540

#endif // defined(HAS_BIAS)

dst_size);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC

1545

*

1546

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1547

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1548

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1549

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1550

*

1551

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1552

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1553

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1554

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1555

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1556

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1557

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1558

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1559

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1560

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1561

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1562

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1563

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1564

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1565

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1566

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1567

*/

1568

__kernel void winograd_output_transform_1x4_1x5_nhwc(

1569

TENSOR3D_DECLARATION(src),

1570

TENSOR3D_DECLARATION(dst),

1571

#if defined(HAS_BIAS)

1572

VECTOR_DECLARATION(bias),

1573

#endif // defined(HAS_BIAS)

1574

int dst_size)

1575

{

1576

winograd_output_transform_4x4_5x5_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

1592

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

1597

#endif // defined(HAS_BIAS)

1598

dst_size);

1599

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1600

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena