Blame - src/core/CL/cl_kernels/winograd_output_transform.cl - ml/ComputeLibrary

2018-07-04 17:03:33 +0100

[diff] [blame^]

354

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

355

*

356

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

357

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

358

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

359

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

360

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

361

*

362

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

363

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

364

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

365

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

366

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

367

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

368

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

369

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

370

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

371

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

372

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

373

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

374

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

375

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

376

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

377

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

378

* @param[in] dst_size Size of the destination tensor, minus the last padding

379

*/

380

__kernel void winograd_output_transform_4x4_3x3_nhwc(

381

TENSOR3D_DECLARATION(src),

382

TENSOR3D_DECLARATION(dst),

383

#if defined(HAS_BIAS)

384

VECTOR_DECLARATION(bias),

385

#endif // defined(HAS_BIAS)

386

int dst_size)

387

{

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

388

// Each thread stores a 4x4/4x1 or 1x4 tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

389

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

390

391

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

392

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

393

// Load the values across the 36 channels to compose the 6x6 or 6x1 tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

394

float d00 = *((__global float *)(src_addr + 0 * src_stride_z));

395

float d01 = *((__global float *)(src_addr + 1 * src_stride_z));

396

float d02 = *((__global float *)(src_addr + 2 * src_stride_z));

397

float d03 = *((__global float *)(src_addr + 3 * src_stride_z));

398

float d04 = *((__global float *)(src_addr + 4 * src_stride_z));

399

float d05 = *((__global float *)(src_addr + 5 * src_stride_z));

400

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

401

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

402

// Compute out00, out01, out02 and out03

403

float out00 = d00 + d01 + d02 + d03 + d04;

404

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04;

405

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04;

406

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05;

407

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

408

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

409

float d10 = *((__global float *)(src_addr + 6 * src_stride_z));

410

float d11 = *((__global float *)(src_addr + 7 * src_stride_z));

411

float d12 = *((__global float *)(src_addr + 8 * src_stride_z));

412

float d13 = *((__global float *)(src_addr + 9 * src_stride_z));

413

float d14 = *((__global float *)(src_addr + 10 * src_stride_z));

414

float d15 = *((__global float *)(src_addr + 11 * src_stride_z));

415

416

float d20 = *((__global float *)(src_addr + 12 * src_stride_z));

417

float d21 = *((__global float *)(src_addr + 13 * src_stride_z));

418

float d22 = *((__global float *)(src_addr + 14 * src_stride_z));

419

float d23 = *((__global float *)(src_addr + 15 * src_stride_z));

420

float d24 = *((__global float *)(src_addr + 16 * src_stride_z));

421

float d25 = *((__global float *)(src_addr + 17 * src_stride_z));

422

423

float d30 = *((__global float *)(src_addr + 18 * src_stride_z));

424

float d31 = *((__global float *)(src_addr + 19 * src_stride_z));

425

float d32 = *((__global float *)(src_addr + 20 * src_stride_z));

426

float d33 = *((__global float *)(src_addr + 21 * src_stride_z));

427

float d34 = *((__global float *)(src_addr + 22 * src_stride_z));

428

float d35 = *((__global float *)(src_addr + 23 * src_stride_z));

429

430

float d40 = *((__global float *)(src_addr + 24 * src_stride_z));

431

float d41 = *((__global float *)(src_addr + 25 * src_stride_z));

432

float d42 = *((__global float *)(src_addr + 26 * src_stride_z));

433

float d43 = *((__global float *)(src_addr + 27 * src_stride_z));

434

float d44 = *((__global float *)(src_addr + 28 * src_stride_z));

435

float d45 = *((__global float *)(src_addr + 29 * src_stride_z));

436

437

float d50 = *((__global float *)(src_addr + 30 * src_stride_z));

438

float d51 = *((__global float *)(src_addr + 31 * src_stride_z));

439

float d52 = *((__global float *)(src_addr + 32 * src_stride_z));

440

float d53 = *((__global float *)(src_addr + 33 * src_stride_z));

441

float d54 = *((__global float *)(src_addr + 34 * src_stride_z));

442

float d55 = *((__global float *)(src_addr + 35 * src_stride_z));

443

444

// Compute out00, out01, out02 and out03

445

float out00 = d01 + d21 + d41 + d11 + d31;

446

float out01 = d01 + d21 + d41 + d11 + d31;

447

float out02 = d01 + d21 + d41 + d11 + d31;

448

float out03 = d01 + d21 + d41 + d11 + d31;

449

450

float k0 = d03 + d04 + d13 + d14 + d23 + d24 + d33 + d34 + d43 + d44;

451

float k1 = 2.0f * d03 - 2.0f * d04 + 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 2.0f * d33 - 2.0f * d34 + 2.0f * d43 - 2.0f * d44;

452

453

out00 += k0 + d00 + d02 + d10 + d12 + d20 + d22 + d30 + d32 + d40 + d42;

454

out01 += k1 - d02 - d12 - d22 - d32 - d42;

455

out02 += 4.0f * k0 + d02 + d12 + d22 + d32 + d42;

456

out03 += 4.0f * k1 - d02 - d12 - d22 - d32 - d42 + d05 + d15 + d25 + d35 + d45;

457

458

// Compute out10, out11, out12 and out13

459

float out10 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

460

float out11 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

461

float out12 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

462

float out13 = d11 - d21 + 2.0f * d31 - 2.0f * d41;

463

464

k0 = d13 + d14 - d23 - d24 + 2.0f * d33 + 2.0f * d34 - 2.0f * d43 - 2.0f * d44;

465

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 4.0f * d33 - 4.0f * d34 - 4.0f * d43 + 4.0f * d44;

466

467

out10 += k0 + d10 + d12 - d20 - d22 + 2.0f * d30 + 2.0f * d32 - 2.0f * d40 - 2.0f * d42;

468

out11 += k1 - d12 + d22 - 2.0f * d32 + 2.0f * d42;

469

out12 += 4.0f * k0 + d12 - d22 + 2.0f * d32 - 2.0f * d42;

470

out13 += 4.0f * k1 - d12 + d15 + d22 - d25 - 2.0f * d32 + 2.0f * d35 + 2.0f * d42 - 2.0f * d45;

471

472

// Compute out20, out21, out22 and out23

473

float out20 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

474

float out21 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

475

float out22 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

476

float out23 = d11 + d21 + 4.0f * d31 + 4.0f * d41;

477

478

k0 = d13 + d14 + d23 + d24 + 4.0f * d33 + 4.0f * d34 + 4.0f * d43 + 4.0f * d44;

479

k1 = 2.0f * d13 - 2.0f * d14 + 2.0f * d23 - 2.0f * d24 + 8.0f * d33 - 8.0f * d34 + 8.0f * d43 - 8.0f * d44;

480

481

out20 += k0 + d10 + d12 + d20 + d22 + 4.0f * d30 + 4.0f * d32 + 4.0f * d40 + 4.0f * d42;

482

out21 += k1 - d12 - d22 - 4.0f * d32 - 4.0f * d42;

483

out22 += 4.0f * k0 + d12 + d22 + 4.0f * d32 + 4.0f * d42;

484

out23 += 4.0f * k1 - d12 + d15 - d22 + d25 - 4.0f * d32 + 4.0f * d35 - 4.0f * d42 + 4.0f * d45;

485

486

// Compute out30, out31, out32 and out33

487

float out30 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

488

float out31 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

489

float out32 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

490

float out33 = d11 - d21 + 8.0f * d31 - 8.0f * d41 + d51;

491

492

k0 = d13 + d14 - d23 - d24 + 8.0f * d33 + 8.0f * d34 - 8.0f * d43 - 8.0f * d44 + d53 + d54;

493

k1 = 2.0f * d13 - 2.0f * d14 - 2.0f * d23 + 2.0f * d24 + 16.0f * d33 - 16.0f * d34 - 16.0f * d43 + 16.0f * d44 + 2.0f * d53 - 2.0f * d54;

494

495

out30 += k0 + d10 + d12 - d20 - d22 + 8.0f * d30 + 8.0f * d32 - 8.0f * d40 - 8.0f * d42 + d50 + d52;

496

out31 += k1 - d12 + d22 - 8.0f * d32 + 8.0f * d42 - d52;

497

out32 += 4.0f * k0 + d12 - d22 + 8.0f * d32 - 8.0f * d42 + d52;

498

out33 += 4.0f * k1 - d12 + d15 + d22 - d25 - 8.0f * d32 + 8.0f * d35 + 8.0f * d42 - 8.0f * d45 - d52 + d55;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

499

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

500

501

int y_in = get_global_id(1);

502

int x_out = get_global_id(0);

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

503

int y_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

504

int z_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

505

506

#if defined(HAS_BIAS)

507

// Add bias

508

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

509

510

float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

516

#if !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

out10 += (float)b;

out11 += (float)b;

out12 += (float)b;

out13 += (float)b;

out20 += (float)b;

out21 += (float)b;

out22 += (float)b;

out23 += (float)b;

out30 += (float)b;

out31 += (float)b;

out32 += (float)b;

out33 += (float)b;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

531

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) & !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

532

533

#endif // defined(HAS_BIAS)

534

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

535

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

536

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);

537

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

538

539

// Store the 1x4 output tile

540

*((__global float *)(dst_ptr + offset.s0)) = out00;

541

*((__global float *)(dst_ptr + offset.s1)) = out01;

542

*((__global float *)(dst_ptr + offset.s2)) = out02;

543

*((__global float *)(dst_ptr + offset.s3)) = out03;

544

#elif defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

545

// Store the 4x1 output tile

546

int offset = dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;

547

int4 mult_y = min(dst_size - offset, 1);

548

549

*((__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset)) = out00;

550

*((__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset)) = out01;

551

*((__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset)) = out02;

552

*((__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset)) = out03;

553

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

554

// Get output address

555

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);

556

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

557

int4 mult_y = min(dst_size - offset, 1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

558

559

// Store the 4x4 output tile

560

*((__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0)) = out00;

561

*((__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0)) = out01;

562

*((__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0)) = out02;

563

*((__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0)) = out03;

564

*((__global float *)(dst_ptr + mult_y.s1 * 0 * dst_stride_y + offset.s1)) = out10;

565

*((__global float *)(dst_ptr + mult_y.s1 * 1 * dst_stride_y + offset.s1)) = out11;

566

*((__global float *)(dst_ptr + mult_y.s1 * 2 * dst_stride_y + offset.s1)) = out12;

567

*((__global float *)(dst_ptr + mult_y.s1 * 3 * dst_stride_y + offset.s1)) = out13;

568

*((__global float *)(dst_ptr + mult_y.s2 * 0 * dst_stride_y + offset.s2)) = out20;

569

*((__global float *)(dst_ptr + mult_y.s2 * 1 * dst_stride_y + offset.s2)) = out21;

570

*((__global float *)(dst_ptr + mult_y.s2 * 2 * dst_stride_y + offset.s2)) = out22;

571

*((__global float *)(dst_ptr + mult_y.s2 * 3 * dst_stride_y + offset.s2)) = out23;

572

*((__global float *)(dst_ptr + mult_y.s3 * 0 * dst_stride_y + offset.s3)) = out30;

573

*((__global float *)(dst_ptr + mult_y.s3 * 1 * dst_stride_y + offset.s3)) = out31;

574

*((__global float *)(dst_ptr + mult_y.s3 * 2 * dst_stride_y + offset.s3)) = out32;

575

*((__global float *)(dst_ptr + mult_y.s3 * 3 * dst_stride_y + offset.s3)) = out33;

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

576

577

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

578

}

579

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

580

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

581

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC

582

*

583

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

584

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

585

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

586

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

587

*

588

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

589

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

590

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

591

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

592

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

593

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

594

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

595

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

596

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

597

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

598

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

599

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

600

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

601

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

602

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

603

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

604

*/

605

__kernel void winograd_output_transform_4x1_3x1_nhwc(

606

TENSOR3D_DECLARATION(src),

607

TENSOR3D_DECLARATION(dst),

608

#if defined(HAS_BIAS)

609

VECTOR_DECLARATION(bias),

610

#endif // defined(HAS_BIAS)

611

int dst_size)

612

{

613

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

629

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

634

#endif // defined(HAS_BIAS)

635

dst_size);

636

}

637

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

638

639

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

640

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC

641

*

642

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

643

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

644

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

645

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

646

*

647

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

648

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

649

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

650

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

651

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

652

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

653

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

654

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

655

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

656

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

657

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

658

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

659

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

660

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

661

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

662

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

663

*/

664

__kernel void winograd_output_transform_1x4_1x3_nhwc(

665

TENSOR3D_DECLARATION(src),

666

TENSOR3D_DECLARATION(dst),

667

#if defined(HAS_BIAS)

668

VECTOR_DECLARATION(bias),

669

#endif // defined(HAS_BIAS)

670

int dst_size)

671

{

672

winograd_output_transform_4x4_3x3_nhwc(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes,

688

#if defined(HAS_BIAS)

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes,

693

#endif // defined(HAS_BIAS)

694

dst_size);

695

}

696

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

697

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

698

#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \

699

({ \

700

comm_fact.s0 = d1 + d2; \

701

comm_fact.s1 = d3 + d4; \

702

comm_fact.s2 = d5 + d6; \

703

\

704

col.s0 = comm_fact.s0 + comm_fact.s1 + 8.f * comm_fact.s2 + d0; \

705

col.s2 = comm_fact.s0 + 4.f * comm_fact.s1 + 2.f * comm_fact.s2; \

706

\

707

comm_fact.s0 = d1 - d2; \

708

comm_fact.s1 = d3 - d4; \

709

comm_fact.s2 = d5 - d6; \

710

\

711

col.s1 = comm_fact.s0 + 2.f * comm_fact.s1 + 4.f * comm_fact.s2; \

712

col.s3 = comm_fact.s0 + 8.f * comm_fact.s1 + comm_fact.s2 + d7; \

713

})

714

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

715

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NCHW

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

716

*

717

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

718

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

719

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

720

* @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

721

* @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

722

*

723

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

724

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

725

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

726

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

727

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

728

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

729

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

730

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

731

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

732

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

733

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

734

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

735

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

736

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

737

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

738

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

739

*/

740

__kernel void winograd_output_transform_4x4_5x5_nchw(

741

TENSOR3D_DECLARATION(src),

742

TENSOR3D_DECLARATION(dst)

743

#if defined(HAS_BIAS)

744

,

745

VECTOR_DECLARATION(bias)

746

#endif // defined(HAS_BIAS)

747

)

748

{

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

749

// Each thread stores a 4x4/4x1 or 1x4 tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

750

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

751

752

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

753

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

754

// Compute output address

755

int y_in = get_global_id(1);

756

int x_out = (y_in % NUM_TILES_X) * OUTPUT_TILE_W;

757

int y_out = (y_in / NUM_TILES_X) * OUTPUT_TILE_H;

758

int z_out = get_global_id(0);

759

760

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z;

761

762

// Load the values across the channels to compose the input tile

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

763

float d00 = *((__global float *)(src_addr + 0 * src_stride_z));

764

float d01 = *((__global float *)(src_addr + 1 * src_stride_z));

765

float d02 = *((__global float *)(src_addr + 2 * src_stride_z));

766

float d03 = *((__global float *)(src_addr + 3 * src_stride_z));

767

float d04 = *((__global float *)(src_addr + 4 * src_stride_z));

768

float d05 = *((__global float *)(src_addr + 5 * src_stride_z));

769

float d06 = *((__global float *)(src_addr + 6 * src_stride_z));

770

float d07 = *((__global float *)(src_addr + 7 * src_stride_z));

771

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

772

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

773

// Compute out00, out01, out02 and out03

774

float out00 = d00 + d01 + d02 + d03 + d04 + 8.0f * d05 + 8.0f * d06;

775

float out01 = d01 - d02 + 2.0f * d03 - 2.0f * d04 + 4.0f * d05 - 4.0f * d06;

776

float out02 = d01 + d02 + 4.0f * d03 + 4.0f * d04 + 2.0f * d05 + 2.0f * d06;

777

float out03 = d01 - d02 + 8.0f * d03 - 8.0f * d04 + d05 - d06 + d07;

778

779

#if defined(HAS_BIAS)

780

// Add bias

781

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

782

783

float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));

out00 += (float)b;

out01 += (float)b;

out02 += (float)b;

out03 += (float)b;

#endif // defined(HAS_BIAS)

790

791

// Store the output tile

792

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

793

*((__global float *)(dst_addr + 0 * dst_stride_y)) = out00;

794

*((__global float *)(dst_addr + 1 * dst_stride_y)) = out01;

795

*((__global float *)(dst_addr + 2 * dst_stride_y)) = out02;

796

*((__global float *)(dst_addr + 3 * dst_stride_y)) = out03;

797

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

798

vstore4((float4)(out00, out01, out02, out03), 0, (__global float *)(dst_addr));

799

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

800

801

#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-04 17:03:33 +0100

[diff] [blame^]

802

float d10 = *((__global float *)(src_addr + 8 * src_stride_z));

803

float d11 = *((__global float *)(src_addr + 9 * src_stride_z));

804

float d12 = *((__global float *)(src_addr + 10 * src_stride_z));

805

float d13 = *((__global float *)(src_addr + 11 * src_stride_z));

806

float d14 = *((__global float *)(src_addr + 12 * src_stride_z));

807

float d15 = *((__global float *)(src_addr + 13 * src_stride_z));

808

float d16 = *((__global float *)(src_addr + 14 * src_stride_z));

809

float d17 = *((__global float *)(src_addr + 15 * src_stride_z));

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

810

811

float d20 = *((__global float *)(src_addr + 16 * src_stride_z));

812

float d21 = *((__global float *)(src_addr + 17 * src_stride_z));

813

float d22 = *((__global float *)(src_addr + 18 * src_stride_z));

814

float d23 = *((__global float *)(src_addr + 19 * src_stride_z));

815

float d24 = *((__global float *)(src_addr + 20 * src_stride_z));

816

float d25 = *((__global float *)(src_addr + 21 * src_stride_z));

817

float d26 = *((__global float *)(src_addr + 22 * src_stride_z));

818

float d27 = *((__global float *)(src_addr + 23 * src_stride_z));

819

820

float d30 = *((__global float *)(src_addr + 24 * src_stride_z));

821

float d31 = *((__global float *)(src_addr + 25 * src_stride_z));

822

float d32 = *((__global float *)(src_addr + 26 * src_stride_z));

823

float d33 = *((__global float *)(src_addr + 27 * src_stride_z));

824

float d34 = *((__global float *)(src_addr + 28 * src_stride_z));

825

float d35 = *((__global float *)(src_addr + 29 * src_stride_z));

826

float d36 = *((__global float *)(src_addr + 30 * src_stride_z));

827

float d37 = *((__global float *)(src_addr + 31 * src_stride_z));

828

829

float d40 = *((__global float *)(src_addr + 32 * src_stride_z));

830

float d41 = *((__global float *)(src_addr + 33 * src_stride_z));

831

float d42 = *((__global float *)(src_addr + 34 * src_stride_z));

832

float d43 = *((__global float *)(src_addr + 35 * src_stride_z));

833

float d44 = *((__global float *)(src_addr + 36 * src_stride_z));

834

float d45 = *((__global float *)(src_addr + 37 * src_stride_z));

835

float d46 = *((__global float *)(src_addr + 38 * src_stride_z));

836

float d47 = *((__global float *)(src_addr + 39 * src_stride_z));

837

838

float d50 = *((__global float *)(src_addr + 40 * src_stride_z));

839

float d51 = *((__global float *)(src_addr + 41 * src_stride_z));

840

float d52 = *((__global float *)(src_addr + 42 * src_stride_z));

841

float d53 = *((__global float *)(src_addr + 43 * src_stride_z));

842

float d54 = *((__global float *)(src_addr + 44 * src_stride_z));

843

float d55 = *((__global float *)(src_addr + 45 * src_stride_z));

844

float d56 = *((__global float *)(src_addr + 46 * src_stride_z));

845

float d57 = *((__global float *)(src_addr + 47 * src_stride_z));

846

847

float d60 = *((__global float *)(src_addr + 48 * src_stride_z));

848

float d61 = *((__global float *)(src_addr + 49 * src_stride_z));

849

float d62 = *((__global float *)(src_addr + 50 * src_stride_z));

850

float d63 = *((__global float *)(src_addr + 51 * src_stride_z));

851

float d64 = *((__global float *)(src_addr + 52 * src_stride_z));

852

float d65 = *((__global float *)(src_addr + 53 * src_stride_z));

853

float d66 = *((__global float *)(src_addr + 54 * src_stride_z));

854

float d67 = *((__global float *)(src_addr + 55 * src_stride_z));

855

856

float d70 = *((__global float *)(src_addr + 56 * src_stride_z));

857

float d71 = *((__global float *)(src_addr + 57 * src_stride_z));

858

float d72 = *((__global float *)(src_addr + 58 * src_stride_z));

859

float d73 = *((__global float *)(src_addr + 59 * src_stride_z));

860

float d74 = *((__global float *)(src_addr + 60 * src_stride_z));

861

float d75 = *((__global float *)(src_addr + 61 * src_stride_z));

862

float d76 = *((__global float *)(src_addr + 62 * src_stride_z));

863

float d77 = *((__global float *)(src_addr + 63 * src_stride_z));

864

865

// Compute the 8x4 intermediate tensor

866

float4 comm_fact0, comm_fact1, comm_fact2;

867

float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

868

869

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

870

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

871

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

872

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

873

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

874

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

875

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

876

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

877

878

// Compute the 4x4 output tile

879

comm_fact0 = tmp_col1 + tmp_col2;

880

comm_fact1 = tmp_col3 + tmp_col4;

881

comm_fact2 = tmp_col5 + tmp_col6;

882

883

float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;

884

float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;

885

886

comm_fact0 = tmp_col1 - tmp_col2;

887

comm_fact1 = tmp_col3 - tmp_col4;

888

comm_fact2 = tmp_col5 - tmp_col6;

889

890

float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;

891

float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;

892

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

893

#if defined(HAS_BIAS)

894

// Add bias

895

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

896

897

float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));

898

899

out_col0 += (float4)b;

900

out_col1 += (float4)b;

901

out_col2 += (float4)b;

902

out_col3 += (float4)b;

903

#endif // defined(HAS_BIAS)

904

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

905

// Store the output tile

906

vstore4((float4)(out_col0.s0, out_col1.s0, out_col2.s0, out_col3.s0), 0, (__global float *)(dst_addr + 0 * dst_stride_y));

907

vstore4((float4)(out_col0.s1, out_col1.s1, out_col2.s1, out_col3.s1), 0, (__global float *)(dst_addr + 1 * dst_stride_y));

908

vstore4((float4)(out_col0.s2, out_col1.s2, out_col2.s2, out_col3.s2), 0, (__global float *)(dst_addr + 2 * dst_stride_y));

909

vstore4((float4)(out_col0.s3, out_col1.s3, out_col2.s3, out_col3.s3), 0, (__global float *)(dst_addr + 3 * dst_stride_y));

910

#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena

2018-07-02 13:42:23 +0100

[diff] [blame]

911

}

912

913

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, the filter size 5x5 and the data layout is NHWC

914

*

915

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

916

*

917

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

918

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

919

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

920

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

921

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

922

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

923

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

924

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

925

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

926

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

927

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

928

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

929

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

930

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

931

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

932

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

933

*/

934

__kernel void winograd_output_transform_4x4_5x5_nhwc(

935

TENSOR3D_DECLARATION(src),

936

TENSOR3D_DECLARATION(dst),

937

#if defined(HAS_BIAS)

938

VECTOR_DECLARATION(bias),

939

#endif // defined(HAS_BIAS)

940

int dst_size)

941

{

942

// Each thread stores a 4x4 tile

943

Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);

944

945

const __global uchar *src_addr = tensor3D_offset(&src, 0, 0, 0);

946

947

// Load the values across the 64 channels to compose the 8x8 input tile

948

float d00 = *((__global float *)(src_addr + 0 * src_stride_z));

949

float d01 = *((__global float *)(src_addr + 1 * src_stride_z));

950

float d02 = *((__global float *)(src_addr + 2 * src_stride_z));

951

float d03 = *((__global float *)(src_addr + 3 * src_stride_z));

952

float d04 = *((__global float *)(src_addr + 4 * src_stride_z));

953

float d05 = *((__global float *)(src_addr + 5 * src_stride_z));

954

float d06 = *((__global float *)(src_addr + 6 * src_stride_z));

955

float d07 = *((__global float *)(src_addr + 7 * src_stride_z));

956

957

float d10 = *((__global float *)(src_addr + 8 * src_stride_z));

958

float d11 = *((__global float *)(src_addr + 9 * src_stride_z));

959

float d12 = *((__global float *)(src_addr + 10 * src_stride_z));

960

float d13 = *((__global float *)(src_addr + 11 * src_stride_z));

961

float d14 = *((__global float *)(src_addr + 12 * src_stride_z));

962

float d15 = *((__global float *)(src_addr + 13 * src_stride_z));

963

float d16 = *((__global float *)(src_addr + 14 * src_stride_z));

964

float d17 = *((__global float *)(src_addr + 15 * src_stride_z));

965

966

float d20 = *((__global float *)(src_addr + 16 * src_stride_z));

967

float d21 = *((__global float *)(src_addr + 17 * src_stride_z));

968

float d22 = *((__global float *)(src_addr + 18 * src_stride_z));

969

float d23 = *((__global float *)(src_addr + 19 * src_stride_z));

970

float d24 = *((__global float *)(src_addr + 20 * src_stride_z));

971

float d25 = *((__global float *)(src_addr + 21 * src_stride_z));

972

float d26 = *((__global float *)(src_addr + 22 * src_stride_z));

973

float d27 = *((__global float *)(src_addr + 23 * src_stride_z));

974

975

float d30 = *((__global float *)(src_addr + 24 * src_stride_z));

976

float d31 = *((__global float *)(src_addr + 25 * src_stride_z));

977

float d32 = *((__global float *)(src_addr + 26 * src_stride_z));

978

float d33 = *((__global float *)(src_addr + 27 * src_stride_z));

979

float d34 = *((__global float *)(src_addr + 28 * src_stride_z));

980

float d35 = *((__global float *)(src_addr + 29 * src_stride_z));

981

float d36 = *((__global float *)(src_addr + 30 * src_stride_z));

982

float d37 = *((__global float *)(src_addr + 31 * src_stride_z));

983

984

float d40 = *((__global float *)(src_addr + 32 * src_stride_z));

985

float d41 = *((__global float *)(src_addr + 33 * src_stride_z));

986

float d42 = *((__global float *)(src_addr + 34 * src_stride_z));

987

float d43 = *((__global float *)(src_addr + 35 * src_stride_z));

988

float d44 = *((__global float *)(src_addr + 36 * src_stride_z));

989

float d45 = *((__global float *)(src_addr + 37 * src_stride_z));

990

float d46 = *((__global float *)(src_addr + 38 * src_stride_z));

991

float d47 = *((__global float *)(src_addr + 39 * src_stride_z));

992

993

float d50 = *((__global float *)(src_addr + 40 * src_stride_z));

994

float d51 = *((__global float *)(src_addr + 41 * src_stride_z));

995

float d52 = *((__global float *)(src_addr + 42 * src_stride_z));

996

float d53 = *((__global float *)(src_addr + 43 * src_stride_z));

997

float d54 = *((__global float *)(src_addr + 44 * src_stride_z));

998

float d55 = *((__global float *)(src_addr + 45 * src_stride_z));

999

float d56 = *((__global float *)(src_addr + 46 * src_stride_z));

1000

float d57 = *((__global float *)(src_addr + 47 * src_stride_z));

1001

1002

float d60 = *((__global float *)(src_addr + 48 * src_stride_z));

1003

float d61 = *((__global float *)(src_addr + 49 * src_stride_z));

1004

float d62 = *((__global float *)(src_addr + 50 * src_stride_z));

1005

float d63 = *((__global float *)(src_addr + 51 * src_stride_z));

1006

float d64 = *((__global float *)(src_addr + 52 * src_stride_z));

1007

float d65 = *((__global float *)(src_addr + 53 * src_stride_z));

1008

float d66 = *((__global float *)(src_addr + 54 * src_stride_z));

1009

float d67 = *((__global float *)(src_addr + 55 * src_stride_z));

1010

1011

float d70 = *((__global float *)(src_addr + 56 * src_stride_z));

1012

float d71 = *((__global float *)(src_addr + 57 * src_stride_z));

1013

float d72 = *((__global float *)(src_addr + 58 * src_stride_z));

1014

float d73 = *((__global float *)(src_addr + 59 * src_stride_z));

1015

float d74 = *((__global float *)(src_addr + 60 * src_stride_z));

1016

float d75 = *((__global float *)(src_addr + 61 * src_stride_z));

1017

float d76 = *((__global float *)(src_addr + 62 * src_stride_z));

1018

float d77 = *((__global float *)(src_addr + 63 * src_stride_z));

1019

1020

// Compute the 8x4 intermediate tensor

1021

float4 comm_fact0, comm_fact1, comm_fact2;

1022

float4 tmp_col0, tmp_col1, tmp_col2, tmp_col3, tmp_col4, tmp_col5, tmp_col6, tmp_col7;

1023

1024

COMPUTE_TMP_COL(tmp_col0, d00, d10, d20, d30, d40, d50, d60, d70, comm_fact0);

1025

COMPUTE_TMP_COL(tmp_col1, d01, d11, d21, d31, d41, d51, d61, d71, comm_fact0);

1026

COMPUTE_TMP_COL(tmp_col2, d02, d12, d22, d32, d42, d52, d62, d72, comm_fact0);

1027

COMPUTE_TMP_COL(tmp_col3, d03, d13, d23, d33, d43, d53, d63, d73, comm_fact0);

1028

COMPUTE_TMP_COL(tmp_col4, d04, d14, d24, d34, d44, d54, d64, d74, comm_fact0);

1029

COMPUTE_TMP_COL(tmp_col5, d05, d15, d25, d35, d45, d55, d65, d75, comm_fact0);

1030

COMPUTE_TMP_COL(tmp_col6, d06, d16, d26, d36, d46, d56, d66, d76, comm_fact0);

1031

COMPUTE_TMP_COL(tmp_col7, d07, d17, d27, d37, d47, d57, d67, d77, comm_fact0);

1032

1033

// Compute the 4x4 output tile

1034

comm_fact0 = tmp_col1 + tmp_col2;

1035

comm_fact1 = tmp_col3 + tmp_col4;

1036

comm_fact2 = tmp_col5 + tmp_col6;

1037

1038

float4 out_col0 = comm_fact0 + comm_fact1 + 8.f * comm_fact2 + tmp_col0;

1039

float4 out_col2 = comm_fact0 + 4.f * comm_fact1 + 2.f * comm_fact2;

1040

1041

comm_fact0 = tmp_col1 - tmp_col2;

1042

comm_fact1 = tmp_col3 - tmp_col4;

1043

comm_fact2 = tmp_col5 - tmp_col6;

1044

1045

float4 out_col1 = comm_fact0 + 2.f * comm_fact1 + 4.f * comm_fact2;

1046

float4 out_col3 = comm_fact0 + 8.f * comm_fact1 + comm_fact2 + tmp_col7;

1047

1048

int y_in = get_global_id(1);

1049

int x_out = get_global_id(0);

1050

int y_out = (y_in % NUM_TILES_X) * 4;

1051

int z_out = (y_in / NUM_TILES_X) * 4;

1052

1053

#if defined(HAS_BIAS)

1054

// Add bias

1055

Vector bias = CONVERT_TO_VECTOR_STRUCT_NO_STEP(bias);

1056

1057

float b = (float) * ((__global float *)(vector_offset(&bias, z_out)));

1058

1059

out_col0 += (float4)b;

1060

out_col1 += (float4)b;

1061

out_col2 += (float4)b;

1062

out_col3 += (float4)b;

1063

#endif // defined(HAS_BIAS)

1064

1065

// Get output address

1066

int4 offset = (int4)(dst_offset_first_element_in_bytes + x_out * sizeof(float) + y_out * dst_stride_y + z_out * dst_stride_z);

1067

offset = min(offset + (int4)(0, 1, 2, 3) * (int4)dst_stride_z, dst_size); // If address is beyond the last plane, clamp it to dst_size (which points to the last padding).

1068

int4 mult_y = min(dst_size - offset, 1); // If out of bound, we don't want to increase dst_stride_y, so we set the multiplier to 0. It will be 1 otherwise.

1069

1070

// Store the 4x4 output tile

1071

*(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s0) = out_col0.s0;

1072

*(__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s0) = out_col1.s0;

1073

*(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s0) = out_col2.s0;

1074

*(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s0) = out_col3.s0;

1075

*(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s1) = out_col0.s1;

1076

*(__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s1) = out_col1.s1;

1077

*(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s1) = out_col2.s1;

1078

*(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s1) = out_col3.s1;

1079

*(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s2) = out_col0.s2;

1080

*(__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s2) = out_col1.s2;

1081

*(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s2) = out_col2.s2;

1082

*(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s2) = out_col3.s2;

1083

*(__global float *)(dst_ptr + mult_y.s0 * 0 * dst_stride_y + offset.s3) = out_col0.s3;

1084

*(__global float *)(dst_ptr + mult_y.s0 * 1 * dst_stride_y + offset.s3) = out_col1.s3;

1085

*(__global float *)(dst_ptr + mult_y.s0 * 2 * dst_stride_y + offset.s3) = out_col2.s3;

1086

*(__global float *)(dst_ptr + mult_y.s0 * 3 * dst_stride_y + offset.s3) = out_col3.s3;

1087

}

Gian Marco Iodice

2018-07-03 12:22:09 +0100

[diff] [blame]

1088

1089

#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1090

/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 3x1 and the data layout is NCHW

1091

*

1092

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1093

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2

1094

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1095

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1096

*

1097

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1098

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1099

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1100

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1101

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1102

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1103

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1104

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1105

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1106

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1107

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1108

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1109

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1110

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1111

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1112

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1113

*/

1114

__kernel void winograd_output_transform_2x1_3x1_nchw(

1115

TENSOR3D_DECLARATION(src),

1116

TENSOR3D_DECLARATION(dst)

1117

#if defined(HAS_BIAS)

1118

,

1119

VECTOR_DECLARATION(bias)

1120

#endif // defined(HAS_BIAS)

1121

)

1122

{

1123

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1139

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1145

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NCHW

1150

*

1151

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1152

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1153

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1154

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1155

*

1156

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1157

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1158

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1159

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1160

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1161

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1162

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1163

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1164

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1165

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1166

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1167

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1168

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1169

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1170

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1171

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1172

*/

1173

__kernel void winograd_output_transform_4x1_3x1_nchw(

1174

TENSOR3D_DECLARATION(src),

1175

TENSOR3D_DECLARATION(dst)

1176

#if defined(HAS_BIAS)

1177

,

1178

VECTOR_DECLARATION(bias)

1179

#endif // defined(HAS_BIAS)

1180

)

1181

{

1182

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1198

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1204

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NCHW

1209

*

1210

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1211

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4

1212

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1

1213

* @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time

1214

*

1215

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1216

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1217

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1218

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1219

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1220

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1221

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1222

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1223

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1224

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1225

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1226

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1227

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1228

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1229

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1230

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1231

*/

1232

__kernel void winograd_output_transform_4x1_5x1_nchw(

1233

TENSOR3D_DECLARATION(src),

1234

TENSOR3D_DECLARATION(dst)

1235

#if defined(HAS_BIAS)

1236

,

1237

VECTOR_DECLARATION(bias)

1238

#endif // defined(HAS_BIAS)

1239

)

1240

{

1241

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1257

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1263

#endif // defined(HAS_BIAS)

1264

);

1265

}

1266

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)

1267

1268

#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

1269

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x3 and the data layout is NCHW

1270

*

1271

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1272

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1273

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2

1274

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1275

*

1276

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1277

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1278

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1279

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1280

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1281

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1282

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1283

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1284

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1285

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1286

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1287

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1288

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1289

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1290

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1291

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1292

*/

1293

__kernel void winograd_output_transform_1x2_1x3_nchw(

1294

TENSOR3D_DECLARATION(src),

1295

TENSOR3D_DECLARATION(dst)

1296

#if defined(HAS_BIAS)

1297

,

1298

VECTOR_DECLARATION(bias)

1299

#endif // defined(HAS_BIAS)

1300

)

1301

{

1302

winograd_output_transform_2x2_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1318

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1324

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NCHW

1329

*

1330

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1331

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1332

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1333

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1334

*

1335

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1336

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1337

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1338

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1339

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1340

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1341

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1342

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1343

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1344

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1345

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1346

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1347

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1348

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1349

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1350

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1351

*/

1352

__kernel void winograd_output_transform_1x4_1x3_nchw(

1353

TENSOR3D_DECLARATION(src),

1354

TENSOR3D_DECLARATION(dst)

1355

#if defined(HAS_BIAS)

1356

,

1357

VECTOR_DECLARATION(bias)

1358

#endif // defined(HAS_BIAS)

1359

)

1360

{

1361

winograd_output_transform_4x4_3x3_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1377

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1383

#endif // defined(HAS_BIAS)

);

}

/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NCHW

1388

*

1389

* @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16

1390

* @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1

1391

* @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4

1392

* @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time

1393

*

1394

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

1395

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1396

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1397

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1398

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1399

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1400

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1401

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1402

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1403

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1404

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1405

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1406

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1407

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1408

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1409

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1410

*/

1411

__kernel void winograd_output_transform_1x4_1x5_nchw(

1412

TENSOR3D_DECLARATION(src),

1413

TENSOR3D_DECLARATION(dst)

1414

#if defined(HAS_BIAS)

1415

,

1416

VECTOR_DECLARATION(bias)

1417

#endif // defined(HAS_BIAS)

1418

)

1419

{

1420

winograd_output_transform_4x4_5x5_nchw(src_ptr,

src_stride_x,

src_step_x,

src_stride_y,

src_step_y,

src_stride_z,

src_step_z,

src_offset_first_element_in_bytes,

dst_ptr,

dst_stride_x,

dst_step_x,

dst_stride_y,

dst_step_y,

dst_stride_z,

dst_step_z,

dst_offset_first_element_in_bytes

1436

#if defined(HAS_BIAS)

,

bias_ptr,

bias_stride_x,

bias_step_x,

bias_offset_first_element_in_bytes

1442

#endif // defined(HAS_BIAS)

1443

);

1444

}

1445

#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)

Giorgio Arena