Blame - src/core/CL/cl_kernels/depthwise_convolution.cl - ml/ComputeLibrary

__global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *

354

(DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

355

356

// Load the weights

357

float3 weights_values0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));

358

float3 weights_values1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));

359

float3 weights_values2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

360

361

pixels = convolution3x3(src_addr, src_stride_y,

362

weights_values0.s0, weights_values0.s1, weights_values0.s2,

363

weights_values1.s0, weights_values1.s1, weights_values1.s2,

364

weights_values2.s0, weights_values2.s1, weights_values2.s2);

365

#if defined(HAS_BIAS)

366

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

367

368

float bias = *((__global float *)(vector_offset(&biases, channel)));

369

370

pixels += (float2)bias;

371

#endif //defined(HAS_BIAS)

372

373

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global float *)dst.ptr);

374

}

375

#endif //defined(CONV_STRIDE_X)

376

377

#if(DILATION_X > 1 || DILATION_Y > 1)

378

379

/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for F32

380

*

381

* @param[in] src_addr Pointer to the starting position of where to perform the convolution

382

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

383

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

384

* @param[in] y_offset Offset from the source tensor from which to start convolution

385

* @param[in] weights_addr Pointer from where to get weights

386

* @param[in] weights_stride_y Stride of weights tesnsor in Y dimension

387

*/

388

inline float2 convolution_3x3_dilation_stridex1_stridey1_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,

389

const int y_offset, __global uchar *weights_addr, const int weights_stride_y)

390

{

391

// Load the weights

392

float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));

393

float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));

394

float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

395

396

float2 pixels0 = 0.0f;

397

398

float2 src00_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0

399

float2 src00_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));

400

float2 src00_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));

401

402

float2 src10_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1

403

float2 src10_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));

404

float2 src10_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));

405

406

float2 src20_left = vload2(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2

407

float2 src20_mid = vload2(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));

408

float2 src20_right = vload2(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));

409

410

CONVOLUTION1x3_2X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);

411

CONVOLUTION1x3_2X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);

412

CONVOLUTION1x3_2X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);

return pixels0;

}

/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F32

418

*

419

* @param[in] src_addr Pointer to the starting position of where to perform the convolution

420

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

421

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

422

* @param[in] y_offset Offset from the source tensor from which to start convolution

423

* @param[in] weights_addr Pointer from where to get weights

424

* @param[in] weights_stride_y Stride of weights tesnsor in Y dimension

425

*/

426

inline float2 convolution_3x3_dilation_stridex2_stridey2_f32(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,

427

const int y_offset, __global uchar *weights_addr, const int weights_stride_y)

428

{

429

// Load the weights

430

float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));

431

float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));

432

float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

433

434

float2 pixels0 = 0.0f;

435

436

float3 src00_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0

437

float3 src00_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));

438

float3 src00_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));

439

440

float3 src10_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1

441

float3 src10_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));

442

float3 src10_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));

443

444

float3 src20_left = vload3(0, (__global float *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2

445

float3 src20_mid = vload3(0, (__global float *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));

446

float3 src20_right = vload3(0, (__global float *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));

447

448

CONVOLUTION1x3_2X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);

449

CONVOLUTION1x3_2X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);

450

CONVOLUTION1x3_2X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);

return pixels0;

}

#endif /* (DILATION_X > 1 || DILATION_Y > 1) */

456

457

/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both

458

* stride_x and stride_y are equal to 1

459

*

460

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

461

* @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.

462

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively

463

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size

464

*

465

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

466

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

467

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

468

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

469

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

470

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

471

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

472

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

473

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32

474

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

475

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

476

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

477

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

478

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

479

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

480

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

481

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32

482

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

483

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

484

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

485

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

486

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

487

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

488

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector

489

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32

490

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

491

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

492

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

493

*/

494

__kernel void depthwise_convolution_3x3_stridex1_stridey1_f32(

495

TENSOR3D_DECLARATION(src),

496

TENSOR3D_DECLARATION(dst),

497

TENSOR3D_DECLARATION(weights)

498

#if defined(HAS_BIAS)

499

,

500

VECTOR_DECLARATION(biases)

501

#endif //defined(HAS_BIAS)

502

)

503

{

504

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

505

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

506

507

float2 pixels0 = 0.0f;

508

float2 pixels1 = 0.0f;

509

float2 pixels2 = 0.0f;

510

float2 pixels3 = 0.0f;

511

512

// Extract channel and linearized batch indices

513

const int channel = get_global_id(2) % DST_CHANNELS;

514

const int batch = get_global_id(2) / DST_CHANNELS;

515

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

516

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

517

__global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *

518

(DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

519

520

#if(DILATION_X == 1 && DILATION_Y == 1)

521

// Load the weights

522

float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));

523

float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));

524

float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

525

526

// Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor

527

float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0

528

float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1

529

float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2

530

float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3

531

float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4

532

float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5

533

534

CONVOLUTION1x3_2X1_STRIDE1(pixels0, src00, weights_row0);

535

CONVOLUTION1x3_2X1_STRIDE1(pixels0, src10, weights_row1);

536

CONVOLUTION1x3_2X1_STRIDE1(pixels0, src20, weights_row2);

537

CONVOLUTION1x3_2X1_STRIDE1(pixels1, src10, weights_row0);

538

CONVOLUTION1x3_2X1_STRIDE1(pixels1, src20, weights_row1);

539

CONVOLUTION1x3_2X1_STRIDE1(pixels1, src30, weights_row2);

540

CONVOLUTION1x3_2X1_STRIDE1(pixels2, src20, weights_row0);

541

CONVOLUTION1x3_2X1_STRIDE1(pixels2, src30, weights_row1);

542

CONVOLUTION1x3_2X1_STRIDE1(pixels2, src40, weights_row2);

543

CONVOLUTION1x3_2X1_STRIDE1(pixels3, src30, weights_row0);

544

CONVOLUTION1x3_2X1_STRIDE1(pixels3, src40, weights_row1);

545

CONVOLUTION1x3_2X1_STRIDE1(pixels3, src50, weights_row2);

546

547

#else /* DILATION_X==1 && DILATION_Y==1 */

548

549

//3x3 Convolution of elements starting in 0th row

550

pixels0 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);

551

//3x3 Convolution of elements starting in 1st row

552

pixels1 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 1, weights_addr, weights_stride_y);

553

//3x3 Convolution of elements starting in 2nd row

554

pixels2 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);

555

//3x3 Convolution of elements starting in 3rd row

556

pixels3 = convolution_3x3_dilation_stridex1_stridey1_f32(src_addr, src_stride_x, src_stride_y, 3, weights_addr, weights_stride_y);

557

558

#endif /* DILATION_X==1 && DILATION_Y==1 */

559

560

#ifdef HAS_BIAS

561

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

562

563

float bias = *((__global float *)(vector_offset(&biases, channel)));

564

565

pixels0 += (float2)bias;

566

pixels1 += (float2)bias;

567

pixels2 += (float2)bias;

568

pixels3 += (float2)bias;

569

#endif /* defined(HAS_BIAS) */

570

571

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));

572

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));

573

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 2 * dst_stride_y));

574

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 3 * dst_stride_y));

575

}

576

577

/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both

578

* stride_x and stride_y are equal to 2

579

*

580

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

581

* @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float.

582

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively

583

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size

584

*

585

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

586

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

587

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

588

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

589

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

590

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

591

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

592

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

593

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32

594

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

595

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

596

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

597

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

598

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

599

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

600

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

601

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32

602

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

603

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

604

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

605

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

606

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

607

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

608

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector

609

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32

610

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

611

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

612

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

613

*/

614

__kernel void depthwise_convolution_3x3_stridex2_stridey2_f32(

615

TENSOR3D_DECLARATION(src),

616

TENSOR3D_DECLARATION(dst),

617

TENSOR3D_DECLARATION(weights)

618

#if defined(HAS_BIAS)

619

,

620

VECTOR_DECLARATION(biases)

621

#endif //defined(HAS_BIAS)

622

)

623

{

624

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

625

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

626

627

float2 pixels0 = 0.0f;

628

float2 pixels1 = 0.0f;

629

630

// Extract channel and linearized batch indices

631

const int channel = get_global_id(2) % DST_CHANNELS;

632

const int batch = get_global_id(2) / DST_CHANNELS;

633

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

634

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

635

__global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *

636

(DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

637

638

#if(DILATION_X == 1 && DILATION_Y == 1)

639

640

// Load the weights

641

float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));

642

float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));

643

float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

644

645

// Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor

646

float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0

647

float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0

648

float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1

649

float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1

650

float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2

651

float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2

652

float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3

653

float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3

654

float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4

655

float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4

656

657

CONVOLUTION1x3_2X1_STRIDE2(pixels0, src00, src01, weights_row0);

658

CONVOLUTION1x3_2X1_STRIDE2(pixels0, src10, src11, weights_row1);

659

CONVOLUTION1x3_2X1_STRIDE2(pixels0, src20, src21, weights_row2);

660

CONVOLUTION1x3_2X1_STRIDE2(pixels1, src20, src21, weights_row0);

661

CONVOLUTION1x3_2X1_STRIDE2(pixels1, src30, src31, weights_row1);

662

CONVOLUTION1x3_2X1_STRIDE2(pixels1, src40, src41, weights_row2);

663

664

#else /* DILATION_X==1 && DILATION_Y==1 */

665

666

//3x3 Convolution of elements starting in 0th row

667

pixels0 = convolution_3x3_dilation_stridex2_stridey2_f32(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);

668

//3x3 Convolution of elements starting in 2nd row

669

pixels1 = convolution_3x3_dilation_stridex2_stridey2_f32(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);

670

#endif /* DILATION_X==1 && DILATION_Y==1 */

671

672

#ifdef HAS_BIAS

673

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

674

675

float bias = *((__global float *)(vector_offset(&biases, channel)));

676

677

pixels0 += (float2)bias;

678

pixels1 += (float2)bias;

679

#endif /* defined(HAS_BIAS) */

680

681

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 0 * dst_stride_y));

682

vstore2(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global float *)(dst.ptr + 1 * dst_stride_y));

683

}

684

685

#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F32)

686

687

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)

688

#if defined(CONV_STRIDE_X)

689

#if CONV_STRIDE_X == 1

690

#define convolution1x3_f16 convolution1x3_stride_1_f16

691

#elif CONV_STRIDE_X == 2

692

#define convolution1x3_f16 convolution1x3_stride_2_f16

693

#elif CONV_STRIDE_X == 3

694

#define convolution1x3_f16 convolution1x3_stride_3_f16

695

#else /* CONV_STRIDE_X */

696

#error "Stride not supported"

697

#endif /* CONV_STRIDE_X */

698

699

#if(DILATION_X > 1 || DILATION_Y > 1)

700

701

/** Perform 3x3 convolution for stride_x=1 and stride_y=1 when DILATION_X>1 or DILATION_Y>1 for f16

702

*

703

* @param[in] src_addr Pointer to the starting position of where to perform the convolution

704

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

705

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

706

* @param[in] y_offset Offset from the source tensor from which to start convolution

707

* @param[in] weights_addr Pointer from where to get weights

708

* @param[in] weights_stride_y Stride of weights tesnsor in Y dimension

709

*/

710

inline half4 convolution_3x3_dilation_stridex1_stridey1_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,

711

const int y_offset, __global uchar *weights_addr, const int weights_stride_y)

712

{

713

// Load the weights

714

half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));

715

half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));

716

half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

717

718

half4 pixels0 = 0.0f;

719

720

half4 src00_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0

721

half4 src00_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));

722

half4 src00_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));

723

724

half4 src10_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1

725

half4 src10_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));

726

half4 src10_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));

727

728

half4 src20_left = vload4(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2

729

half4 src20_mid = vload4(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));

730

half4 src20_right = vload4(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));

731

732

CONVOLUTION1x3_4X1_STRIDE1(pixels0, src00_left, src00_mid, src00_right, weights_row0);

733

CONVOLUTION1x3_4X1_STRIDE1(pixels0, src10_left, src10_mid, src10_right, weights_row1);

734

CONVOLUTION1x3_4X1_STRIDE1(pixels0, src20_left, src20_mid, src20_right, weights_row2);

return pixels0;

}

/** Perform 3x3 convolution for stride_x=2 and stride_y=2 when DILATION_X>1 or DILATION_Y>1 for F16

740

*

741

* @param[in] src_addr Pointer to the starting position of where to perform the convolution

742

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

743

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

744

* @param[in] y_offset Offset from the source tensor from which to start convolution

745

* @param[in] weights_addr Pointer from where to get weights

746

* @param[in] weights_stride_y Stride of weights tesnsor in Y dimension

747

*/

748

inline half4 convolution_3x3_dilation_stridex2_stridey2_f16(__global uchar *src_addr, const int stride_x_bytes, const int stride_y_bytes,

749

const int y_offset, __global uchar *weights_addr, const int weights_stride_y)

750

{

751

// Load the weights

752

half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));

753

half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));

754

half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

755

756

half4 pixels0 = 0.0f;

757

758

half8 src00_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset, stride_x_bytes, stride_y_bytes)); // Row0

759

half8 src00_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));

760

half8 src00_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset, stride_x_bytes, stride_y_bytes));

761

762

half8 src10_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes)); // Row1

763

half8 src10_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));

764

half8 src10_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y, stride_x_bytes, stride_y_bytes));

765

766

half8 src20_left = vload8(0, (__global half *)ptr_offset(src_addr, 0, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes)); // Row2

767

half8 src20_mid = vload8(0, (__global half *)ptr_offset(src_addr, DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));

768

half8 src20_right = vload8(0, (__global half *)ptr_offset(src_addr, 2 * DILATION_X, y_offset + DILATION_Y * 2, stride_x_bytes, stride_y_bytes));

769

770

CONVOLUTION1x3_4X1_STRIDE2(pixels0, src00_left, src00_mid, src00_right, weights_row0);

771

CONVOLUTION1x3_4X1_STRIDE2(pixels0, src10_left, src10_mid, src10_right, weights_row1);

772

CONVOLUTION1x3_4X1_STRIDE2(pixels0, src20_left, src20_mid, src20_right, weights_row2);

return pixels0;

}

#endif // (DILATION_X > 1 && DILATION_Y > 1)

778

779

/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.

780

*

781

* @param[in] left_pixel Pointer to the left pixel.

782

* @param[in] left_coeff Weight of the left pixel

783

* @param[in] middle_coeff Weight of the middle pixel

784

* @param[in] right_coeff Weight of the right pixel

785

*

786

* @return a half4 containing 4 convoluted values.

787

*/

788

inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,

789

const half left_coeff,

790

const half middle_coeff,

791

const half right_coeff)

792

{

793

#if(DILATION_X == 1 && DILATION_Y == 1)

794

795

half8 temp = vload8(0, (__global half *)left_pixel);

796

797

half4 left = CONVERT(temp.s0123, half4);

798

half4 middle = CONVERT(temp.s1234, half4);

799

half4 right = CONVERT(temp.s2345, half4);

800

801

return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;

802

#else /* DILATION_X==1 && DILATION_Y==1 */

803

return vload4(0, (__global half *)left_pixel) * (half4)left_coeff

804

+ vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff

805

+ vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;

806

807

#endif /* DILATION_X==1 && DILATION_Y==1 */

808

}

809

810

/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.

811

*

812

* @param[in] left_pixel Pointer to the left pixel.

813

* @param[in] left_coeff Weight of the left pixel

814

* @param[in] middle_coeff Weight of the middle pixel

815

* @param[in] right_coeff Weight of the right pixel

816

*

817

* @return a half4 containing 4 convoluted values.

818

*/

819

inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,

820

const half left_coeff,

821

const half middle_coeff,

822

const half right_coeff)

823

{

824

#if(DILATION_X == 1 && DILATION_Y == 1)

825

826

half8 temp0 = vload8(0, (__global half *)left_pixel);

827

half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));

828

829

half4 left = CONVERT(temp0.s0246, half4);

830

half4 middle = CONVERT(temp0.s1357, half4);

831

half4 right = CONVERT((half4)(temp0.s246, temp1), half4);

832

833

return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;

834

#else /* DILATION_X==1 && DILATION_Y==1 */

835

836

__global half *left_pixel_float = (__global half *)left_pixel;

837

838

return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff

839

+ (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff

840

+ (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;

841

842

#endif /* DILATION_X==1 && DILATION_Y==1 */

843

}

844

845

/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.

846

*

847

* @param[in] left_pixel Pointer to the left pixel.

848

* @param[in] left_coeff Weight of the left pixel

849

* @param[in] middle_coeff Weight of the middle pixel

850

* @param[in] right_coeff Weight of the right pixel

851

*

852

* @return a half4 containing 4 convoluted values.

853

*/

854

inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,

855

const half left_coeff,

856

const half middle_coeff,

857

const half right_coeff)

858

{

859

#if(DILATION_X == 1 && DILATION_Y == 1)

860

861

half16 temp0 = vload16(0, (__global half *)left_pixel);

862

863

half4 left = CONVERT(temp0.s0369, half4);

864

half4 middle = CONVERT(temp0.s147A, half4);

865

half4 right = CONVERT(temp0.s258B, half4);

866

867

return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;

868

#else /* DILATION_X==1 && DILATION_Y==1 */

869

870

__global half *left_pixel_float = (__global half *)left_pixel;

871

872

return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff

873

+ (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff

874

+ (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;

875

876

#endif /* DILATION_X==1 && DILATION_Y==1 */

877

}

878

879

/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.

880

*

881

* Convolution matrix layout:

882

*

883

* [ mat0, mat1, mat2 ]\n

884

* [ mat3, mat4, mat5 ]\n

885

* [ mat6, mat7, mat8 ]\n

886

*

887

* @param[in] src A pointer to source Image structure

888

* @param[in] mat0 Coefficient from the convolution matrix

889

* @param[in] mat1 Coefficient from the convolution matrix

890

* @param[in] mat2 Coefficient from the convolution matrix

891

* @param[in] mat3 Coefficient from the convolution matrix

892

* @param[in] mat4 Coefficient from the convolution matrix

893

* @param[in] mat5 Coefficient from the convolution matrix

894

* @param[in] mat6 Coefficient from the convolution matrix

895

* @param[in] mat0 Coefficient from the convolution matrix

896

* @param[in] mat7 Coefficient from the convolution matrix

897

* @param[in] mat8 Coefficient from the convolution matrix

898

*

899

* @return a half4 containing 4 convoluted values.

900

*/

901

inline half4 convolution3x3_f16(

902

__global uchar *src, uint src_stride_y,

903

const half mat0, const half mat1, const half mat2,

904

const half mat3, const half mat4, const half mat5,

905

const half mat6, const half mat7, const half mat8)

{

half4 pixels;

pixels = convolution1x3_f16(src, mat0, mat1, mat2);

910

pixels += convolution1x3_f16(src + DILATION_Y * src_stride_y, mat3, mat4, mat5);

911

pixels += convolution1x3_f16(src + DILATION_Y * 2 * src_stride_y, mat6, mat7, mat8);

return pixels;

}

#if defined(DEPTH_MULTIPLIER)

917

918

/** This OpenCL kernel computes the depthwise convolution 3x3

919

*

920

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

921

* @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.

922

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively

923

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size

924

*

925

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16

926

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

927

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

928

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

929

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

930

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

931

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

932

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

933

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

934

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

935

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

936

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

937

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

938

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

939

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

940

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

941

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

942

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

943

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

944

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

945

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

946

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

947

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

948

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector

949

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16

950

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

951

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

952

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

953

*/

954

__kernel void depthwise_convolution_3x3_f16(

955

TENSOR3D_DECLARATION(src),

956

TENSOR3D_DECLARATION(dst),

957

TENSOR3D_DECLARATION(weights)

958

#if defined(HAS_BIAS)

959

,

960

VECTOR_DECLARATION(biases)

961

#endif //defined(HAS_BIAS)

962

)

963

{

964

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

965

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

966

#if defined(HAS_BIAS)

967

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

968

#endif //defined(HAS_BIAS)

969

970

// Extract channel and linearized batch indices

971

const int channel = get_global_id(2) % DST_CHANNELS;

972

const int batch = get_global_id(2) / DST_CHANNELS;

973

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

974

__global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *

975

(DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

976

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

977

978

uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;

979

half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));

980

half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));

981

half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));

982

983

half4 pixels = convolution3x3_f16(src_addr, src_stride_y, weights_values0.s0, weights_values0.s1, weights_values0.s2,

984

weights_values1.s0, weights_values1.s1, weights_values1.s2,

985

weights_values2.s0, weights_values2.s1, weights_values2.s2);

986

#if defined(HAS_BIAS)

987

pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));

988

#endif //defined(HAS_BIAS)

989

990

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels, A_VAL, B_VAL), 0, (__global half *)dst.ptr);

991

}

992

#endif // defined(DEPTH_MULTIPLIER)

993

#endif // defined(CONV_STRIDE_X)

994

995

/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3

996

* when both stride_x and stride_y are equal to 1

997

*

998

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

999

* @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.

1000

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively

1001

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size

1002

*

1003

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16

1004

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1005

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1006

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1007

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1008

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1009

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

1010

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1011

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1012

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1013

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1014

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1015

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1016

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1017

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1018

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1019

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

1020

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1021

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1022

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1023

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1024

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1025

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

1026

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector

1027

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

1028

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1029

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1030

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1031

*/

1032

__kernel void depthwise_convolution_3x3_stridex1_stridey1_f16(

1033

TENSOR3D_DECLARATION(src),

1034

TENSOR3D_DECLARATION(dst),

1035

TENSOR3D_DECLARATION(weights)

1036

#if defined(HAS_BIAS)

1037

,

1038

VECTOR_DECLARATION(biases)

1039

#endif //defined(HAS_BIAS)

1040

)

1041

{

1042

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

1043

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

1044

1045

// Extract channel and linearized batch indices

1046

const int channel = get_global_id(2) % DST_CHANNELS;

1047

const int batch = get_global_id(2) / DST_CHANNELS;

1048

1049

#ifdef HAS_BIAS

1050

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

1051

1052

half bias = *((__global half *)(vector_offset(&biases, channel)));

1053

#endif /* defined(HAS_BIAS) */

1054

1055

half4 pixels0 = 0.0f;

1056

half4 pixels1 = 0.0f;

1057

half4 pixels2 = 0.0f;

1058

half4 pixels3 = 0.0f;

1059

1060

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

1061

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

1062

__global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *

1063

(DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

1064

1065

#if(DILATION_X == 1 && DILATION_Y == 1)

1066

// Load the weights

1067

half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));

1068

half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));

1069

half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

1070

1071

// Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor

1072

half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0

1073

half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1

1074

half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2

1075

half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3

1076

half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

1077

half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5

1078

1079

CONVOLUTION1x3_4X1_STRIDE1(pixels0, src00, weights_row0);

1080

CONVOLUTION1x3_4X1_STRIDE1(pixels0, src10, weights_row1);

1081

CONVOLUTION1x3_4X1_STRIDE1(pixels0, src20, weights_row2);

1082

CONVOLUTION1x3_4X1_STRIDE1(pixels1, src10, weights_row0);

1083

CONVOLUTION1x3_4X1_STRIDE1(pixels1, src20, weights_row1);

1084

CONVOLUTION1x3_4X1_STRIDE1(pixels1, src30, weights_row2);

1085

CONVOLUTION1x3_4X1_STRIDE1(pixels2, src20, weights_row0);

1086

CONVOLUTION1x3_4X1_STRIDE1(pixels2, src30, weights_row1);

1087

CONVOLUTION1x3_4X1_STRIDE1(pixels2, src40, weights_row2);

1088

CONVOLUTION1x3_4X1_STRIDE1(pixels3, src30, weights_row0);

1089

CONVOLUTION1x3_4X1_STRIDE1(pixels3, src40, weights_row1);

1090

CONVOLUTION1x3_4X1_STRIDE1(pixels3, src50, weights_row2);

1091

1092

#else /* DILATION_X==1 && DILATION_Y==1 */

1093

1094

//3x3 Convolution of elements starting in 0th row

1095

pixels0 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);

1096

//3x3 Convolution of elements starting in 1st row

1097

pixels1 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 1, weights_addr, weights_stride_y);

1098

//3x3 Convolution of elements starting in 2nd row

1099

pixels2 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);

1100

//3x3 Convolution of elements starting in 3rd row

1101

pixels3 = convolution_3x3_dilation_stridex1_stridey1_f16(src_addr, src_stride_x, src_stride_y, 3, weights_addr, weights_stride_y);

1102

1103

#endif /* DILATION_X==1 && DILATION_Y==1 */

1104

1105

#ifdef HAS_BIAS

1106

pixels0 += (half4)bias;

1107

pixels1 += (half4)bias;

1108

pixels2 += (half4)bias;

1109

pixels3 += (half4)bias;

1110

#endif /* defined(HAS_BIAS) */

1111

1112

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));

1113

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));

1114

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels2, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 2 * dst_stride_y));

1115

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels3, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 3 * dst_stride_y));

1116

}

1117

1118

/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3

1119

* when both stride_x and stride_y are equal to 2

1120

*

1121

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

1122

* @note If activation function is enabled, the data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types: half.

1123

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively

1124

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size

1125

*

1126

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16

1127

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1128

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1129

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1130

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1131

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1132

* @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)

1133

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1134

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1135

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1136

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1137

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1138

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1139

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1140

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1141

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1142

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

1143

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1144

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1145

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1146

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1147

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1148

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

1149

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector

1150

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

1151

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1152

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1153

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1154

*/

1155

__kernel void depthwise_convolution_3x3_stridex2_stridey2_f16(

1156

TENSOR3D_DECLARATION(src),

1157

TENSOR3D_DECLARATION(dst),

1158

TENSOR3D_DECLARATION(weights)

1159

#if defined(HAS_BIAS)

1160

,

1161

VECTOR_DECLARATION(biases)

1162

#endif //defined(HAS_BIAS)

1163

)

1164

{

1165

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

1166

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

1167

1168

// Extract channel and linearized batch indices

1169

const int channel = get_global_id(2) % DST_CHANNELS;

1170

const int batch = get_global_id(2) / DST_CHANNELS;

1171

1172

#ifdef HAS_BIAS

1173

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

1174

1175

half bias = *((__global half *)(vector_offset(&biases, channel)));

1176

#endif /* defined(HAS_BIAS) */

1177

1178

half4 pixels0 = 0.0f;

1179

half4 pixels1 = 0.0f;

1180

1181

// Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

1182

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

1183

__global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) *

1184

(DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

1185

1186

#if(DILATION_X == 1 && DILATION_Y == 1)

1187

1188

// Load the weights

1189

half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));

1190

half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));

1191

half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

1192

1193

// Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor

1194

half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0

1195

half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0

1196

half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1

1197

half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1

1198

half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2

1199

half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2

1200

half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3

1201

half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3

1202

half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

1203

half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

1204

1205

CONVOLUTION1x3_4X1_STRIDE2(pixels0, src00, src01, weights_row0);

1206

CONVOLUTION1x3_4X1_STRIDE2(pixels0, src10, src11, weights_row1);

1207

CONVOLUTION1x3_4X1_STRIDE2(pixels0, src20, src21, weights_row2);

1208

CONVOLUTION1x3_4X1_STRIDE2(pixels1, src20, src21, weights_row0);

1209

CONVOLUTION1x3_4X1_STRIDE2(pixels1, src30, src31, weights_row1);

1210

CONVOLUTION1x3_4X1_STRIDE2(pixels1, src40, src41, weights_row2);

1211

1212

#else /* DILATION_X==1 && DILATION_Y==1 */

1213

//3x3 Convolution of elements starting in 0th row

1214

pixels0 = convolution_3x3_dilation_stridex2_stridey2_f16(src_addr, src_stride_x, src_stride_y, 0, weights_addr, weights_stride_y);

1215

//3x3 Convolution of elements starting in 2nd row

1216

pixels1 = convolution_3x3_dilation_stridex2_stridey2_f16(src_addr, src_stride_x, src_stride_y, 2, weights_addr, weights_stride_y);

1217

#endif /* DILATION_X==1 && DILATION_Y==1 */

1218

1219

#ifdef HAS_BIAS

1220

pixels0 += (half4)bias;

1221

pixels1 += (half4)bias;

1222

#endif /* defined(HAS_BIAS) */

1223

1224

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels0, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 0 * dst_stride_y));

1225

vstore4(ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, pixels1, A_VAL, B_VAL), 0, (__global half *)(dst.ptr + 1 * dst_stride_y));

1226

}

1227

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) && defined(IS_F16)

1228

1229

#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)

1230

/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped

1231

*

1232

* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float

1233

* @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)

1234

* @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)

1235

* @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)

1236

* @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)

1237

* @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)

1238

* @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)

1239

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1240

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

1241

* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)

1242

* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)

1243

* @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE

1244

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

1245

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively

1246

*

1247

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

1248

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1249

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1250

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1251

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1252

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1253

* @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)

1254

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1255

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1256

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1257

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr

1258

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1259

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1260

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1261

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1262

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1263

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1264

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1265

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1266

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1267

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32

1268

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1269

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1270

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1271

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1272

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1273

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

1274

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1275

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr

1276

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1277

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1278

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1279

*/

1280

__kernel void dwc_MxN_native_fp_nhwc(

1281

TENSOR4D_DECLARATION(src),

1282

TENSOR4D_DECLARATION(dst),

1283

TENSOR3D_DECLARATION(weights)

1284

#if defined(HAS_BIAS)

1285

,

1286

VECTOR_DECLARATION(biases)

1287

#endif // defined(HAS_BIAS)

1288

)

1289

{

1290

int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0) * sizeof(DATA_TYPE);

1291

1292

int x = get_global_id(0); // channels

1293

int y = get_global_id(1); // spatial coordinate x

1294

#if defined(DST_DEPTH)

1295

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1296

int b = get_global_id(2) / (int)DST_DEPTH; // batch

1297

#else // defined(DST_DEPTH)

1298

int z = get_global_id(2); // spatial coordinate y

1299

#endif // defined(DST_DEPTH)

1300

1301

__global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs;

1302

1303

__global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;

1304

1305

__global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;

1306

1307

#if defined(HAS_BIAS)

1308

__global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * (int)DEPTH_MULTIPLIER;

1309

#endif // defined(HAS_BIAS)

1310

1311

#if defined(DST_DEPTH)

1312

s_addr += b * src_stride_w;

1313

d_addr += b * dst_stride_w;

1314

#endif // defined(DST_DEPTH)

1315

1316

for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)

1317

{

1318

// Each work-item computes N0x1x1 elements

1319

VEC_DATA_TYPE(DATA_TYPE, N0)

1320

res0 = 0;

1321

1322

int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;

1323

int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;

1324

1325

for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)

1326

{

1327

if(y_coord >= 0 && y_coord < SRC_DIM2)

1328

{

1329

int x_coord_tmp = x_coord;

1330

1331

for(int xk = 0; xk < KERNEL_WIDTH; ++xk)

1332

{

1333

if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)

1334

{

1335

int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;

1336

int w_offset = xk * weights_stride_y + yk * weights_stride_z;

1337

1338

// Load input and weights values

1339

VEC_DATA_TYPE(DATA_TYPE, N0)

1340

i = VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset));

1341

VEC_DATA_TYPE(DATA_TYPE, N0)

1342

w = VLOAD(N0)(0, (__global DATA_TYPE *)(w_addr + w_offset));

1343

1344

#if GPU_ARCH == GPU_ARCH_MIDGARD

1345

res0 += i * w;

1346

#else // GPU_ARCH == GPU_ARCH_MIDGARD

1347

res0 = fma(i, w, res0);

1348

#endif // GPU_ARCH == GPU_ARCH_MIDGARD

1349

}

1350

x_coord_tmp += DILATION_X;

1351

}

1352

}

1353

y_coord += DILATION_Y;

1354

}

1355

1356

#if defined(HAS_BIAS)

1357

res0 += VLOAD(N0)(0, (__global DATA_TYPE *)(b_addr));

1358

#endif // defined(HAS_BIAS)

1359

1360

res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, res0, A_VAL, B_VAL);

1361

1362

STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)

1363

1364

w_addr += sizeof(DATA_TYPE);

1365

d_addr += sizeof(DATA_TYPE);

1366

#if defined(HAS_BIAS)

1367

b_addr += sizeof(DATA_TYPE);

1368

#endif // defined(HAS_BIAS)

1369

}

1370

}

1371

#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DATA_TYPE) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(VEC_SIZE_LEFTOVER)

1372

1373

#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)

1374

1375

#if DATA_TYPE != float || DATA_TYPE != half

1376

#error "Unsupported data type"

1377

#endif // DATA_TYPE != float || DATA_TYPE != half

1378

1379

#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

1380

1381

#define FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond) \

1382

({ \

1383

basename##0 = select(basename##0, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s0)); \

1384

basename##1 = select(basename##1, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s1)); \

1385

basename##2 = select(basename##2, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s2)); \

1386

})

1387

1388

#define FILL_ZERO_OUT_OF_BOUND_4(data_type, vec_size, basename, cond) \

1389

({ \

1390

FILL_ZERO_OUT_OF_BOUND_3(data_type, vec_size, basename, cond); \

1391

basename##3 = select(basename##3, (VEC_DATA_TYPE(data_type, vec_size))0, (SELECT_VEC_DATA_TYPE(data_type, vec_size))((cond).s3)); \

1392

})

1393

1394

#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

1395

1396

/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.

1397

*

1398

* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float

1399

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

1400

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

1401

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1402

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

1403

* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)

1404

* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)

1405

* @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1

1406

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

1407

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively

1408

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size

1409

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

1410

* @note In case of biases, -DHAS_BIAS must to be passed at compile

1411

* @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)

1412

*

1413

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

1414

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1415

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1416

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1417

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1418

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1419

* @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)

1420

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1421

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1422

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1423

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr

1424

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1425

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1426

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1427

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1428

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1429

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1430

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1431

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1432

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1433

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32

1434

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1435

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1436

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1437

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1438

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1439

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

1440

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1441

* @param[in] max_offset Max offset for the input tensor

1442

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr

1443

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1444

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1445

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1446

*/

1447

__kernel void depthwise_convolution_3x3_nhwc(

1448

TENSOR4D_DECLARATION(src),

1449

TENSOR4D_DECLARATION(dst),

1450

TENSOR3D_DECLARATION(weights)

1451

#if defined(HAS_BIAS)

1452

,

1453

VECTOR_DECLARATION(biases)

1454

#endif /* defined(HAS_BIAS) */

1455

)

1456

{

1457

int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);

1458

int y = get_global_id(1); // spatial coordinate x

1459

#if defined(DST_DEPTH)

1460

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1461

int b = get_global_id(2) / (int)DST_DEPTH; // batch

1462

#else // defined(DST_DEPTH)

1463

int z = get_global_id(2); // spatial coordinate y

1464

#endif // defined(DST_DEPTH)

1465

1466

__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;

1467

1468

#if defined(DST_DEPTH)

1469

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;

1470

#else /* defined(DST_DEPTH) */

1471

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;

1472

#endif /* defined(DST_DEPTH) */

1473

1474

int3 src_coord_y = (int3)(y * CONV_STRIDE_X - CONV_PAD_LEFT) + (int3)(0, DILATION_X, 2 * DILATION_X);

1475

int3 src_coord_z = (int3)(z * CONV_STRIDE_Y - CONV_PAD_TOP) + (int3)(0, DILATION_Y, 2 * DILATION_Y);

1476

1477

int3 src_offset_y = clamp(src_coord_y, (int3)0, (int3)(SRC_DIM_1 - 1));

1478

int3 src_offset_z = clamp(src_coord_z, (int3)0, (int3)(SRC_DIM_2 - 1));

1479

1480

// Use these vectors to check whether the unclamped load would have been out of bounds

1481

src_coord_y = (src_offset_y != src_coord_y);

1482

src_coord_z = (src_offset_z != src_coord_z);

1483

1484

src_offset_y *= (int3)src_stride_y;

1485

src_offset_z *= (int3)src_stride_z;

1486

1487

// We compute VEC_SIZEx1x1 [C,W,H] elements

VEC_FLOAT acc0 = 0;

// Load weights

VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));

1492

VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));

1493

VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));

1494

VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));

1495

VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));

1496

VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));

1497

VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));

1498

VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));

1499

VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));

// Load input values

// z == 0

VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));

1504

VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));

1505

VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));

1506

1507

FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s0);

1508

1509

acc0 = fma(values0, w0, acc0);

1510

acc0 = fma(values1, w1, acc0);

1511

acc0 = fma(values2, w2, acc0);

1512

1513

// z == 1

1514

values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));

1515

values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));

1516

values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));

1517

1518

FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s1);

1519

1520

acc0 = fma(values0, w3, acc0);

1521

acc0 = fma(values1, w4, acc0);

1522

acc0 = fma(values2, w5, acc0);

1523

1524

// z == 2

1525

values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));

1526

values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));

1527

values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));

1528

1529

FILL_ZERO_OUT_OF_BOUND_3(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int3)src_coord_z.s2);

1530

1531

acc0 = fma(values0, w6, acc0);

1532

acc0 = fma(values1, w7, acc0);

1533

acc0 = fma(values2, w8, acc0);

1534

1535

#if defined(HAS_BIAS)

1536

__global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;

1537

VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);

1538

acc0 += bias_values;

1539

#endif // defined(HAS_BIAS)

1540

1541

#if defined(DST_DEPTH)

1542

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z + b * dst_stride_w;

1543

#else /* defined(DST_DEPTH) */

1544

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + y * dst_step_y + z * dst_step_z;

1545

#endif /* defined(DST_DEPTH) */

1546

1547

acc0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);

1548

STORE_VECTOR_SELECT(acc, DATA_TYPE, dst_addr, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)

1549

}

1550

#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

1551

1552

#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

1553

/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.

1554

*

1555

* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float

1556

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

1557

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM_2=112)

1558

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)

1559

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)

1560

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1561

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

1562

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

1563

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively

1564

* @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size

1565

* @note The size of the partial store block in y must be passed at compile time using -DPARTIAL_STORE_M0 (e.g. -DPARTIAL_STORE_M0=1)

1566

* @note The size of the partial store block in x must be passed at compile time using -DPARTIAL_STORE_N0 (e.g. -DPARTIAL_STORE_N0=1)

1567

* @note The size of the output's second dimension must be passed at compile time using -DDST_DIM_1 (e.g. -DDST_DIM_1=64)

1568

* @note The size of the output's third dimension must be passed at compile time using -DDST_DIM_2 (e.g. -DDST_DIM_2=32)

1569

* @note In case of biases, -DHAS_BIAS must to be passed at compile

1570

* @note If the output tensor has more than three dimensions, its third dimension must be passed at compile time using -DDST_DEPTH (e.g. -DDST_DEPTH=32)

1571

*

1572

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

1573

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1574

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1575

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1576

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1577

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1578

* @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)

1579

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1580

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1581

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1582

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr

1583

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1584

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1585

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1586

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1587

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1588

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

1589

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1590

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1591

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1592

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F16/F32

1593

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1594

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1595

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1596

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1597

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1598

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

1599

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1600

* @param[in] max_offset Max offset for the input tensor

1601

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr

1602

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1603

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1604

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1605

*/

1606

__kernel void depthwise_convolution_3x3_nhwc_stride1(

1607

TENSOR4D_DECLARATION(src),

1608

TENSOR4D_DECLARATION(dst),

1609

TENSOR3D_DECLARATION(weights)

1610

#if defined(HAS_BIAS)

1611

,

1612

VECTOR_DECLARATION(biases)

1613

#endif /* defined(HAS_BIAS) */

1614

)

1615

{

1616

int x_offset = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - PARTIAL_STORE_N0) % VEC_SIZE), 0) * sizeof(DATA_TYPE);

1617

int y = get_global_id(1); // spatial coordinate x

1618

#if defined(DST_DEPTH)

1619

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1620

int b = get_global_id(2) / (int)DST_DEPTH; // batch

1621

#else // defined(DST_DEPTH)

1622

int z = get_global_id(2); // spatial coordinate y

1623

#endif // defined(DST_DEPTH)

1624

1625

__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offset;

1626

1627

#if defined(DST_DEPTH)

1628

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset + b * src_stride_w;

1629

#else /* defined(DST_DEPTH) */

1630

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offset;

1631

#endif /* defined(DST_DEPTH) */

1632

1633

int4 src_coord_y = (int4)(y * NUM_ROWS_PROCESSED - CONV_PAD_LEFT) + V_OFFS4(int);

1634

int4 src_coord_z = (int4)(z * NUM_PLANES_PROCESSED - CONV_PAD_TOP) + V_OFFS4(int);

1635

1636

int4 src_offset_y = clamp(src_coord_y, (int4)0, (int4)(SRC_DIM_1 - 1));

1637

int4 src_offset_z = clamp(src_coord_z, (int4)0, (int4)(SRC_DIM_2 - 1));

1638

1639

// Use these vectors to check whether the unclamped load would have been out of bounds

1640

src_coord_y = (src_offset_y != src_coord_y);

1641

src_coord_z = (src_offset_z != src_coord_z);

1642

1643

src_offset_y *= (int4)src_stride_y;

1644

src_offset_z *= (int4)src_stride_z;

1645

1646

// We compute VEC_SIZEx2x2 [C,W,H] elements

VEC_FLOAT acc0 = 0;

VEC_FLOAT acc1 = 0;

VEC_FLOAT acc2 = 0;

VEC_FLOAT acc3 = 0;

// Load weights

VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 0 * weights_stride_z));

1654

VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 0 * weights_stride_z));

1655

VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 0 * weights_stride_z));

1656

VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 1 * weights_stride_z));

1657

VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 1 * weights_stride_z));

1658

VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 1 * weights_stride_z));

1659

VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 0 * weights_stride_y + 2 * weights_stride_z));

1660

VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 1 * weights_stride_y + 2 * weights_stride_z));

1661

VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights_addr + 2 * weights_stride_y + 2 * weights_stride_z));

// Load input values

// z == 0

VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s0));

1666

VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s1));

1667

VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s2));

1668

VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s0 + src_offset_y.s3));

1669

1670

FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s0);

1671

1672

acc0 = fma(values0, w0, acc0);

1673

acc0 = fma(values1, w1, acc0);

1674

acc0 = fma(values2, w2, acc0);

1675

acc1 = fma(values1, w0, acc1);

1676

acc1 = fma(values2, w1, acc1);

1677

acc1 = fma(values3, w2, acc1);

1678

1679

// z == 1

1680

values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s0));

1681

values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s1));

1682

values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s2));

1683

values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s1 + src_offset_y.s3));

1684

1685

FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s1);

1686

1687

acc0 = fma(values0, w3, acc0);

1688

acc0 = fma(values1, w4, acc0);

1689

acc0 = fma(values2, w5, acc0);

1690

acc1 = fma(values1, w3, acc1);

1691

acc1 = fma(values2, w4, acc1);

1692

acc1 = fma(values3, w5, acc1);

1693

1694

acc2 = fma(values0, w0, acc2);

1695

acc2 = fma(values1, w1, acc2);

1696

acc2 = fma(values2, w2, acc2);

1697

acc3 = fma(values1, w0, acc3);

1698

acc3 = fma(values2, w1, acc3);

1699

acc3 = fma(values3, w2, acc3);

1700

1701

// z == 2

1702

values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s0));

1703

values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s1));

1704

values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s2));

1705

values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s2 + src_offset_y.s3));

1706

1707

FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s2);

1708

1709

acc0 = fma(values0, w6, acc0);

1710

acc0 = fma(values1, w7, acc0);

1711

acc0 = fma(values2, w8, acc0);

1712

acc1 = fma(values1, w6, acc1);

1713

acc1 = fma(values2, w7, acc1);

1714

acc1 = fma(values3, w8, acc1);

1715

1716

acc2 = fma(values0, w3, acc2);

1717

acc2 = fma(values1, w4, acc2);

1718

acc2 = fma(values2, w5, acc2);

1719

acc3 = fma(values1, w3, acc3);

1720

acc3 = fma(values2, w4, acc3);

1721

acc3 = fma(values3, w5, acc3);

1722

1723

// z == 3

1724

values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s0));

1725

values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s1));

1726

values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s2));

1727

values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + src_offset_z.s3 + src_offset_y.s3));

1728

1729

FILL_ZERO_OUT_OF_BOUND_4(DATA_TYPE, VEC_SIZE, values, src_coord_y | (int4)src_coord_z.s3);

1730

1731

acc2 = fma(values0, w6, acc2);

1732

acc2 = fma(values1, w7, acc2);

1733

acc2 = fma(values2, w8, acc2);

1734

acc3 = fma(values1, w6, acc3);

1735

acc3 = fma(values2, w7, acc3);

1736

acc3 = fma(values3, w8, acc3);

1737

1738

#if defined(HAS_BIAS)

1739

__global uchar *biases_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offset;

1740

1741

VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases_addr);

acc0 += bias_values;

acc1 += bias_values;

acc2 += bias_values;

acc3 += bias_values;

#endif // defined(HAS_BIAS)

1748

1749

int2 dst_offset_y = min((int2)(y * NUM_ROWS_PROCESSED) + V_OFFS2(int), (int2)(DST_DIM_1 - 1)) * (int2)dst_stride_y;

1750

int dst_coord_z = z * NUM_PLANES_PROCESSED;

1751

1752

#if defined(DST_DEPTH)

1753

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z + b * dst_stride_w;

1754

#else // defined(DST_DEPTH)

1755

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offset + dst_coord_z * dst_stride_z;

1756

#endif // defined(DST_DEPTH)

1757

1758

/* Store vectors in reverse order along the Y. The Y offsets are calculated so that they are forced to be in bound.

1759

* If only the first address is in bound, the Y offset of the second address will be brought back and there will be 2 writes in the same location for the same thread.

1760

* Since the last vector to be written is always the valid one for that location, it overwrites the wrong values.

1761

*/

1762

values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc1, A_VAL, B_VAL);

1763

STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)

1764

1765

values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc0, A_VAL, B_VAL);

1766

STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)

1767

1768

#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1769

if((dst_coord_z + 1) < DST_DIM_2)

1770

#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1771

{

1772

values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc3, A_VAL, B_VAL);

1773

STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s1, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)

1774

1775

values0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, acc2, A_VAL, B_VAL);

1776

STORE_VECTOR_SELECT(values, DATA_TYPE, dst_addr + dst_stride_z + dst_offset_y.s0, VEC_SIZE, PARTIAL_STORE_N0, PARTIAL_STORE_N0 != 0 && get_global_id(0) == 0)

}

}

#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

1781

#endif // defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)