Blame - src/core/CL/cl_kernels/depthwise_convolution.cl - ml/ComputeLibrary

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;

457

458

for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)

459

{

460

*((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;

461

}

Georgios Pinitas

81a26ad

2017-10-23 20:29:30 +0100

[diff] [blame]

462

463

#if defined(HAS_BIAS)

464

if(get_global_id(1) == 0)

465

{

466

*((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x));

467

}

468

#endif // defined(HAS_BIAS)

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

469

}

470

#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)

471

Georgios Pinitas

de5a1cc

2018-02-02 12:52:07 +0000

[diff] [blame]

472

#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE)

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

473

/** Reshapes the input tensor into the layout used to compute the depthwise convolution as a vector to matrix multiplication.
 *
 * Each work-item copies one KERNEL_WIDTH x KERNEL_HEIGHT patch of the source plane selected by get_global_id(2) into
 * consecutive destination elements. Patch positions that fall outside the source plane are written as PAD_VALUE.
 * When HAS_BIAS is defined, one extra element with value 1 is written right after the patch.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT
 * @note The value used for out-of-bounds positions must be passed at compile time using -DPAD_VALUE
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
{
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Horizontal extent (in source pixels) covered by the patch start positions of one output row
    const int padded_width = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
    const int row_span     = STRIDE_X * (((padded_width - KERNEL_WIDTH) / STRIDE_X) + 1);

    // Top-left corner of this work-item's patch, in source coordinates (negative inside the padding)
    const int linear_start = get_global_id(1) * STRIDE_X;
    const int first_x      = -PAD_LEFT + linear_start % row_span;
    const int first_y      = -PAD_TOP + linear_start / row_span * STRIDE_Y;
    const int plane        = get_global_id(2);

    __global uchar *plane_ptr  = src_ptr + src_offset_first_element_in_bytes + plane * src_stride_z;
    __global DATA_TYPE *out    = ((__global DATA_TYPE *)(dst.ptr));

    for(int ky = 0; ky < KERNEL_HEIGHT; ++ky)
    {
        const int y = first_y + ky;

        for(int kx = 0; kx < KERNEL_WIDTH; ++kx)
        {
            const int x = first_x + kx;

            const bool inside = (x >= 0) && (x < SRC_WIDTH) && (y >= 0) && (y < SRC_HEIGHT);
            if(inside)
            {
                *out = *((__global DATA_TYPE *)(plane_ptr + x * src_stride_x + y * src_stride_y));
            }
            else
            {
                *out = PAD_VALUE;
            }

            ++out;
        }
    }

#if defined(HAS_BIAS)
    // Extra element equal to 1, written right after the patch
    *out = (DATA_TYPE)(1);
#endif // defined(HAS_BIAS)
}

528

Georgios Pinitas

de5a1cc

2018-02-02 12:52:07 +0000

[diff] [blame]

529

#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE)

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

530

531

#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)

532

533

/** Reshapes the vector output of the depthwise generic convolution into a 3D tensor.
 *
 * Each work-item copies one source element to the destination position obtained by splitting get_global_id(0)
 * into a plane index and a row/column pair inside a CONV_WIDTH x CONV_HEIGHT plane.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depthwise_vector_to_tensor(
    VECTOR_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Vector src = CONVERT_TO_VECTOR_STRUCT(src);

    const int plane_elems = CONV_WIDTH * CONV_HEIGHT;
    const int linear_id   = get_global_id(0);

    // Split the linear index into (plane, row, column)
    const int plane    = linear_id / plane_elems;
    const int in_plane = linear_id - plane * plane_elems;
    const int row      = in_plane / CONV_WIDTH;
    const int col      = in_plane % CONV_WIDTH;

    __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + col * dst_stride_x + row * dst_stride_y + plane * dst_stride_z;

    *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
}

565

566

#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)

Michele Di Giorgio

933fe86

2018-02-19 15:42:12 +0000

[diff] [blame^]

567

568

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

569

#if defined(CONV_STRIDE_X)

570

/* Bind convolution1x3_f16 to the row helper matching the horizontal stride given at compile time (-DCONV_STRIDE_X) */
#if CONV_STRIDE_X == 1
#define convolution1x3_f16 convolution1x3_stride_1_f16
#elif CONV_STRIDE_X == 2
#define convolution1x3_f16 convolution1x3_stride_2_f16
#elif CONV_STRIDE_X == 3
#define convolution1x3_f16 convolution1x3_stride_3_f16
#else /* CONV_STRIDE_X is not 1, 2 or 3 */
#error "Stride not supported"
#endif /* CONV_STRIDE_X == 1, 2 or 3 */

579

580

/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
 *
 * @param[in] left_pixel   Pointer to the left pixel.
 * @param[in] left_coeff   Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff  Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
    // A single 8-wide load; only elements 0..5 are used by the 4 outputs
    const half8 row = vload8(0, (__global const half *)left_pixel);

    // Output i combines row[i], row[i + 1] and row[i + 2]
    const half4 taps_left   = CONVERT(row.s0123, half4);
    const half4 taps_middle = CONVERT(row.s1234, half4);
    const half4 taps_right  = CONVERT(row.s2345, half4);

    return taps_left * (half4)left_coeff + taps_middle * (half4)middle_coeff + taps_right * (half4)right_coeff;
}

602

603

/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
 *
 * @param[in] left_pixel   Pointer to the left pixel.
 * @param[in] left_coeff   Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff  Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
    __global const half *row_ptr = (__global const half *)left_pixel;

    // Outputs are 2 elements apart, so 4 outputs need 9 inputs: 8 from the vector load plus one scalar
    const half8 head = vload8(0, row_ptr);
    const half  tail = row_ptr[8];

    // Output i combines elements 2i, 2i + 1 and 2i + 2
    const half4 taps_left   = CONVERT(head.s0246, half4);
    const half4 taps_middle = CONVERT(head.s1357, half4);
    const half4 taps_right  = CONVERT((half4)(head.s246, tail), half4);

    return taps_left * (half4)left_coeff + taps_middle * (half4)middle_coeff + taps_right * (half4)right_coeff;
}

626

627

/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
 *
 * @param[in] left_pixel   Pointer to the left pixel.
 * @param[in] left_coeff   Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff  Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
    // A single 16-wide load; only elements 0..11 are used by the 4 outputs
    const half16 row = vload16(0, (__global const half *)left_pixel);

    // Output i combines elements 3i, 3i + 1 and 3i + 2
    const half4 taps_left   = CONVERT(row.s0369, half4);
    const half4 taps_middle = CONVERT(row.s147A, half4);
    const half4 taps_right  = CONVERT(row.s258B, half4);

    return taps_left * (half4)left_coeff + taps_middle * (half4)middle_coeff + taps_right * (half4)right_coeff;
}

649

650

/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
 *
 * Convolution matrix layout:
 *
 * [ mat0, mat1, mat2 ]\n
 * [ mat3, mat4, mat5 ]\n
 * [ mat6, mat7, mat8 ]\n
 *
 * @param[in] src  A pointer to source Image structure
 * @param[in] mat0 Coefficient from the convolution matrix
 * @param[in] mat1 Coefficient from the convolution matrix
 * @param[in] mat2 Coefficient from the convolution matrix
 * @param[in] mat3 Coefficient from the convolution matrix
 * @param[in] mat4 Coefficient from the convolution matrix
 * @param[in] mat5 Coefficient from the convolution matrix
 * @param[in] mat6 Coefficient from the convolution matrix
 * @param[in] mat7 Coefficient from the convolution matrix
 * @param[in] mat8 Coefficient from the convolution matrix
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution3x3_f16(
    Image *src,
    const half mat0, const half mat1, const half mat2,
    const half mat3, const half mat4, const half mat5,
    const half mat6, const half mat7, const half mat8)
{
    // One horizontal pass per matrix row, taken at offset(src, 0, 0), offset(src, 0, 1) and offset(src, 0, 2)
    const half4 top    = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
    const half4 centre = convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
    const half4 bottom = convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);

    return top + centre + bottom;
}

/** This OpenCL kernel computes the depthwise convolution 3x3 for 16bit floating point type.
 *
 * Each work-item loads the three rows of the 3x3 weights, convolves the source with them, optionally adds the
 * bias indexed by get_global_id(2), and stores 4 half values to the destination.
 *
 * @note The horizontal stride must be passed at compile time using -DCONV_STRIDE_X (1, 2 or 3)
 * @note If biases are used, -DHAS_BIAS must be passed at compile time
 *
 * @param[in]  src_ptr                               Pointer to the source image. Supported data types: F16
 * @param[in]  src_stride_x                          Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                            src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                          Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                            src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                          Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                            src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes     The offset of the first element in the source image
 * @param[out] dst_ptr                               Pointer to the destination tensor. Supported data types: F16
 * @param[in]  dst_stride_x                          Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                            dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                          Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                            dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                          Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                            dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes     The offset of the first element in the destination tensor
 * @param[in]  weights_ptr                           Pointer to the weights tensor. Supported data types: F16
 * @param[in]  weights_stride_x                      Stride of the weights tensor in X dimension (in bytes)
 * @param[in]  weights_step_x                        weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  weights_stride_y                      Stride of the weights tensor in Y dimension (in bytes)
 * @param[in]  weights_step_y                        weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  weights_stride_z                      Stride of the weights tensor in Z dimension (in bytes)
 * @param[in]  weights_step_z                        weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in]  biases_ptr                            (Optional) Pointer to the biases vector. Supported data types: F16
 * @param[in]  biases_stride_x                       (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in]  biases_step_x                         (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  biases_offset_first_element_in_bytes  (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_f16(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)

    // Row offsets are computed at the full width of weights_stride_y. Narrowing the stride to uchar (as a
    // uchar3 multiply would) wraps any row offset above 255 bytes and reads the wrong weights.
    half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + 0 * weights_stride_y));
    half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + 1 * weights_stride_y));
    half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + 2 * weights_stride_y));

    half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                      weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                      weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
    // One bias per Z slice, broadcast to the 4 outputs
    pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x)));
#endif //defined(HAS_BIAS)

    vstore4(pixels, 0, (__global half *)dst.ptr);
}

749

#endif // defined(CONV_STRIDE_X)

750

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)