Blame - src/core/CL/cl_kernels/depthwise_convolution.cl - ml/ComputeLibrary

__global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;

489

490

for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)

491

{

492

*((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;

493

}

Georgios Pinitas

81a26ad

2017-10-23 20:29:30 +0100

[diff] [blame]

494

495

#if defined(HAS_BIAS)

496

if(get_global_id(1) == 0)

497

{

498

*((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x));

499

}

500

#endif // defined(HAS_BIAS)

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

501

}

502

#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)

503

Georgios Pinitas

de5a1cc

2018-02-02 12:52:07 +0000

[diff] [blame]

504

#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE)

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

505

/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.

506

*

507

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

Jaroslaw Rzepecki

a1ed41f

2017-10-13 11:13:58 +0100

[diff] [blame]

508

* @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDHT, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

509

*

510

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32

511

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

512

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

513

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

514

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

515

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

516

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

517

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

518

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

519

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

520

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

521

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

522

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

523

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

524

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

525

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

526

*/

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

527

__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))

528

{

529

Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

530

531

const int src_pixel_linear = get_global_id(1) * STRIDE_X;

Jaroslaw Rzepecki

a1ed41f

2017-10-13 11:13:58 +0100

[diff] [blame]

532

const int full_length = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

533

const int max_initial_x = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);

534

Jaroslaw Rzepecki

a1ed41f

2017-10-13 11:13:58 +0100

[diff] [blame]

535

const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;

536

const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

537

const int src_z = get_global_id(2);

538

539

__global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;

540

__global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));

541

542

for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)

543

{

544

for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)

545

{

546

if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)

547

{

Georgios Pinitas

de5a1cc

2018-02-02 12:52:07 +0000

[diff] [blame]

548

*output_ptr = PAD_VALUE;

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

}

else

{

*output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));

553

}

554

}

555

}

Georgios Pinitas

81a26ad

2017-10-23 20:29:30 +0100

[diff] [blame]

556

#if defined(HAS_BIAS)

557

*output_ptr = (DATA_TYPE)(1);

558

#endif // defined(HAS_BIAS)

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

559

}

560

Georgios Pinitas

de5a1cc

2018-02-02 12:52:07 +0000

[diff] [blame]

561

#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(DATA_TYPE) && defined(PAD_VALUE)

Giorgio Arena

9fe4144

2017-08-23 16:36:24 +0100

[diff] [blame]

562

563

#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)

564

565

/** This kernel performs a reshaping of the output of the depthwise generic convolution.

566

*

567

* @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float

568

* @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42

569

*

570

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32

571

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

572

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

573

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

574

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

575

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

576

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

577

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

578

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

579

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

580

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

581

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

582

*/

583

__kernel void depthwise_vector_to_tensor(

584

VECTOR_DECLARATION(src),

585

TENSOR3D_DECLARATION(dst))

586

{

587

Vector src = CONVERT_TO_VECTOR_STRUCT(src);

588

589

const int patch_size = CONV_WIDTH * CONV_HEIGHT;

590

const int id0 = get_global_id(0);

591

const int z = id0 / patch_size;

592

const int index2D = id0 - z * patch_size;

593

594

__global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z;

595

*((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);

596

}

597

598

#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)

Michele Di Giorgio

933fe86

2018-02-19 15:42:12 +0000

[diff] [blame]

599

600

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)

601

#if defined(CONV_STRIDE_X)

602

#if CONV_STRIDE_X == 1

603

#define convolution1x3_f16 convolution1x3_stride_1_f16

604

#elif CONV_STRIDE_X == 2

605

#define convolution1x3_f16 convolution1x3_stride_2_f16

606

#elif CONV_STRIDE_X == 3

607

#define convolution1x3_f16 convolution1x3_stride_3_f16

608

#else /* CONV_STRIDE_X */

609

#error "Stride not supported"

610

#endif /* CONV_STRIDE_X */

611

612

/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.

613

*

614

* @param[in] left_pixel Pointer to the left pixel.

615

* @param[in] left_coeff Weight of the left pixel

616

* @param[in] middle_coeff Weight of the middle pixel

617

* @param[in] right_coeff Weight of the right pixel

618

*

619

* @return a half4 containing 4 convoluted values.

620

*/

621

inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,

622

const half left_coeff,

623

const half middle_coeff,

624

const half right_coeff)

625

{

626

half8 temp = vload8(0, (__global half *)left_pixel);

627

628

half4 left = CONVERT(temp.s0123, half4);

629

half4 middle = CONVERT(temp.s1234, half4);

630

half4 right = CONVERT(temp.s2345, half4);

631

632

return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;

633

}

634

635

/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.

636

*

637

* @param[in] left_pixel Pointer to the left pixel.

638

* @param[in] left_coeff Weight of the left pixel

639

* @param[in] middle_coeff Weight of the middle pixel

640

* @param[in] right_coeff Weight of the right pixel

641

*

642

* @return a half4 containing 4 convoluted values.

643

*/

644

inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,

645

const half left_coeff,

646

const half middle_coeff,

647

const half right_coeff)

648

{

649

half8 temp0 = vload8(0, (__global half *)left_pixel);

650

half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));

651

652

half4 left = CONVERT(temp0.s0246, half4);

653

half4 middle = CONVERT(temp0.s1357, half4);

654

half4 right = CONVERT((half4)(temp0.s246, temp1), half4);

655

656

return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;

657

}

658

659

/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.

660

*

661

* @param[in] left_pixel Pointer to the left pixel.

662

* @param[in] left_coeff Weight of the left pixel

663

* @param[in] middle_coeff Weight of the middle pixel

664

* @param[in] right_coeff Weight of the right pixel

665

*

666

* @return a half4 containing 4 convoluted values.

667

*/

668

inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,

669

const half left_coeff,

670

const half middle_coeff,

671

const half right_coeff)

672

{

673

half16 temp0 = vload16(0, (__global half *)left_pixel);

674

675

half4 left = CONVERT(temp0.s0369, half4);

676

half4 middle = CONVERT(temp0.s147A, half4);

677

half4 right = CONVERT(temp0.s258B, half4);

678

679

return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;

680

}

681

682

/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.

683

*

684

* Convolution matrix layout:

685

*

686

* [ mat0, mat1, mat2 ]\n

687

* [ mat3, mat4, mat5 ]\n

688

* [ mat6, mat7, mat8 ]\n

689

*

690

* @param[in] src A pointer to source Image structure

691

* @param[in] mat0 Coefficient from the convolution matrix

692

* @param[in] mat1 Coefficient from the convolution matrix

693

* @param[in] mat2 Coefficient from the convolution matrix

694

* @param[in] mat3 Coefficient from the convolution matrix

695

* @param[in] mat4 Coefficient from the convolution matrix

696

* @param[in] mat5 Coefficient from the convolution matrix

697

* @param[in] mat6 Coefficient from the convolution matrix

698

* @param[in] mat0 Coefficient from the convolution matrix

699

* @param[in] mat7 Coefficient from the convolution matrix

700

* @param[in] mat8 Coefficient from the convolution matrix

701

*

702

* @return a half4 containing 4 convoluted values.

703

*/

704

inline half4 convolution3x3_f16(

705

Image *src,

706

const half mat0, const half mat1, const half mat2,

707

const half mat3, const half mat4, const half mat5,

708

const half mat6, const half mat7, const half mat8)

{

half4 pixels;

pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);

713

pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);

714

pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);

return pixels;

}

/** This OpenCL kernel computes the depthwise convolution 3x3

720

*

721

* @param[in] src_ptr Pointer to the source image. Supported data types: F16

722

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

723

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

724

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

725

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

726

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

727

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

728

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

Michele Di Giorgio

3ebef32

2018-02-21 10:02:58 +0000

[diff] [blame^]

729

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

Michele Di Giorgio

933fe86

2018-02-19 15:42:12 +0000

[diff] [blame]

730

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

731

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

732

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

733

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

734

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

735

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

736

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Michele Di Giorgio

3ebef32

2018-02-21 10:02:58 +0000

[diff] [blame^]

737

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

Michele Di Giorgio

933fe86

2018-02-19 15:42:12 +0000

[diff] [blame]

738

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

739

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

740

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

741

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

742

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

743

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

744

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector

745

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32

746

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

747

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

748

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

749

*/

750

__kernel void depthwise_convolution_3x3_f16(

751

TENSOR3D_DECLARATION(src),

752

TENSOR3D_DECLARATION(dst),

753

TENSOR3D_DECLARATION(weights)

754

#if defined(HAS_BIAS)

755

,

756

VECTOR_DECLARATION(biases)

757

#endif //defined(HAS_BIAS)

758

)

759

{

760

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

761

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

762

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

763

#if defined(HAS_BIAS)

764

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

765

#endif //defined(HAS_BIAS)

766

767

uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;

768

half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));

769

half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));

770

half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + offset.s2));

771

772

half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,

773

weights_values1.s0, weights_values1.s1, weights_values1.s2,

774

weights_values2.s0, weights_values2.s1, weights_values2.s2);

775

#if defined(HAS_BIAS)

776

pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x)));

777

#endif //defined(HAS_BIAS)

778

779

vstore4(pixels, 0, (__global half *)dst.ptr);

780

}

781

#endif // defined(CONV_STRIDE_X)

Michele Di Giorgio

3ebef32

2018-02-21 10:02:58 +0000

[diff] [blame^]

782

783

/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3

784

* when both stride_x and stride_y are equal to 1

785

*

786

* @param[in] src_ptr Pointer to the source image. Supported data types: F16

787

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

788

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

789

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

790

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

791

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

792

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

793

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

794

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

795

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

796

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

797

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

798

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

799

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

800

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

801

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

802

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

803

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

804

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

805

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

806

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

807

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

808

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

809

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector

810

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

811

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

812

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

813

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

814

*/

815

__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(

816

TENSOR3D_DECLARATION(src),

817

TENSOR3D_DECLARATION(dst),

818

TENSOR3D_DECLARATION(weights)

819

#if defined(HAS_BIAS)

820

,

821

VECTOR_DECLARATION(biases)

822

#endif //defined(HAS_BIAS)

823

)

824

{

825

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

826

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

827

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

828

829

#ifdef HAS_BIAS

830

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

831

832

half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));

833

#endif /* defined(HAS_BIAS) */

834

835

half4 pixels0 = 0.0f;

836

half4 pixels1 = 0.0f;

837

half4 pixels2 = 0.0f;

838

half4 pixels3 = 0.0f;

839

840

__global uchar *weights_addr = (__global uchar *)weights.ptr;

841

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

842

843

// Load the weights

844

half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));

845

half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));

846

half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

847

848

// Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor

849

half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0

850

half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1

851

half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2

852

half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3

853

half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

854

half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5

855

856

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);

857

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);

858

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);

859

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);

860

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);

861

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);

862

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);

863

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);

864

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);

865

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);

866

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);

867

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);

868

869

#ifdef HAS_BIAS

870

pixels0 += (half4)bias;

871

pixels1 += (half4)bias;

872

pixels2 += (half4)bias;

873

pixels3 += (half4)bias;

874

#endif /* defined(HAS_BIAS) */

875

876

vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));

877

vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));

878

vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));

879

vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));

880

}

881

882

/** This OpenCL kernel is optimized for Bifrost architectures and computes 16bit floating point the depthwise convolution 3x3

883

* when both stride_x and stride_y are equal to 2

884

*

885

* @param[in] src_ptr Pointer to the source image. Supported data types: F16

886

* @param[in] src_stride_x Stride of the source image in X dimension (in bytes)

887

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

888

* @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)

889

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

890

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image

891

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

892

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

893

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

894

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

895

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

896

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

897

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

898

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

899

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

900

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

901

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

902

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

903

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

904

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

905

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

906

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

907

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

908

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector

909

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

910

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

911

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

912

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

913

*/

914

__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(

915

TENSOR3D_DECLARATION(src),

916

TENSOR3D_DECLARATION(dst),

917

TENSOR3D_DECLARATION(weights)

918

#if defined(HAS_BIAS)

919

,

920

VECTOR_DECLARATION(biases)

921

#endif //defined(HAS_BIAS)

922

)

923

{

924

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

925

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

926

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

927

928

#ifdef HAS_BIAS

929

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

930

931

half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));

932

#endif /* defined(HAS_BIAS) */

933

934

half4 pixels0 = 0.0f;

935

half4 pixels1 = 0.0f;

936

937

__global uchar *weights_addr = (__global uchar *)weights.ptr;

938

__global uchar *src_addr = (__global uchar *)offset(&src, 0, 0);

939

940

// Load the weights

941

half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));

942

half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));

943

half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

944

945

// Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor

946

half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0

947

half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0

948

half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1

949

half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1

950

half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2

951

half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2

952

half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3

953

half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3

954

half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

955

half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

956

957

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);

958

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);

959

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);

960

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);

961

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);

962

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);

963

964

#ifdef HAS_BIAS

965

pixels0 += (half4)bias;

966

pixels1 += (half4)bias;

967

#endif /* defined(HAS_BIAS) */

968

969

vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));

970

vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));

971

}

Michele Di Giorgio

933fe86

2018-02-19 15:42:12 +0000

[diff] [blame]

972

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED)