Blame - src/core/CL/cl_kernels/depthwise_convolution.cl - ml/ComputeLibrary

2018-02-01 16:57:48 +0000

[diff] [blame]

573

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

574

#if(DILATION_X == 1 && DILATION_Y == 1)

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

575

// Load the weights

576

float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));

577

float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));

578

float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

579

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

580

// Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

581

float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0

582

float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1

583

float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2

584

float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

585

float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4

586

float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

587

588

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);

589

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);

590

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);

591

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);

592

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);

593

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);

594

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);

595

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);

596

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);

597

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);

598

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);

599

CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);

600

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

601

#else /* DILATION_X==1 && DILATION_Y==1 */

602

603

//3x3 Convolution of elements starting in 0th row

604

pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);

605

//3x3 Convolution of elements starting in 1st row

606

pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);

607

//3x3 Convolution of elements starting in 2nd row

608

pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);

609

//3x3 Convolution of elements starting in 3rd row

610

pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f32(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);

611

612

#endif /* DILATION_X==1 && DILATION_Y==1 */

613

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

614

#ifdef HAS_BIAS

615

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

616

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

617

float bias = *((__global float *)(vector_offset(&biases, channel)));

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

618

619

pixels0 += (float2)bias;

620

pixels1 += (float2)bias;

621

pixels2 += (float2)bias;

622

pixels3 += (float2)bias;

623

#endif /* defined(HAS_BIAS) */

624

625

vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));

626

vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));

627

vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));

628

vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));

629

}

630

631

/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both

632

* stride_x and stride_y are equal to 2

633

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

634

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F32

635

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

636

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

637

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

638

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

639

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

640

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

641

* @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)

642

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: F32

643

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

644

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

645

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

646

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

647

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

648

* @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)

649

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

650

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32

651

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

652

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

653

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

654

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

655

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

656

* @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)

657

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the biases vector

658

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32

659

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

660

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

661

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

662

*/

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

663

__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

664

TENSOR3D_DECLARATION(src),

665

TENSOR3D_DECLARATION(dst),

666

TENSOR3D_DECLARATION(weights)

667

#if defined(HAS_BIAS)

668

,

669

VECTOR_DECLARATION(biases)

670

#endif //defined(HAS_BIAS)

671

)

672

{

673

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

674

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

675

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

676

677

float2 pixels0 = 0.0f;

678

float2 pixels1 = 0.0f;

679

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

680

// Extract channel and linearized batch indices

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

681

const int channel = get_global_id(2) % DST_CHANNELS;

682

const int batch = get_global_id(2) / DST_CHANNELS;

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

683

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

684

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

685

__global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

686

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

687

#if(DILATION_X == 1 && DILATION_Y == 1)

688

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

689

// Load the weights

690

float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));

691

float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));

692

float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

693

694

// Note: Since each work-item computes 4x2 elements, we need to load 5 rows from the input tensor

695

float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0

696

float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0

697

float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1

698

float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1

699

float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2

700

float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2

701

float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3

702

float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3

703

float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4

704

float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4

705

706

CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);

707

CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);

708

CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);

709

CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);

710

CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);

711

CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);

712

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

713

#else /* DILATION_X==1 && DILATION_Y==1 */

714

715

//3x3 Convolution of elements starting in 0th row

716

pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);

717

//3x3 Convolution of elements starting in 2nd row

718

pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f32(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);

719

#endif /* DILATION_X==1 && DILATION_Y==1 */

720

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

721

#ifdef HAS_BIAS

722

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

723

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

724

float bias = *((__global float *)(vector_offset(&biases, channel)));

Gian Marco

2018-02-01 16:57:48 +0000

[diff] [blame]

725

726

pixels0 += (float2)bias;

727

pixels1 += (float2)bias;

728

#endif /* defined(HAS_BIAS) */

729

730

vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));

731

vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));

732

}

733

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

734

#endif // defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)

Giorgio Arena

2018-04-04 17:44:26 +0100

[diff] [blame]

735

giuros01

6d10996

2019-01-07 17:47:19 +0000

[diff] [blame]

736

#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)
/** Reshape the weights for quantized depthwise convolution
 *
 * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type, e.g. -DDATA_TYPE=uint8
 * @note Output width should be given as a preprocessor argument using -DDST_WIDTH=width, e.g. -DDST_WIDTH=128
 * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=vec_size, e.g., -DVEC_SIZE=4
 * @note If -DTRANSPOSE is passed, the nine loaded vectors are written lane by lane (all nine .s0 values first,
 *       then all nine .s1 values, and so on). In this mode only -DVEC_SIZE=4 is supported.
 * @attention Input's height and width should be 3
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QASYMM8
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depthwise_convolution_reshape_weights(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst))
{
    Vector    src = CONVERT_TO_VECTOR_STRUCT(src);
    const int x   = get_global_id(0);

    // Load 3x3xVEC_SIZE weights
    // (w0..w2 use Z offset 0, w3..w5 use Z offset 1, w6..w8 use Z offset 2; Y offset cycles 0,1,2)
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    w0 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 0 * src_stride_z);
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    w1 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 0 * src_stride_z);
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    w2 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 0 * src_stride_z);
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    w3 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 1 * src_stride_z);
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    w4 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 1 * src_stride_z);
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    w5 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 1 * src_stride_z);
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    w6 = VLOAD(VEC_SIZE)(0, src.ptr + 0 * src_stride_y + 2 * src_stride_z);
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    w7 = VLOAD(VEC_SIZE)(0, src.ptr + 1 * src_stride_y + 2 * src_stride_z);
    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
    w8 = VLOAD(VEC_SIZE)(0, src.ptr + 2 * src_stride_y + 2 * src_stride_z);

    // Each work-item along X writes its own DST_WIDTH-element slice of the destination
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * DST_WIDTH * sizeof(DATA_TYPE);

#if defined(TRANSPOSE)
#if VEC_SIZE != 4
#error "VEC_SIZE not supported"
#else  // VEC_SIZE != 4
    // Lane-major output: the sequence written is w0.s0 .. w8.s0, w0.s1 .. w8.s1, w0.s2 .. w8.s2, w0.s3 .. w8.s3,
    // packed into nine 4-element stores.
    VSTORE(VEC_SIZE)
    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w0.s0, w1.s0, w2.s0, w3.s0), 0, dst_addr + 0);
    VSTORE(VEC_SIZE)
    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w4.s0, w5.s0, w6.s0, w7.s0), 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w8.s0, w0.s1, w1.s1, w2.s1), 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w3.s1, w4.s1, w5.s1, w6.s1), 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w7.s1, w8.s1, w0.s2, w1.s2), 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w2.s2, w3.s2, w4.s2, w5.s2), 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w6.s2, w7.s2, w8.s2, w0.s3), 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w1.s3, w2.s3, w3.s3, w4.s3), 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))(w5.s3, w6.s3, w7.s3, w8.s3), 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
#endif // VEC_SIZE != 4
#else  // !defined(TRANSPOSE)
    // Plain output: w0..w8 are stored back to back, one VEC_SIZE-element vector each
    VSTORE(VEC_SIZE)
    (w0, 0, dst_addr + 0);
    VSTORE(VEC_SIZE)
    (w1, 0, dst_addr + 1 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    (w2, 0, dst_addr + 2 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    (w3, 0, dst_addr + 3 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    (w4, 0, dst_addr + 4 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    (w5, 0, dst_addr + 5 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    (w6, 0, dst_addr + 6 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    (w7, 0, dst_addr + 7 * sizeof(DATA_TYPE) * VEC_SIZE);
    VSTORE(VEC_SIZE)
    (w8, 0, dst_addr + 8 * sizeof(DATA_TYPE) * VEC_SIZE);
#endif // defined(TRANSPOSE)
}
#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DST_WIDTH)

833

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

834

/* Layout-dependent stride aliases.
 *
 * in_stride_{x,y,z} / out_stride_{x,y,z} name the source / destination tensor strides that the
 * kernels below use for their logical x, y and z axes. With -DNCHW they map one-to-one onto the
 * tensor's own X/Y/Z strides; otherwise the axes are rotated (x -> Y, y -> Z, z -> X).
 */
#if defined(NCHW)
#define in_stride_x src_stride_x
#define in_stride_y src_stride_y
#define in_stride_z src_stride_z
#define out_stride_x dst_stride_x
#define out_stride_y dst_stride_y
#define out_stride_z dst_stride_z
#else //defined(NCHW)
#define in_stride_x src_stride_y
#define in_stride_y src_stride_z
#define in_stride_z src_stride_x
#define out_stride_x dst_stride_y
#define out_stride_y dst_stride_z
#define out_stride_z dst_stride_x
#endif //defined(NCHW)

849

Giorgio Arena

2017-08-23 16:36:24 +0100

[diff] [blame]

850

#if defined(SRC_WIDTH) && defined(DATA_TYPE)
/** This kernel reshapes each of the tensor's low three dimensions to single rows.
 *
 * @note Datatype and source width should be given as a preprocessor argument using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
 * @note If -DHAS_BIAS is passed, the work-items with get_global_id(1) == 0 additionally write the bias of their
 *       row (indexed by get_global_id(2)) at element offset SRC_WIDTH * get_global_size(1) of that row.
 *
 * @param[in]  src_ptr                              Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                         Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                           src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                         Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                           src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                         Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                           src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes    The offset of the first element in the source tensor
 * @param[out] dst_ptr                              Pointer to the destination tensor. Same as @p src_ptr
 * @param[in]  dst_stride_x                         Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                           dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                         Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                           dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes    The offset of the first element in the destination tensor
 * @param[in]  biases_ptr                           (Optional) Pointer to the biases vector. Supported data types: F16/F32
 * @param[in]  biases_stride_x                      (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in]  biases_step_x                        (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_reshape_weights_generic(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst)
#ifdef HAS_BIAS
    ,
    VECTOR_DECLARATION(biases)
#endif /* HAS_BIAS */
)
{
#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif /* HAS_BIAS */

    // Source row selected by (id1, id2); destination row id2, starting SRC_WIDTH * id1 elements in
    __global uchar *input_ptr  = src_ptr + src_offset_first_element_in_bytes + get_global_id(1) * in_stride_y + get_global_id(2) * in_stride_z;
    __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;

    // Copy SRC_WIDTH elements, walking the source with the layout-dependent x stride
    for(int i = 0; i < SRC_WIDTH; ++i, input_ptr += in_stride_x)
    {
        *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *((__global DATA_TYPE *)input_ptr);
    }

#if defined(HAS_BIAS)
    // Only the id1 == 0 work-item writes the bias, so it is stored once per destination row
    if(get_global_id(1) == 0)
    {
        *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
    }
#endif // defined(HAS_BIAS)
}
#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)

903

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

904

#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER) && defined(DILATION_X) && defined(DILATION_Y)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
 * @note The value written for positions outside the source must be passed at compile time using -DPAD_VALUE
 * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
 * @note If -DHAS_BIAS is passed, a trailing element equal to 1 is written after the patch values
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
{
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    // Linear patch index scaled by the horizontal stride; split below into the patch's top-left (x, y)
    const int src_pixel_linear = get_global_id(1) * STRIDE_X;
    const int full_length      = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
    // Padded row length covered by patch start positions, using the dilated kernel width
    const int max_initial_x    = STRIDE_X * (((full_length - (KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1))) / STRIDE_X) + 1);

    // Top-left corner of the patch in source coordinates (negative values fall inside the padding)
    const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
    const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
    // DEPTH_MULTIPLIER consecutive work-items along dimension 2 read the same source plane
    const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;

    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + src_z * in_stride_z;
    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));

    // Walk the dilated KERNEL_HEIGHT x KERNEL_WIDTH patch, writing the samples contiguously
    for(int y = src_y; y < src_y + KERNEL_HEIGHT + (KERNEL_HEIGHT - 1) * (DILATION_Y - 1); y += DILATION_Y)
    {
        for(int x = src_x; x < src_x + KERNEL_WIDTH + (KERNEL_WIDTH - 1) * (DILATION_X - 1); x += DILATION_X, ++output_ptr)
        {
            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
            {
                // Sample lies outside the source: emit the padding value
                *output_ptr = PAD_VALUE;
            }
            else
            {
                *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * in_stride_x + y * in_stride_y));
            }
        }
    }
#if defined(HAS_BIAS)
    // Trailing 1 written right after the patch values
    *output_ptr = (DATA_TYPE)(1);
#endif // defined(HAS_BIAS)
}
#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER) && defined(DILATION_X) && defined(DILATION_Y)

Giorgio Arena

2017-08-23 16:36:24 +0100

[diff] [blame]

963

964

#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)

/** This kernel performs a reshaping of the output of the depthwise generic convolution.
 *
 * Each work-item copies one element of the source vector to the destination tensor. The linear
 * index get_global_id(0) is decomposed as x + CONV_WIDTH * (y + CONV_HEIGHT * z).
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g. -DCONV_WIDTH=32, -DCONV_HEIGHT=42
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F16/F32
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depthwise_vector_to_tensor(
    VECTOR_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Vector src = CONVERT_TO_VECTOR_STRUCT(src);

    const int patch_size = CONV_WIDTH * CONV_HEIGHT;
    const int id0        = get_global_id(0);
    // Plane index, then the remaining offset inside that CONV_WIDTH x CONV_HEIGHT plane
    const int z          = id0 / patch_size;
    const int index2D    = id0 - z * patch_size;

    // Column = index2D % CONV_WIDTH, row = index2D / CONV_WIDTH, addressed with the layout-dependent strides
    __global uchar *out_ptr          = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * out_stride_x + index2D / CONV_WIDTH * out_stride_y + z * out_stride_z;
    *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
}

#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1000

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1001

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1002

#if defined(CONV_STRIDE_X)

1003

/* Bind the generic name convolution1x3_f16 to the helper matching the compile-time
 * horizontal stride CONV_STRIDE_X. Only strides 1, 2 and 3 have a helper; any other
 * value stops the build with an #error.
 */
#if CONV_STRIDE_X == 1

1004

#define convolution1x3_f16 convolution1x3_stride_1_f16

1005

#elif CONV_STRIDE_X == 2

1006

#define convolution1x3_f16 convolution1x3_stride_2_f16

1007

#elif CONV_STRIDE_X == 3

1008

#define convolution1x3_f16 convolution1x3_stride_3_f16

1009

#else /* CONV_STRIDE_X */

1010

#error "Stride not supported"

1011

#endif /* CONV_STRIDE_X */

1012

1013

/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.

1014

*

1015

* @param[in] left_pixel Pointer to the left pixel.

1016

* @param[in] left_coeff Weight of the left pixel

1017

* @param[in] middle_coeff Weight of the middle pixel

1018

* @param[in] right_coeff Weight of the right pixel

1019

*

1020

* @return a half4 containing 4 convoluted values.

1021

*/

1022

inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,

1023

const half left_coeff,

1024

const half middle_coeff,

1025

const half right_coeff)

1026

{

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1027

#if(DILATION_X == 1 && DILATION_Y == 1)

1028

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1029

half8 temp = vload8(0, (__global half *)left_pixel);

1030

1031

half4 left = CONVERT(temp.s0123, half4);

1032

half4 middle = CONVERT(temp.s1234, half4);

1033

half4 right = CONVERT(temp.s2345, half4);

1034

1035

return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1036

#else /* DILATION_X==1 && DILATION_Y==1 */

1037

return vload4(0, (__global half *)left_pixel) * (half4)left_coeff

1038

+ vload4(0, (__global half *)(left_pixel) + DILATION_X) * (half4)middle_coeff

1039

+ vload4(0, (__global half *)(left_pixel) + 2 * DILATION_X) * (half4)right_coeff;

1040

1041

#endif /* DILATION_X==1 && DILATION_Y==1 */

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1042

}

1043

1044

/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.

1045

*

1046

* @param[in] left_pixel Pointer to the left pixel.

1047

* @param[in] left_coeff Weight of the left pixel

1048

* @param[in] middle_coeff Weight of the middle pixel

1049

* @param[in] right_coeff Weight of the right pixel

1050

*

1051

* @return a half4 containing 4 convoluted values.

1052

*/

1053

inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,

1054

const half left_coeff,

1055

const half middle_coeff,

1056

const half right_coeff)

1057

{

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1058

#if(DILATION_X == 1 && DILATION_Y == 1)

1059

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1060

half8 temp0 = vload8(0, (__global half *)left_pixel);

1061

half temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));

1062

1063

half4 left = CONVERT(temp0.s0246, half4);

1064

half4 middle = CONVERT(temp0.s1357, half4);

1065

half4 right = CONVERT((half4)(temp0.s246, temp1), half4);

1066

1067

return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1068

#else /* DILATION_X==1 && DILATION_Y==1 */

1069

1070

__global half *left_pixel_float = (__global half *)left_pixel;

1071

1072

return (half4)(*left_pixel_float, *(left_pixel_float + 2), *(left_pixel_float + 4), *(left_pixel_float + 6)) * (half4)left_coeff

1073

+ (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 2), *(left_pixel_float + DILATION_X + 4), *(left_pixel_float + DILATION_X + 6)) * (half4)middle_coeff

1074

+ (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 2), *(left_pixel_float + DILATION_X * 2 + 4), *(left_pixel_float + DILATION_X * 2 + 6)) * (half4)right_coeff;

1075

1076

#endif /* DILATION_X==1 && DILATION_Y==1 */

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1077

}

1078

1079

/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.

1080

*

1081

* @param[in] left_pixel Pointer to the left pixel.

1082

* @param[in] left_coeff Weight of the left pixel

1083

* @param[in] middle_coeff Weight of the middle pixel

1084

* @param[in] right_coeff Weight of the right pixel

1085

*

1086

* @return a half4 containing 4 convoluted values.

1087

*/

1088

inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,

1089

const half left_coeff,

1090

const half middle_coeff,

1091

const half right_coeff)

1092

{

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1093

#if(DILATION_X == 1 && DILATION_Y == 1)

1094

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1095

half16 temp0 = vload16(0, (__global half *)left_pixel);

1096

1097

half4 left = CONVERT(temp0.s0369, half4);

1098

half4 middle = CONVERT(temp0.s147A, half4);

1099

half4 right = CONVERT(temp0.s258B, half4);

1100

1101

return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1102

#else /* DILATION_X==1 && DILATION_Y==1 */

1103

1104

__global half *left_pixel_float = (__global half *)left_pixel;

1105

1106

return (half4)(*left_pixel_float, *(left_pixel_float + 3), *(left_pixel_float + 6), *(left_pixel_float + 9)) * (half4)left_coeff

1107

+ (half4)(*(left_pixel_float + DILATION_X), *(left_pixel_float + DILATION_X + 3), *(left_pixel_float + DILATION_X + 6), *(left_pixel_float + DILATION_X + 9)) * (half4)middle_coeff

1108

+ (half4)(*(left_pixel_float + DILATION_X * 2), *(left_pixel_float + DILATION_X * 2 + 3), *(left_pixel_float + DILATION_X * 2 + 6), *(left_pixel_float + DILATION_X * 2 + 9)) * (half4)right_coeff;

1109

1110

#endif /* DILATION_X==1 && DILATION_Y==1 */

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1111

}

1112

1113

/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.

1114

*

1115

* Convolution matrix layout:

1116

*

1117

* [ mat0, mat1, mat2 ]\n

1118

* [ mat3, mat4, mat5 ]\n

1119

* [ mat6, mat7, mat8 ]\n

1120

*

1121

* @param[in] src A pointer to source Image structure

1122

* @param[in] mat0 Coefficient from the convolution matrix

1123

* @param[in] mat1 Coefficient from the convolution matrix

1124

* @param[in] mat2 Coefficient from the convolution matrix

1125

* @param[in] mat3 Coefficient from the convolution matrix

1126

* @param[in] mat4 Coefficient from the convolution matrix

1127

* @param[in] mat5 Coefficient from the convolution matrix

1128

* @param[in] mat6 Coefficient from the convolution matrix

1129

* @param[in] mat0 Coefficient from the convolution matrix

1130

* @param[in] mat7 Coefficient from the convolution matrix

1131

* @param[in] mat8 Coefficient from the convolution matrix

1132

*

1133

* @return a half4 containing 4 convoluted values.

1134

*/

1135

inline half4 convolution3x3_f16(

1136

Image *src,

1137

const half mat0, const half mat1, const half mat2,

1138

const half mat3, const half mat4, const half mat5,

1139

const half mat6, const half mat7, const half mat8)

{

half4 pixels;

pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1144

pixels += convolution1x3_f16(offset(src, 0, DILATION_Y), mat3, mat4, mat5);

1145

pixels += convolution1x3_f16(offset(src, 0, DILATION_Y * 2), mat6, mat7, mat8);

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

return pixels;

}

Giorgio Arena

2018-04-04 17:44:26 +0100

[diff] [blame]

1150

#if defined(DEPTH_MULTIPLIER)

1151

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1152

/** This OpenCL kernel computes the depthwise convolution 3x3

1153

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1154

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16

1155

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1156

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1157

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1158

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1159

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1160

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1161

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1162

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1163

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1164

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1165

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1166

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1167

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1168

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1169

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1170

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1171

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1172

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1173

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1174

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1175

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1176

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

1177

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1178

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

1179

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1180

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1181

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1182

*/

1183

__kernel void depthwise_convolution_3x3_f16(

1184

TENSOR3D_DECLARATION(src),

1185

TENSOR3D_DECLARATION(dst),

1186

TENSOR3D_DECLARATION(weights)

1187

#if defined(HAS_BIAS)

1188

,

1189

VECTOR_DECLARATION(biases)

1190

#endif //defined(HAS_BIAS)

1191

)

1192

{

1193

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

1194

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1195

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1196

#if defined(HAS_BIAS)

1197

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

1198

#endif //defined(HAS_BIAS)

1199

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1200

// Extract channel and linearized batch indices

1201

const int channel = get_global_id(2) % DST_CHANNELS;

1202

const int batch = get_global_id(2) / DST_CHANNELS;

1203

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

1204

src.ptr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

1205

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

Giorgio Arena

2018-04-04 17:44:26 +0100

[diff] [blame]

1206

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1207

uchar3 offset = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1208

half3 weights_values0 = vload3(0, (__global half *)(weights_addr + offset.s0));

1209

half3 weights_values1 = vload3(0, (__global half *)(weights_addr + offset.s1));

1210

half3 weights_values2 = vload3(0, (__global half *)(weights_addr + offset.s2));

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1211

1212

half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,

1213

weights_values1.s0, weights_values1.s1, weights_values1.s2,

1214

weights_values2.s0, weights_values2.s1, weights_values2.s2);

1215

#if defined(HAS_BIAS)

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1216

pixels += (half4)(*((__global half *)(biases.ptr + channel * biases_stride_x)));

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1217

#endif //defined(HAS_BIAS)

1218

1219

vstore4(pixels, 0, (__global half *)dst.ptr);

1220

}

Giorgio Arena

2018-04-04 17:44:26 +0100

[diff] [blame]

1221

#endif // defined(DEPTH_MULTIPLIER)

Michele Di Giorgio

2018-02-19 15:42:12 +0000

[diff] [blame]

1222

#endif // defined(CONV_STRIDE_X)

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1223

1224

/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3

1225

* when both stride_x and stride_y are equal to 1

1226

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1227

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16

1228

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1229

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1230

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1231

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1232

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1233

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1234

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1235

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1236

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1237

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1238

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1239

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1240

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1241

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1242

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1243

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

1244

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1245

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1246

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1247

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1248

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1249

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

1250

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1251

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

1252

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1253

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1254

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1255

*/

1256

__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(

1257

TENSOR3D_DECLARATION(src),

1258

TENSOR3D_DECLARATION(dst),

1259

TENSOR3D_DECLARATION(weights)

1260

#if defined(HAS_BIAS)

1261

,

1262

VECTOR_DECLARATION(biases)

1263

#endif //defined(HAS_BIAS)

1264

)

1265

{

1266

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

1267

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1268

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

1269

1270

// Extract channel and linearized batch indices

1271

const int channel = get_global_id(2) % DST_CHANNELS;

1272

const int batch = get_global_id(2) / DST_CHANNELS;

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1273

1274

#ifdef HAS_BIAS

1275

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

1276

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1277

half bias = *((__global half *)(vector_offset(&biases, channel)));

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1278

#endif /* defined(HAS_BIAS) */

1279

1280

half4 pixels0 = 0.0f;

1281

half4 pixels1 = 0.0f;

1282

half4 pixels2 = 0.0f;

1283

half4 pixels3 = 0.0f;

1284

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1285

// Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

1286

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

1287

__global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1288

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1289

#if(DILATION_X == 1 && DILATION_Y == 1)

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1290

// Load the weights

1291

half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));

1292

half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));

1293

half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

1294

1295

// Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor

1296

half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0

1297

half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1

1298

half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2

1299

half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3

1300

half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

1301

half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5

1302

1303

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);

1304

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);

1305

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);

1306

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);

1307

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);

1308

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);

1309

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);

1310

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);

1311

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);

1312

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);

1313

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);

1314

CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);

1315

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1316

#else /* DILATION_X==1 && DILATION_Y==1 */

1317

1318

//3x3 Convolution of elements starting in 0th row

1319

pixels0 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);

1320

//3x3 Convolution of elements starting in 1st row

1321

pixels1 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 1, weights_addr, weights_stride_y);

1322

//3x3 Convolution of elements starting in 2nd row

1323

pixels2 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);

1324

//3x3 Convolution of elements starting in 3rd row

1325

pixels3 = convolution_3x3_dilation_stridex1_stridey1_bifrost_f16(src_addr, src.stride_x, src.stride_y, 3, weights_addr, weights_stride_y);

1326

1327

#endif /* DILATION_X==1 && DILATION_Y==1 */

1328

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1329

#ifdef HAS_BIAS

1330

pixels0 += (half4)bias;

1331

pixels1 += (half4)bias;

1332

pixels2 += (half4)bias;

1333

pixels3 += (half4)bias;

1334

#endif /* defined(HAS_BIAS) */

1335

1336

vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));

1337

vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));

1338

vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));

1339

vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));

1340

}

1341

1342

/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3

1343

* when both stride_x and stride_y are equal to 2

1344

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1345

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16

1346

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1347

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1348

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1349

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1350

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1351

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1352

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1353

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1354

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1355

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1356

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1357

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1358

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1359

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1360

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1361

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

1362

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1363

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1364

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1365

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1366

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1367

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

1368

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1369

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr

1370

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1371

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1372

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1373

*/

1374

__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(

1375

TENSOR3D_DECLARATION(src),

1376

TENSOR3D_DECLARATION(dst),

1377

TENSOR3D_DECLARATION(weights)

1378

#if defined(HAS_BIAS)

1379

,

1380

VECTOR_DECLARATION(biases)

1381

#endif //defined(HAS_BIAS)

1382

)

1383

{

1384

Image src = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);

1385

Image dst = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1386

Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);

1387

1388

// Extract channel and linearized batch indices

1389

const int channel = get_global_id(2) % DST_CHANNELS;

1390

const int batch = get_global_id(2) / DST_CHANNELS;

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1391

1392

#ifdef HAS_BIAS

1393

Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

1394

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1395

half bias = *((__global half *)(vector_offset(&biases, channel)));

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1396

#endif /* defined(HAS_BIAS) */

1397

1398

half4 pixels0 = 0.0f;

1399

half4 pixels1 = 0.0f;

1400

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1401

// Load relevant input and weights data ( Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)

1402

__global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;

1403

__global uchar *src_addr = src.ptr - batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z - (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1404

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1405

#if(DILATION_X == 1 && DILATION_Y == 1)

1406

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1407

// Load the weights

1408

half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));

1409

half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));

1410

half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

1411

1412

// Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor

1413

half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0

1414

half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0

1415

half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1

1416

half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1

1417

half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2

1418

half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2

1419

half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3

1420

half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3

1421

half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

1422

half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

1423

1424

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);

1425

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);

1426

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);

1427

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);

1428

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);

1429

CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);

1430

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1431

#else /* DILATION_X==1 && DILATION_Y==1 */

1432

//3x3 Convolution of elements starting in 0th row

1433

pixels0 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 0, weights_addr, weights_stride_y);

1434

//3x3 Convolution of elements starting in 2nd row

1435

pixels1 = convolution_3x3_dilation_stridex2_stridey2_bifrost_f16(src_addr, src.stride_x, src.stride_y, 2, weights_addr, weights_stride_y);

1436

#endif /* DILATION_X==1 && DILATION_Y==1 */

1437

Michele Di Giorgio

2018-02-21 10:02:58 +0000

[diff] [blame]

1438

#ifdef HAS_BIAS

1439

pixels0 += (half4)bias;

1440

pixels1 += (half4)bias;

1441

#endif /* defined(HAS_BIAS) */

1442

1443

vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));

1444

vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));

1445

}

Georgios Pinitas

2018-09-13 17:20:04 +0100

[diff] [blame]

1446

#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1447

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1448

#if defined(VEC_SIZE) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT) && defined(DATA_TYPE)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1449

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1450

#if DATA_TYPE != float || DATA_TYPE != half

1451

#error "Unsupported data type"

1452

#endif // DATA_TYPE != float || DATA_TYPE != half

1453

1454

#define VEC_FLOAT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1455

1456

#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

1457

/** This function computes the depthwise convolution for NHWC data layout when the stride along the width or height is not 1.

1458

*

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1459

* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1460

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

1461

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

1462

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1463

* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

1464

* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)

1465

* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)

1466

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1467

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

1468

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1469

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1470

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1471

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1472

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1473

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1474

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1475

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1476

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1477

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr

1478

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1479

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1480

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1481

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1482

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1483

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1484

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1485

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1486

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1487

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as src_ptr

1488

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1489

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1490

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1491

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1492

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1493

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

1494

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1495

* @param[in] max_offset Max offset for the input tensor

1496

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr

1497

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1498

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1499

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1500

*/

1501

__kernel void depthwise_convolution_3x3_nhwc(

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1502

TENSOR4D_DECLARATION(src),

1503

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1504

TENSOR3D_DECLARATION(weights),

1505

#if defined(HAS_BIAS)

1506

VECTOR_DECLARATION(biases),

1507

#endif /* defined(HAS_BIAS) */

1508

int max_offset)

1509

{

1510

int x = get_global_id(0); // channels

1511

int y = get_global_id(1); // spatial coordinate x

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1512

#if defined(DST_DEPTH)

1513

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1514

int b = get_global_id(2) / (int)DST_DEPTH; // batch

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1515

#else // defined(DST_DEPTH)

1516

int z = get_global_id(2); // spatial coordinate y

1517

#endif // defined(DST_DEPTH)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1518

1519

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

1520

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1521

#if defined(DST_DEPTH)

1522

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;

1523

#else /* defined(DST_DEPTH) */

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1524

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1525

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1526

1527

int z_coord = 0;

1528

int4 offset = 0;

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1529

int4 y_offset = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3) - CONV_PAD_LEFT) * (int4)src_stride_y;

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1530

1531

// We compute VEC_SIZEx1x1 [C,W,H] elements

1532

VEC_FLOAT acc = 0;

1533

1534

// Load weights

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1535

VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));

1536

VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));

1537

VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));

1538

VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));

1539

VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));

1540

VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));

1541

VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));

1542

VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));

1543

VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

1548

// z_coord is cast to unsigned int in order to use just a min() operation

1549

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

1550

z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;

1551

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

1552

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-07-10 17:03:11 +0100

[diff] [blame]

1553

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1554

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1555

VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1556

VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1557

VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1558

1559

// z == 1

1560

// z_coord can be only negative for z = 0 so we do not need to clamp it

1561

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1562

z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1563

offset = y_offset + (int4)(z_coord * src_stride_z);

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1564

VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1565

VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1566

VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1567

1568

// z == 2

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1569

// Offset can be out-of-bound so we need to check if it is greater than max_offset

1570

z_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;

1571

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-07-10 17:03:11 +0100

[diff] [blame]

1572

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1573

VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1574

VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1575

VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1576

1577

acc = fma(values0, w0, acc);

1578

acc = fma(values1, w1, acc);

1579

acc = fma(values2, w2, acc);

1580

1581

acc = fma(values3, w3, acc);

1582

acc = fma(values4, w4, acc);

1583

acc = fma(values5, w5, acc);

1584

1585

acc = fma(values6, w6, acc);

1586

acc = fma(values7, w7, acc);

1587

acc = fma(values8, w8, acc);

1588

1589

#if defined(HAS_BIAS)

1590

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1591

VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1592

acc += bias_values;

1593

#endif // defined(HAS_BIAS)

1594

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1595

#if defined(DST_DEPTH)

1596

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;

1597

#else /* defined(DST_DEPTH) */

1598

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;

1599

#endif /* defined(DST_DEPTH) */

1600

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1601

VSTORE(VEC_SIZE)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1602

(acc, 0, (__global DATA_TYPE *)(dst_addr));

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1603

}

1604

#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y)

1605

1606

#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

1607

/** This function computes the depthwise convolution for NHWC data layout when the stride along the width and height is 1.

1608

*

1609

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=2)

1610

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

1611

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (e.g. -DNUM_ROWS_PROCESSED=2)

1612

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (e.g. -DNUM_PLANES_PROCESSED=2)

1613

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1614

* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

1615

*

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1616

* @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32

1617

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1618

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1619

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1620

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1621

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1622

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1623

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1624

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1625

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1626

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as src_ptr

1627

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1628

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1629

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1630

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1631

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1632

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1633

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1634

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1635

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1636

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as src_ptr

1637

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1638

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1639

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1640

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1641

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1642

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

1643

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1644

* @param[in] max_offset Max offset for the input tensor

1645

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as src_ptr

1646

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1647

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1648

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1649

*/

1650

__kernel void depthwise_convolution_3x3_nhwc_stride1(

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1651

TENSOR4D_DECLARATION(src),

1652

TENSOR4D_DECLARATION(dst),

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1653

TENSOR3D_DECLARATION(weights),

1654

#if defined(HAS_BIAS)

1655

VECTOR_DECLARATION(biases),

1656

#endif /* defined(HAS_BIAS) */

1657

int max_offset)

1658

{

1659

int x = get_global_id(0); // channels

1660

int y = get_global_id(1); // spatial coordinate x

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1661

#if defined(DST_DEPTH)

1662

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1663

int b = get_global_id(2) / (int)DST_DEPTH; // batch

Usama Arif

2019-04-08 17:30:48 +0100

[diff] [blame^]

1664

#else // defined(DST_DEPTH)

1665

int z = get_global_id(2); // spatial coordinate y

1666

#endif // defined(DST_DEPTH)

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1667

1668

Vector weights = CONVERT_TO_VECTOR_STRUCT(weights);

1669

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1670

#if defined(DST_DEPTH)

1671

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE + b * src_stride_w;

1672

#else /* defined(DST_DEPTH) */

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1673

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * VEC_SIZE;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1674

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1675

1676

int z_coord = 0;

1677

int4 offset = 0;

Georgios Pinitas

2018-07-10 17:03:11 +0100

[diff] [blame]

1678

int4 y_offset = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3) - (int)CONV_PAD_LEFT) * (int4)src_stride_y;

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1679

1680

// We compute 2x2x2 [C,W,H] elements

VEC_FLOAT acc0 = 0;

VEC_FLOAT acc1 = 0;

VEC_FLOAT acc2 = 0;

VEC_FLOAT acc3 = 0;

// Load weights

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1687

VEC_FLOAT w0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 0 * weights_stride_z));

1688

VEC_FLOAT w1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 0 * weights_stride_z));

1689

VEC_FLOAT w2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 0 * weights_stride_z));

1690

VEC_FLOAT w3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 1 * weights_stride_z));

1691

VEC_FLOAT w4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 1 * weights_stride_z));

1692

VEC_FLOAT w5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 1 * weights_stride_z));

1693

VEC_FLOAT w6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 0 * weights_stride_y + 2 * weights_stride_z));

1694

VEC_FLOAT w7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 1 * weights_stride_y + 2 * weights_stride_z));

1695

VEC_FLOAT w8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(weights.ptr + 2 * weights_stride_y + 2 * weights_stride_z));

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

1700

// z_coord is casted to unsigned int in order to use just a min() operation

1701

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-07-10 17:03:11 +0100

[diff] [blame]

1702

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1703

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

1704

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-07-10 17:03:11 +0100

[diff] [blame]

1705

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1706

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1707

VEC_FLOAT values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1708

VEC_FLOAT values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1709

VEC_FLOAT values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1710

VEC_FLOAT values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1711

1712

// z == 1

1713

// z_coord can be only negative for z = 0 so we do not need to clamp it

1714

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Georgios Pinitas

2018-07-10 17:03:11 +0100

[diff] [blame]

1715

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1716

offset = y_offset + (int4)(z_coord * src_stride_z);

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1717

VEC_FLOAT values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1718

VEC_FLOAT values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1719

VEC_FLOAT values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1720

VEC_FLOAT values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1721

1722

// z == 2

1723

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1724

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1725

offset += (int4)src_stride_z;

Georgios Pinitas

2018-07-10 17:03:11 +0100

[diff] [blame]

1726

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1727

VEC_FLOAT values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1728

VEC_FLOAT values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1729

VEC_FLOAT values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1730

VEC_FLOAT values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1731

1732

// z == 3

1733

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1734

// However offset can be out-of-bound so we need to check if it is greater than max_offset

Georgios Pinitas

2018-07-10 17:03:11 +0100

[diff] [blame]

1735

offset += (int4)src_stride_z;

1736

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1737

VEC_FLOAT values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1738

VEC_FLOAT values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1739

VEC_FLOAT values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1740

VEC_FLOAT values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

1741

1742

acc0 = fma(values0, w0, acc0);

1743

acc0 = fma(values1, w1, acc0);

1744

acc0 = fma(values2, w2, acc0);

1745

acc1 = fma(values1, w0, acc1);

1746

acc1 = fma(values2, w1, acc1);

1747

acc1 = fma(values3, w2, acc1);

1748

1749

acc0 = fma(values4, w3, acc0);

1750

acc0 = fma(values5, w4, acc0);

1751

acc0 = fma(values6, w5, acc0);

1752

acc1 = fma(values5, w3, acc1);

1753

acc1 = fma(values6, w4, acc1);

1754

acc1 = fma(values7, w5, acc1);

1755

1756

acc0 = fma(values8, w6, acc0);

1757

acc0 = fma(values9, w7, acc0);

1758

acc0 = fma(values10, w8, acc0);

1759

acc1 = fma(values9, w6, acc1);

1760

acc1 = fma(values10, w7, acc1);

1761

acc1 = fma(values11, w8, acc1);

1762

1763

acc2 = fma(values4, w0, acc2);

1764

acc2 = fma(values5, w1, acc2);

1765

acc2 = fma(values6, w2, acc2);

1766

acc3 = fma(values5, w0, acc3);

1767

acc3 = fma(values6, w1, acc3);

1768

acc3 = fma(values7, w2, acc3);

1769

1770

acc2 = fma(values8, w3, acc2);

1771

acc2 = fma(values9, w4, acc2);

1772

acc2 = fma(values10, w5, acc2);

1773

acc3 = fma(values9, w3, acc3);

1774

acc3 = fma(values10, w4, acc3);

1775

acc3 = fma(values11, w5, acc3);

1776

1777

acc2 = fma(values12, w6, acc2);

1778

acc2 = fma(values13, w7, acc2);

1779

acc2 = fma(values14, w8, acc2);

1780

acc3 = fma(values13, w6, acc3);

1781

acc3 = fma(values14, w7, acc3);

1782

acc3 = fma(values15, w8, acc3);

1783

1784

#if defined(HAS_BIAS)

1785

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

1786

Giorgio Arena

2018-08-23 11:19:11 +0100

[diff] [blame]

1787

VEC_FLOAT bias_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)biases.ptr);

Giorgio Arena

2018-06-20 11:46:42 +0100

[diff] [blame]

acc0 += bias_values;

acc1 += bias_values;

acc2 += bias_values;

acc3 += bias_values;

#endif // defined(HAS_BIAS)

1794

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1795

#if defined(DST_DEPTH)

1796

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;

1797

#else /* defined(DST_DEPTH) */

Giorgio Arena