Blame - src/core/CL/cl_kernels/depthwise_convolution_quantized.cl - ml/ComputeLibrary

2018-07-02 15:29:57 +0100

[diff] [blame]

708

709

#if WEIGHTS_OFFSET != 0

710

#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \

711

({ \

712

sum += CONVERT(x, VEC_INT); \

713

MULTIPLY_ADD(x, y, acc); \

714

})

715

#else /* WEIGHTS_OFFSET != 0 */

716

#define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)

717

#endif /* WEIGHTS_OFFSET != 0 */

718

Georgios Pinitas

daa3855

2018-08-28 17:43:18 +0100

[diff] [blame]

719

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

720

#define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \

721

({ \

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

722

ARM_DOT((VEC_TYPE(4))(val0, val1, val2, val3), w0.s0123, acc); \

723

ARM_DOT((VEC_TYPE(4))(val4, val5, val6, val7), w0.s4567, acc); \

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

724

acc += val8 * w1; \

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

725

})

726

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

727

#define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \

728

({ \

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

729

sum = val0; \

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

730

ARM_DOT((VEC_TYPE(4))(val1, val2, val3, val4), (VEC_TYPE(4))1, sum); \

731

ARM_DOT((VEC_TYPE(4))(val5, val6, val7, val8), (VEC_TYPE(4))1, sum); \

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

732

})

733

734

#define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \

735

({ \

736

sum = w1; \

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

737

ARM_DOT(w0.s0123, (VEC_TYPE(4))1, sum); \

738

ARM_DOT(w0.s4567, (VEC_TYPE(4))1, sum); \

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

739

})

740

Georgios Pinitas

daa3855

2018-08-28 17:43:18 +0100

[diff] [blame]

741

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

742

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

743

#if defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

744

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width or height is not 1.

745

*

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

746

* @note This kernel assumes VEC_SIZE is 4.

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

747

* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

748

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=4)

749

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

750

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

751

* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

752

* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)

753

* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)

754

*

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

755

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8

756

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

757

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

758

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

759

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

760

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

761

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

762

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

763

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

764

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

765

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

766

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

767

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

768

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

769

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

770

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

771

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

772

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

773

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

774

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

775

* @param[in] weights_ptr Pointer to the weights tensor reshaped. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL

776

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

777

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

778

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

779

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

780

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

781

* @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32

782

* @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)

783

* @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)

784

* @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector

785

* @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32

786

* @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)

787

* @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)

788

* @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector

789

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32

790

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

791

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

792

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

793

* @param[in] max_offset Max offset for the input tensor

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

794

*/

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

795

__kernel void dwc_3x3_reshaped_quantized8_nhwc(

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

796

TENSOR4D_DECLARATION(src),

797

TENSOR4D_DECLARATION(dst),

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

798

IMAGE_DECLARATION(weights),

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

799

VECTOR_DECLARATION(output_multipliers),

800

VECTOR_DECLARATION(output_shifts),

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

801

#if defined(HAS_BIAS)

802

VECTOR_DECLARATION(biases),

803

#endif /* defined(HAS_BIAS) */

804

int max_offset)

805

{

806

const int x = get_global_id(0); // channels

807

const int y = get_global_id(1); // spatial coordinate x

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

808

#if defined(DST_DEPTH)

809

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

810

int b = get_global_id(2) / (int)DST_DEPTH; // batch

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

811

#else // defined(DST_DEPTH)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

812

int z = get_global_id(2); // spatial coordinate y

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

813

#endif // defined(DST_DEPTH)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

814

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

815

__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

816

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

817

#if defined(DST_DEPTH)

818

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;

819

#else /* defined(DST_DEPTH) */

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

820

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

821

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

822

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

823

int z_coord = 0;

824

int4 offset = 0;

Usama Arif

e73686a

2019-04-08 17:30:48 +0100

[diff] [blame]

825

int4 y_coord = ((int4)(y * CONV_STRIDE_X) + (int4)(0, DILATION_X * 1, DILATION_X * 2, DILATION_X * 3)) - (int)CONV_PAD_LEFT;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

826

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

827

// Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1

828

y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);

829

y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);

830

y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);

831

y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);

832

833

int4 y_offset = convert_int4(y_coord * (int)src_stride_y);

834

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

835

// We compute VEC_SIZEx1x1 [C,W,H] elements

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

836

VEC_INT acc = 0, sum = 0;

837

838

// Load weights

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

839

VEC_DATA_TYPE(WEIGHTS_TYPE, 16)

840

w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));

841

VEC_DATA_TYPE(WEIGHTS_TYPE, 16)

842

w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));

843

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

844

w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16));

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

845

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

846

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

847

w0 = w0_tmp.s0123;

848

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

849

w1 = w0_tmp.s4567;

850

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

851

w2 = w0_tmp.s89AB;

852

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

853

w3 = w0_tmp.sCDEF;

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

854

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

855

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

856

w4 = w1_tmp.s0123;

857

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

858

w5 = w1_tmp.s4567;

859

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

860

w6 = w1_tmp.s89AB;

861

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

862

w7 = w1_tmp.sCDEF;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

863

864

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

865

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

866

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

867

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

868

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

873

// z_coord is cast to unsigned int in order to use just a min() operation

874

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

875

z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

876

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

877

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

878

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

879

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

880

VEC_TYPE(VEC_SIZE)

881

values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

882

VEC_TYPE(VEC_SIZE)

883

values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

884

VEC_TYPE(VEC_SIZE)

885

values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

886

887

// z == 1

888

// z_coord can be only negative for z = 0 so we do not need to clamp it

889

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

890

z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y;

891

offset = y_offset + (int4)(z_coord * src_stride_z);

892

VEC_TYPE(VEC_SIZE)

893

values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

894

VEC_TYPE(VEC_SIZE)

895

values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

896

VEC_TYPE(VEC_SIZE)

897

values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

898

899

// z == 2

Usama Arif

e73686a

2019-04-08 17:30:48 +0100

[diff] [blame]

900

// Offset can be out-of-bound so we need to check if it is greater than max_offset

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

901

z_coord = z * (int)CONV_STRIDE_Y - (int)CONV_PAD_TOP + DILATION_Y * 2;

902

offset = y_offset + (int4)(z_coord * src_stride_z);

903

offset = min(offset, (int4)max_offset);

904

VEC_TYPE(VEC_SIZE)

905

values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

906

VEC_TYPE(VEC_SIZE)

907

values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

908

VEC_TYPE(VEC_SIZE)

909

values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

910

911

MULTIPLY_ADD_ACCUMULATE(values0, w0, acc, sum);

912

MULTIPLY_ADD_ACCUMULATE(values1, w1, acc, sum);

913

MULTIPLY_ADD_ACCUMULATE(values2, w2, acc, sum);

914

915

MULTIPLY_ADD_ACCUMULATE(values3, w3, acc, sum);

916

MULTIPLY_ADD_ACCUMULATE(values4, w4, acc, sum);

917

MULTIPLY_ADD_ACCUMULATE(values5, w5, acc, sum);

918

919

MULTIPLY_ADD_ACCUMULATE(values6, w6, acc, sum);

920

MULTIPLY_ADD_ACCUMULATE(values7, w7, acc, sum);

921

MULTIPLY_ADD_ACCUMULATE(values8, w8, acc, sum);

922

923

#if defined(HAS_BIAS)

924

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

925

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

926

acc += bias_values;

927

#endif // defined(HAS_BIAS)

928

929

#if WEIGHTS_OFFSET != 0

930

acc += WEIGHTS_OFFSET * sum;

931

#endif /* WEIGHTS_OFFSET != 0 */

932

933

#if INPUT_OFFSET != 0

934

acc += INPUT_OFFSET * sum_we;

935

#endif /* INPUT_OFFSET != 0 */

936

937

#if K_OFFSET != 0

938

acc += (VEC_INT)K_OFFSET;

939

#endif /* K_OFFSET != 0 */

940

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

941

#if defined(REAL_MULTIPLIER)

942

943

acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

944

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

945

#else // defined(REAL_MULTIPLIER)

946

#if defined(PER_CHANNEL_QUANTIZATION)

947

Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);

948

Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts);

949

VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);

950

VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);

951

#else // defined(PER_CHANNEL_QUANTIZATION)

952

const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));

953

const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));

954

#endif // defined(PER_CHANNEL_QUANTIZATION)

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

955

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

956

acc = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);

957

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

958

#endif // defined(REAL_MULTIPLIER)

959

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

960

acc += (VEC_INT)OUTPUT_OFFSET;

961

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

962

VEC_TYPE(VEC_SIZE)

963

res = CONVERT_SAT(acc, VEC_TYPE(VEC_SIZE));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

964

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

965

#if defined(DST_DEPTH)

966

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;

967

#else /* defined(DST_DEPTH) */

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

968

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

969

#endif /* defined(DST_DEPTH) */

970

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

971

VSTORE(VEC_SIZE)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

972

(ACTIVATION_FUNC(res), 0, (__global DATA_TYPE *)(dst_addr));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

973

}

974

#endif // defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && VEC_SIZE == 4

975

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

976

#if defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED) && VEC_SIZE == 4

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

977

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1.

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

978

*

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

979

* @note This kernel assumes VEC_SIZE is 4.

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

980

* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

981

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=4)

982

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

983

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (e.g. -DNUM_ROWS_PROCESSED=2)

984

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (e.g. -DNUM_PLANES_PROCESSED=2)

985

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

986

* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).

987

*

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

988

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8

989

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

990

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

991

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

992

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

993

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

994

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

995

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

996

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

997

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

998

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

999

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1000

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1001

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1002

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1003

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1004

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1005

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1006

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1007

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1008

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL

1009

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1010

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1011

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1012

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1013

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1014

* @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32

1015

* @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)

1016

* @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)

1017

* @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector

1018

* @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32

1019

* @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)

1020

* @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)

1021

* @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector

1022

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32

1023

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1024

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1025

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1026

* @param[in] max_offset Max offset for the input tensor

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1027

*/

1028

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1029

__kernel void dwc_3x3_reshaped_quantized8_stride1_nhwc(

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1030

TENSOR4D_DECLARATION(src),

1031

TENSOR4D_DECLARATION(dst),

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1032

IMAGE_DECLARATION(weights),

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1033

VECTOR_DECLARATION(output_multipliers),

1034

VECTOR_DECLARATION(output_shifts),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1035

#if defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1036

VECTOR_DECLARATION(biases),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1037

#endif /* defined(HAS_BIAS) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1038

int max_offset)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1039

{

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

1040

int x = get_global_id(0);

1041

int y = get_global_id(1);

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1042

#if defined(DST_DEPTH)

1043

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1044

int b = get_global_id(2) / (int)DST_DEPTH; // batch

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1045

#else // defined(DST_DEPTH)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1046

int z = get_global_id(2); // spatial coordinate y

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1047

#endif // defined(DST_DEPTH)

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

1048

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1049

__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1050

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1051

#if defined(DST_DEPTH)

1052

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;

1053

#else /* defined(DST_DEPTH) */

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1054

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1055

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1056

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1057

int z_coord = 0;

1058

int4 offset = 0;

1059

int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1060

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1061

// Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1

1062

y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);

1063

y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);

1064

y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);

1065

y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);

1066

1067

int4 y_offset = convert_int4(y_coord * (int)src_stride_y);

1068

1069

// We compute 4x2x2 [C,W,H] elements

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1070

VEC_INT acc0 = 0, sum0 = 0;

1071

VEC_INT acc1 = 0, sum1 = 0;

1072

VEC_INT acc2 = 0, sum2 = 0;

1073

VEC_INT acc3 = 0, sum3 = 0;

1074

1075

// Load weights

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1076

VEC_DATA_TYPE(WEIGHTS_TYPE, 16)

1077

w0_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));

1078

VEC_DATA_TYPE(WEIGHTS_TYPE, 16)

1079

w1_tmp = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));

1080

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1081

w8 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * 16));

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1082

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1083

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1084

w0 = w0_tmp.s0123;

1085

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1086

w1 = w0_tmp.s4567;

1087

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1088

w2 = w0_tmp.s89AB;

1089

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1090

w3 = w0_tmp.sCDEF;

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1091

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1092

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1093

w4 = w1_tmp.s0123;

1094

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1095

w5 = w1_tmp.s4567;

1096

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1097

w6 = w1_tmp.s89AB;

1098

VEC_DATA_TYPE(WEIGHTS_TYPE, 4)

1099

w7 = w1_tmp.sCDEF;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1100

1101

#if INPUT_OFFSET != 0

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1102

VEC_INT sum_we = CONVERT(w0, VEC_INT) + CONVERT(w1, VEC_INT) + CONVERT(w2, VEC_INT)

1103

+ CONVERT(w3, VEC_INT) + CONVERT(w4, VEC_INT) + CONVERT(w5, VEC_INT)

1104

+ CONVERT(w6, VEC_INT) + CONVERT(w7, VEC_INT) + CONVERT(w8, VEC_INT);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1105

#endif /* INPUT_OFFSET != 0 */

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

1110

// z_coord is cast to unsigned int in order to use just a min() operation

1111

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1112

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1113

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

1114

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1115

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1116

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1117

VEC_TYPE(VEC_SIZE)

1118

values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1119

VEC_TYPE(VEC_SIZE)

1120

values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1121

VEC_TYPE(VEC_SIZE)

1122

values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1123

VEC_TYPE(VEC_SIZE)

1124

values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1125

1126

// z == 1

1127

// z_coord can be only negative for z = 0 so we do not need to clamp it

1128

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1129

z_coord = z * (int)NUM_PLANES_PROCESSED - (int)CONV_PAD_TOP + 1;

1130

offset = y_offset + (int4)(z_coord * src_stride_z);

1131

VEC_TYPE(VEC_SIZE)

1132

values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1133

VEC_TYPE(VEC_SIZE)

1134

values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1135

VEC_TYPE(VEC_SIZE)

1136

values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1137

VEC_TYPE(VEC_SIZE)

1138

values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1139

1140

// z == 2

1141

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1142

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1143

offset += (int4)src_stride_z;

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1144

offset = min(offset, (int4)max_offset);

1145

VEC_TYPE(VEC_SIZE)

1146

values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1147

VEC_TYPE(VEC_SIZE)

1148

values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1149

VEC_TYPE(VEC_SIZE)

1150

values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1151

VEC_TYPE(VEC_SIZE)

1152

values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1153

1154

// z == 3

1155

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1156

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1157

offset += (int4)(src_stride_z);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1158

offset = min(offset, (int4)max_offset);

1159

VEC_TYPE(VEC_SIZE)

1160

values12 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1161

VEC_TYPE(VEC_SIZE)

1162

values13 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1163

VEC_TYPE(VEC_SIZE)

1164

values14 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1165

VEC_TYPE(VEC_SIZE)

1166

values15 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1167

1168

MULTIPLY_ADD_ACCUMULATE(values0, w0, acc0, sum0);

1169

MULTIPLY_ADD_ACCUMULATE(values1, w1, acc0, sum0);

1170

MULTIPLY_ADD_ACCUMULATE(values2, w2, acc0, sum0);

1171

MULTIPLY_ADD_ACCUMULATE(values1, w0, acc1, sum1);

1172

MULTIPLY_ADD_ACCUMULATE(values2, w1, acc1, sum1);

1173

MULTIPLY_ADD_ACCUMULATE(values3, w2, acc1, sum1);

1174

1175

MULTIPLY_ADD_ACCUMULATE(values4, w3, acc0, sum0);

1176

MULTIPLY_ADD_ACCUMULATE(values5, w4, acc0, sum0);

1177

MULTIPLY_ADD_ACCUMULATE(values6, w5, acc0, sum0);

1178

MULTIPLY_ADD_ACCUMULATE(values5, w3, acc1, sum1);

1179

MULTIPLY_ADD_ACCUMULATE(values6, w4, acc1, sum1);

1180

MULTIPLY_ADD_ACCUMULATE(values7, w5, acc1, sum1);

1181

1182

MULTIPLY_ADD_ACCUMULATE(values8, w6, acc0, sum0);

1183

MULTIPLY_ADD_ACCUMULATE(values9, w7, acc0, sum0);

1184

MULTIPLY_ADD_ACCUMULATE(values10, w8, acc0, sum0);

1185

MULTIPLY_ADD_ACCUMULATE(values9, w6, acc1, sum1);

1186

MULTIPLY_ADD_ACCUMULATE(values10, w7, acc1, sum1);

1187

MULTIPLY_ADD_ACCUMULATE(values11, w8, acc1, sum1);

1188

1189

MULTIPLY_ADD_ACCUMULATE(values4, w0, acc2, sum2);

1190

MULTIPLY_ADD_ACCUMULATE(values5, w1, acc2, sum2);

1191

MULTIPLY_ADD_ACCUMULATE(values6, w2, acc2, sum2);

1192

MULTIPLY_ADD_ACCUMULATE(values5, w0, acc3, sum3);

1193

MULTIPLY_ADD_ACCUMULATE(values6, w1, acc3, sum3);

1194

MULTIPLY_ADD_ACCUMULATE(values7, w2, acc3, sum3);

1195

1196

MULTIPLY_ADD_ACCUMULATE(values8, w3, acc2, sum2);

1197

MULTIPLY_ADD_ACCUMULATE(values9, w4, acc2, sum2);

1198

MULTIPLY_ADD_ACCUMULATE(values10, w5, acc2, sum2);

1199

MULTIPLY_ADD_ACCUMULATE(values9, w3, acc3, sum3);

1200

MULTIPLY_ADD_ACCUMULATE(values10, w4, acc3, sum3);

1201

MULTIPLY_ADD_ACCUMULATE(values11, w5, acc3, sum3);

1202

1203

MULTIPLY_ADD_ACCUMULATE(values12, w6, acc2, sum2);

1204

MULTIPLY_ADD_ACCUMULATE(values13, w7, acc2, sum2);

1205

MULTIPLY_ADD_ACCUMULATE(values14, w8, acc2, sum2);

1206

MULTIPLY_ADD_ACCUMULATE(values13, w6, acc3, sum3);

1207

MULTIPLY_ADD_ACCUMULATE(values14, w7, acc3, sum3);

1208

MULTIPLY_ADD_ACCUMULATE(values15, w8, acc3, sum3);

1209

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1210

#if defined(HAS_BIAS)

1211

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

1212

1213

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1214

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

acc0 += bias_values;

acc1 += bias_values;

acc2 += bias_values;

acc3 += bias_values;

#endif /* defined(HAS_BIAS) */

1220

1221

#if WEIGHTS_OFFSET != 0

1222

acc0 += WEIGHTS_OFFSET * sum0;

1223

acc1 += WEIGHTS_OFFSET * sum1;

1224

acc2 += WEIGHTS_OFFSET * sum2;

1225

acc3 += WEIGHTS_OFFSET * sum3;

1226

#endif /* WEIGHTS_OFFSET != 0 */

1227

1228

#if INPUT_OFFSET != 0

1229

VEC_INT offs = INPUT_OFFSET * sum_we;

acc0 += offs;

acc1 += offs;

acc2 += offs;

acc3 += offs;

#endif /* INPUT_OFFSET != 0 */

1236

1237

#if K_OFFSET != 0

1238

acc0 += (VEC_INT)K_OFFSET;

1239

acc1 += (VEC_INT)K_OFFSET;

1240

acc2 += (VEC_INT)K_OFFSET;

1241

acc3 += (VEC_INT)K_OFFSET;

1242

#endif /* K_OFFSET != 0 */

1243

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1244

#if defined(REAL_MULTIPLIER)

1245

1246

acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1247

acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1248

acc2 = CONVERT(round(CONVERT(acc2, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1249

acc3 = CONVERT(round(CONVERT(acc3, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1250

1251

#else // defined(REAL_MULTIPLIER)

1252

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1253

#if defined(PER_CHANNEL_QUANTIZATION)

1254

Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);

1255

Vector output_shifts = CONVERT_TO_VECTOR_STRUCT(output_shifts);

1256

VEC_INT output_multiplier = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);

1257

VEC_INT output_shift = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);

1258

#else // defined(PER_CHANNEL_QUANTIZATION)

1259

const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));

1260

const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));

1261

#endif // defined(PER_CHANNEL_QUANTIZATION)

1262

1263

acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);

1264

acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);

1265

acc2 = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);

1266

acc3 = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1267

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1268

#endif // defined(REAL_MULTIPLIER)

1269

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1270

acc0 += (VEC_INT)OUTPUT_OFFSET;

1271

acc1 += (VEC_INT)OUTPUT_OFFSET;

1272

acc2 += (VEC_INT)OUTPUT_OFFSET;

1273

acc3 += (VEC_INT)OUTPUT_OFFSET;

1274

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1275

VEC_TYPE(VEC_SIZE)

1276

res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE));

1277

VEC_TYPE(VEC_SIZE)

1278

res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE));

1279

VEC_TYPE(VEC_SIZE)

1280

res2 = CONVERT_SAT(acc2, VEC_TYPE(VEC_SIZE));

1281

VEC_TYPE(VEC_SIZE)

1282

res3 = CONVERT_SAT(acc3, VEC_TYPE(VEC_SIZE));

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1283

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1284

#if defined(DST_DEPTH)

1285

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z + b * dst_stride_w;

1286

#else /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1287

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + (z * NUM_PLANES_PROCESSED) * dst_step_z;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1288

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1289

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1290

VSTORE(VEC_SIZE)

Georgios Pinitas

60e9825

2018-10-22 16:17:20 +0100

[diff] [blame]

1291

(ACTIVATION_FUNC(res0), 0, dst_addr + 0 * dst_stride_y);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1292

VSTORE(VEC_SIZE)

Georgios Pinitas

60e9825

2018-10-22 16:17:20 +0100

[diff] [blame]

1293

(ACTIVATION_FUNC(res1), 0, dst_addr + 1 * dst_stride_y);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1294

1295

#if((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1296

if((z * NUM_PLANES_PROCESSED + 1) < DST_DIM_2)

1297

#endif // ((DST_DIM_2 % NUM_PLANES_PROCESSED) != 0)

1298

{

1299

VSTORE(VEC_SIZE)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1300

(ACTIVATION_FUNC(res2), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y + 1 * dst_stride_z));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1301

VSTORE(VEC_SIZE)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1302

(ACTIVATION_FUNC(res3), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y + 1 * dst_stride_z));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1303

}

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1304

}

1305

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1306

#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE == 4

1307

/** This function computes the depthwise convolution quantized for NHWC data layout when the stride along the width and height is 1 using dot product.

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1308

*

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1309

* @note Per-channel quantization is not supported by this kernel.

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1310

* @note This kernel assumes VEC_SIZE is 4.

1311

* @note The weights tensor is expected to be reshaped using @ref CLDepthwiseConvolutionLayerReshapeWeightsKernel.

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1312

* @note The number of elements read per thread must be passed at compile time using -DVEC_SIZE (e.g. -DVEC_SIZE=4)

1313

* @note Dimension two of the input tensor (height for NHWC data layout) must be passed at compile time using -DSRC_DIM_2 (e.g. -DSRC_DIM_2=112)

1314

* @note The number of rows processed per thread must be passed at compile time using -DNUM_ROWS_PROCESSED (i.e. -DNUM_ROWS_PROCESSED=2)

1315

* @note The number of planes processed per thread must be passed at compile time using -DNUM_PLANES_PROCESSED (i.e. -DNUM_PLANES_PROCESSED=2)

1316

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1317

* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1).

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1318

* @note If REAL_MULTIPLIER is passed at compile time (i.e. -DREAL_MULTIPLIER=1.355f), the final quantization is performed using a floating point multiplication.

1319

* If not, the quantization will be performed using a fixed point multiplication

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1320

*

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1321

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8

1322

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1323

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1324

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1325

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1326

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1327

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1328

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1329

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1330

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1331

* @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1332

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1333

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1334

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1335

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1336

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1337

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1338

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1339

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1340

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1341

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr

1342

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1343

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1344

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1345

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1346

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1347

* @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32

1348

* @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)

1349

* @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)

1350

* @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector

1351

* @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32

1352

* @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)

1353

* @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)

1354

* @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector

1355

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32

1356

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1357

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1358

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

1359

* @param[in] max_offset The maximum allowed offset for the input tensor

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1360

*/

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1361

__kernel void dwc_3x3_reshaped_quantized8_dot8_stride1_nhwc(

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1362

TENSOR4D_DECLARATION(src),

1363

TENSOR4D_DECLARATION(dst),

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1364

IMAGE_DECLARATION(weights),

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1365

VECTOR_DECLARATION(output_multipliers),

1366

VECTOR_DECLARATION(output_shifts),

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1367

#if defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1368

VECTOR_DECLARATION(biases),

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1369

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1370

int max_offset)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1371

{

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

1372

int x = get_global_id(0);

1373

int y = get_global_id(1);

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1374

#if defined(DST_DEPTH)

1375

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1376

int b = get_global_id(2) / (int)DST_DEPTH; // batch

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1377

#else // defined(DST_DEPTH)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1378

int z = get_global_id(2); // spatial coordinate y

Pablo Tello

2019-02-27 13:32:51 +0000

[diff] [blame]

1379

#endif // defined(DST_DEPTH)

Giorgio Arena

2018-06-19 11:27:38 +0100

[diff] [blame]

1380

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1381

__global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1382

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1383

#if defined(DST_DEPTH)

1384

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;

1385

#else /* defined(DST_DEPTH) */

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1386

__global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1387

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1388

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1389

int z_coord = 0;

1390

int4 offset = 0;

1391

int4 y_coord = ((int4)(y * NUM_ROWS_PROCESSED) + (int4)(0, 1, 2, 3)) - (int)CONV_PAD_LEFT;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1392

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1393

// Only for y = 0 we can have a negative coordinate. If so, we convert it to SRC_DIM_1

1394

y_coord.s0 = min((uint)y_coord.s0, (uint)SRC_DIM_1);

1395

y_coord.s1 = min((uint)y_coord.s1, (uint)SRC_DIM_1);

1396

y_coord.s2 = min((uint)y_coord.s2, (uint)SRC_DIM_1);

1397

y_coord.s3 = min((uint)y_coord.s3, (uint)SRC_DIM_1);

1398

1399

int4 y_offset = convert_int4(y_coord * (int)src_stride_y);

1400

1401

// We compute 4x2x1 [C,W,H] elements

VEC_INT acc0 = 0;

VEC_INT acc1 = 0;

VEC_INT sum0 = 0;

VEC_INT sum1 = 0;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1406

1407

// Load weights

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1408

VEC_TYPE(16)

1409

w0 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr));

1410

VEC_TYPE(16)

1411

w1 = VLOAD(16)(0, (__global WEIGHTS_TYPE *)(weights_addr + 16));

1412

VEC_TYPE(4)

1413

w2 = VLOAD(4)(0, (__global WEIGHTS_TYPE *)(weights_addr + 32));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1414

1415

#if INPUT_OFFSET != 0

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1416

// Initialize the final result with the weights reduction multiplied by INPUT_OFFSET

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1417

DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s0, w0.s01234567, w0.s8);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1418

DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1419

DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s2, w1.s23456789, w1.sA);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1420

DOT_PRODUCT_REDUCTION_WEIGHTS(acc0.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1421

1422

// Multiply the weights reduction with INPUT_OFFSET

1423

acc0 = INPUT_OFFSET * acc0;

1424

1425

acc1 = acc0;

1426

#endif // INPUT_OFFSET != 0

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

// Load input values

// z == 0

// Clamp z_coord as for z = 0, it can be negative

1431

// z_coord is cast to unsigned int in order to use just a min() operation

1432

// A "-1" 32 bit signed variable converted to unsigned gives 4294967295

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1433

z_coord = z - (int)CONV_PAD_TOP;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1434

z_coord = min((uint)z_coord, (uint)SRC_DIM_2);

1435

offset = y_offset + (int4)(z_coord * src_stride_z);

Georgios Pinitas

2018-08-01 20:16:34 +0100

[diff] [blame]

1436

offset = min(offset, (int4)max_offset);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1437

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1438

VEC_TYPE(VEC_SIZE)

1439

values0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1440

VEC_TYPE(VEC_SIZE)

1441

values1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1442

VEC_TYPE(VEC_SIZE)

1443

values2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1444

VEC_TYPE(VEC_SIZE)

1445

values3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1446

1447

// z == 1

1448

// z_coord can be only negative for z = 0 so we do not need to clamp it

1449

// Moreover z_coord cannot be out-of-bound for z = 1 so we do not need to clamp the offset

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1450

z_coord = z - (int)CONV_PAD_TOP + 1;

1451

offset = y_offset + (int4)(z_coord * src_stride_z);

1452

VEC_TYPE(VEC_SIZE)

1453

values4 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1454

VEC_TYPE(VEC_SIZE)

1455

values5 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1456

VEC_TYPE(VEC_SIZE)

1457

values6 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1458

VEC_TYPE(VEC_SIZE)

1459

values7 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1460

1461

// z == 2

1462

// After z = 1 we can simply add src_stride_z to offset without updating z_coord

1463

// However offset can be out-of-bound so we need to check if it is greater than max_offset

1464

offset += (int4)src_stride_z;

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1465

offset = min(offset, (int4)max_offset);

1466

VEC_TYPE(VEC_SIZE)

1467

values8 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s0));

1468

VEC_TYPE(VEC_SIZE)

1469

values9 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s1));

1470

VEC_TYPE(VEC_SIZE)

1471

values10 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s2));

1472

VEC_TYPE(VEC_SIZE)

1473

values11 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(src_addr + offset.s3));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1474

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1475

DOT_PRODUCT_REDUCTION(sum0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0);

1476

DOT_PRODUCT_REDUCTION(sum1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0);

1477

DOT_PRODUCT(acc0.s0, values0.s0, values1.s0, values2.s0, values4.s0, values5.s0, values6.s0, values8.s0, values9.s0, values10.s0, w0.s01234567, w0.s8);

1478

DOT_PRODUCT(acc1.s0, values1.s0, values2.s0, values3.s0, values5.s0, values6.s0, values7.s0, values9.s0, values10.s0, values11.s0, w0.s01234567, w0.s8);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1479

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1480

DOT_PRODUCT_REDUCTION(sum0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1);

1481

DOT_PRODUCT_REDUCTION(sum1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1482

DOT_PRODUCT(acc0.s1, values0.s1, values1.s1, values2.s1, values4.s1, values5.s1, values6.s1, values8.s1, values9.s1, values10.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);

1483

DOT_PRODUCT(acc1.s1, values1.s1, values2.s1, values3.s1, values5.s1, values6.s1, values7.s1, values9.s1, values10.s1, values11.s1, (VEC_TYPE(8))((w0.s9ABC), (w0.sDEF), w1.s0), w1.s1);

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1484

1485

DOT_PRODUCT_REDUCTION(sum0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2);

1486

DOT_PRODUCT_REDUCTION(sum1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2);

1487

DOT_PRODUCT(acc0.s2, values0.s2, values1.s2, values2.s2, values4.s2, values5.s2, values6.s2, values8.s2, values9.s2, values10.s2, w1.s23456789, w1.sA);

1488

DOT_PRODUCT(acc1.s2, values1.s2, values2.s2, values3.s2, values5.s2, values6.s2, values7.s2, values9.s2, values10.s2, values11.s2, w1.s23456789, w1.sA);

1489

1490

DOT_PRODUCT_REDUCTION(sum0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3);

1491

DOT_PRODUCT_REDUCTION(sum1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3);

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1492

DOT_PRODUCT(acc0.s3, values0.s3, values1.s3, values2.s3, values4.s3, values5.s3, values6.s3, values8.s3, values9.s3, values10.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);

1493

DOT_PRODUCT(acc1.s3, values1.s3, values2.s3, values3.s3, values5.s3, values6.s3, values7.s3, values9.s3, values10.s3, values11.s3, (VEC_TYPE(8))((w1.sBCD), (w1.sEF), (w2.s012)), w2.s3);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1494

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1495

#if defined(HAS_BIAS)

1496

Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);

1497

1498

VEC_INT bias_values = VLOAD(VEC_SIZE)(0, (__global int *)biases.ptr);

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1499

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1500

acc0 += bias_values;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1501

acc1 += bias_values;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1502

1503

#endif // defined(HAS_BIAS)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1504

1505

#if WEIGHTS_OFFSET != 0

1506

acc0 += WEIGHTS_OFFSET * sum0;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1507

acc1 += WEIGHTS_OFFSET * sum1;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1508

#endif // WEIGHTS_OFFSET != 0

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1509

1510

#if K_OFFSET != 0

1511

acc0 += (VEC_INT)K_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1512

acc1 += (VEC_INT)K_OFFSET;

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1513

1514

#endif // K_OFFSET != 0

1515

1516

#if defined(REAL_MULTIPLIER)

1517

1518

acc0 = CONVERT(round(CONVERT(acc0, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1519

acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);

1520

1521

#else // defined(REAL_MULTIPLIER)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1522

const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));

1523

const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1524

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1525

acc0 = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);

1526

acc1 = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1527

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1528

#endif // defined(REAL_MULTIPLIER)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1529

acc0 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1530

acc1 += (VEC_INT)OUTPUT_OFFSET;

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1531

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1532

VEC_TYPE(VEC_SIZE)

1533

res0 = CONVERT_SAT(acc0, VEC_TYPE(VEC_SIZE));

1534

VEC_TYPE(VEC_SIZE)

1535

res1 = CONVERT_SAT(acc1, VEC_TYPE(VEC_SIZE));

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1536

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1537

#if defined(DST_DEPTH)

1538

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z + b * dst_stride_w;

1539

#else /* defined(DST_DEPTH) */

Gian Marco Iodice

2018-10-18 10:21:02 +0100

[diff] [blame]

1540

__global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x * dst_step_x + y * dst_step_y + z * dst_step_z;

Georgios Pinitas

2018-10-30 14:53:25 +0000

[diff] [blame]

1541

#endif /* defined(DST_DEPTH) */

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1542

1543

VSTORE(VEC_SIZE)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1544

(ACTIVATION_FUNC(res0), 0, (__global DATA_TYPE *)(dst_addr + 0 * dst_stride_y));

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1545

VSTORE(VEC_SIZE)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1546

(ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1547

}

giuros01

2019-01-07 17:47:19 +0000

[diff] [blame]

1548

#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) && VEC_SIZE==4

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1549

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1550

#endif // defined(NUM_ROWS_PROCESSED) && defined(NUM_PLANES_PROCESSED)

Giorgio Arena

2018-01-31 10:30:59 +0000

[diff] [blame]

1551

Giorgio Arena

2018-07-02 15:29:57 +0100

[diff] [blame]

1552

#endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)

1553

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1554

#endif // defined(WEIGHTS_PROMOTED_TYPE)

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1555

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1556

#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER))

1557

1558

#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET)

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1559

/** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped

1560

*

1561

* @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)

1562

* @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)

1563

* @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)

1564

* @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)

1565

* @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)

1566

* @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)

1567

* @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)

1568

* @note The convolution pad left must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)

1569

* @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_X=1)

1570

* @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)

1571

* @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu

1572

* @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively

1573

*

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1574

* @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8

1575

* @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)

1576

* @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)

1577

* @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)

1578

* @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)

1579

* @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)

1580

* @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)

1581

* @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)

1582

* @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)

1583

* @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor

1584

* @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr

1585

* @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)

1586

* @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)

1587

* @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)

1588

* @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)

1589

* @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)

1590

* @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)

1591

* @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)

1592

* @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)

1593

* @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor

1594

* @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QSYMM8_PER_CHANNEL

1595

* @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)

1596

* @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)

1597

* @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)

1598

* @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)

1599

* @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)

1600

* @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)

1601

* @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor

1602

* @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32

1603

* @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)

1604

* @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)

1605

* @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector

1606

* @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32

1607

* @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)

1608

* @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)

1609

* @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector

1610

* @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32

1611

* @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)

1612

* @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)

1613

* @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1614

*/

1615

__kernel void dwc_MxN_native_quantized8_nhwc(

1616

TENSOR4D_DECLARATION(src),

1617

TENSOR4D_DECLARATION(dst),

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1618

TENSOR3D_DECLARATION(weights),

1619

VECTOR_DECLARATION(output_multipliers),

1620

VECTOR_DECLARATION(output_shifts)

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1621

#if defined(HAS_BIAS)

Michele Di Giorgio

1dce310

2019-10-22 10:29:03 +0100

[diff] [blame]

1622

,

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1623

VECTOR_DECLARATION(biases)

1624

#endif // defined(HAS_BIAS)

1625

)

1626

{

1627

int x = get_global_id(0); // channels

1628

int y = get_global_id(1); // spatial coordinate x

1629

#if defined(DST_DEPTH)

1630

int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y

1631

int b = get_global_id(2) / (int)DST_DEPTH; // batch

1632

#else // defined(DST_DEPTH)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1633

int z = get_global_id(2); // spatial coordinate y

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1634

#endif // defined(DST_DEPTH)

1635

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1636

__global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0;

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1637

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1638

__global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0 + y * dst_stride_y + z * dst_stride_z;

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1639

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1640

__global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER * (int)N0;

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1641

1642

#if defined(HAS_BIAS)

1643

__global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;

1644

#endif // defined(HAS_BIAS)

1645

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1646

#if defined(PER_CHANNEL_QUANTIZATION)

1647

__global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;

1648

__global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;

1649

1650

VEC_INT output_multiplier = (VEC_INT)0;

1651

VEC_INT output_shift = (VEC_INT)0;

1652

#else // defined(PER_CHANNEL_QUANTIZATION)

1653

const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));

1654

const int output_shift = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));

1655

#endif // defined(PER_CHANNEL_QUANTIZATION)

1656

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1657

#if defined(DST_DEPTH)

1658

s_addr += b * src_stride_w;

1659

d_addr += b * dst_stride_w;

1660

#endif // defined(DST_DEPTH)

1661

1662

#if DEPTH_MULTIPLIER > 1

1663

for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)

1664

{

1665

#endif // DEPTH_MULTIPLIER > 1

1666

// Each work-item computes N0x1x1 elements

Sang-Hoon Park

2019-11-07 09:37:29 +0000

[diff] [blame]

1667

VEC_INT res = 0;

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1668

1669

int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;

1670

int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;

1671

1672

for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)

1673

{

1674

if(y_coord >= 0 && y_coord < SRC_DIM2)

1675

{

1676

int x_coord_tmp = x_coord;

1677

1678

for(int xk = 0; xk < KERNEL_WIDTH; ++xk)

1679

{

1680

if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)

1681

{

1682

int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;

1683

int w_offset = xk * weights_stride_y + yk * weights_stride_z;

1684

1685

// Load input and weights values

Sang-Hoon Park

2019-11-07 09:37:29 +0000

[diff] [blame]

1686

VEC_INT i = CONVERT(VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset)), VEC_INT);

1687

VEC_INT w = CONVERT(VLOAD(N0)(0, (__global WEIGHTS_TYPE *)(w_addr + w_offset)), VEC_INT);

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1688

Sang-Hoon Park

2019-11-07 09:37:29 +0000

[diff] [blame]

1689

res += (i + (VEC_INT)INPUT_OFFSET) * (w + (VEC_INT)WEIGHTS_OFFSET);

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1690

}

1691

x_coord_tmp += DILATION_X;

1692

}

1693

}

1694

y_coord += DILATION_Y;

1695

}

1696

1697

#if defined(HAS_BIAS)

Sang-Hoon Park

2019-11-07 09:37:29 +0000

[diff] [blame]

1698

VEC_INT bias = VLOAD(N0)(0, (__global int *)(b_addr));

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1699

res += bias;

1700

#endif // defined(HAS_BIAS)

1701

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1702

#if defined(PER_CHANNEL_QUANTIZATION)

1703

output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));

1704

output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr));

1705

#endif // defined(PER_CHANNEL_QUANTIZATION)

1706

Sang-Hoon Park

2019-11-07 09:37:29 +0000

[diff] [blame]

1707

res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);

1708

res += (VEC_INT)OUTPUT_OFFSET;

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1709

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1710

VEC_TYPE(VEC_SIZE)

1711

res1 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE));

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1712

1713

VSTORE(N0)

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1714

(ACTIVATION_FUNC(res1), 0, (__global DATA_TYPE *)(d_addr));

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1715

1716

#if DEPTH_MULTIPLIER > 1

Michele Di Giorgio

2019-10-09 15:32:39 +0100

[diff] [blame]

1717

w_addr += sizeof(WEIGHTS_TYPE);

1718

d_addr += sizeof(DATA_TYPE);

1719

#if defined(PER_CHANNEL_QUANTIZATION)

1720

out_mul_addr += sizeof(int);

1721

out_shift_addr += sizeof(int);

1722

#endif // defined(PER_CHANNEL_QUANTIZATION)

Michele Di Giorgio

2019-10-08 09:36:26 +0100

[diff] [blame]

1723

#if defined(HAS_BIAS)

1724

b_addr += sizeof(int);

1725

#endif // defined(HAS_BIAS)

1726

}

1727

#endif // DEPTH_MULTIPLIER > 1

1728

}

Michele Di Giorgio