/*
 * Copyright (c) 2017-2018 ARM Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "helpers.h"

#if defined(DEPTH_MULTIPLIER)
#if defined(CONV_STRIDE_X)

#if CONV_STRIDE_X == 1
#define convolution1x3 convolution1x3_stride_1
#elif CONV_STRIDE_X == 2
#define convolution1x3 convolution1x3_stride_2
#elif CONV_STRIDE_X == 3
#define convolution1x3 convolution1x3_stride_3
#else /* CONV_STRIDE_X */
#error "Stride not supported"
#endif /* CONV_STRIDE_X */

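// Example only: these kernels are specialised at compile time through preprocessor definitions
// supplied by the host build options. For instance, something like
//     "-DDEPTH_MULTIPLIER=1 -DCONV_STRIDE_X=1 -DHAS_BIAS"
// would select the float, stride-1 path below. The exact option strings assembled by the host
// code are an assumption here and may differ; only the macro names are taken from this file.
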
/** Compute a 1D horizontal convolution of size 3 and stride 1 for floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution1x3_stride_1(__global const uchar *left_pixel,
                                      const float left_coeff,
                                      const float middle_coeff,
                                      const float right_coeff)
{
    float4 temp = vload4(0, (__global float *)left_pixel);

    float2 left   = CONVERT(temp.s01, float2);
    float2 middle = CONVERT(temp.s12, float2);
    float2 right  = CONVERT(temp.s23, float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
}
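
// Worked example for stride 1: with in0..in3 being the four floats loaded from left_pixel,
//     out.s0 = left_coeff * in0 + middle_coeff * in1 + right_coeff * in2
//     out.s1 = left_coeff * in1 + middle_coeff * in2 + right_coeff * in3
// i.e. the two horizontally adjacent output pixels share two of their three input samples.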

/** Compute a 1D horizontal convolution of size 3 and stride 2 for floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution1x3_stride_2(__global const uchar *left_pixel,
                                      const float left_coeff,
                                      const float middle_coeff,
                                      const float right_coeff)
{
    float4 temp0 = vload4(0, (__global float *)left_pixel);
    float  temp1 = *((__global float *)(left_pixel + 4 * sizeof(float)));

    float2 left   = CONVERT(temp0.s02, float2);
    float2 middle = CONVERT(temp0.s13, float2);
    float2 right  = CONVERT((float2)(temp0.s2, temp1), float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
}

/** Compute a 1D horizontal convolution of size 3 and stride 3 for floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution1x3_stride_3(__global const uchar *left_pixel,
                                      const float left_coeff,
                                      const float middle_coeff,
                                      const float right_coeff)
{
    float4 temp0 = vload4(0, (__global float *)left_pixel);
    float2 temp1 = vload2(0, (__global float *)(left_pixel + 4 * sizeof(float)));

    float2 left   = CONVERT(temp0.s03, float2);
    float2 middle = CONVERT((float2)(temp0.s1, temp1.s0), float2);
    float2 right  = CONVERT((float2)(temp0.s2, temp1.s1), float2);

    return left * (float2)left_coeff + middle * (float2)middle_coeff + right * (float2)right_coeff;
}
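
// Worked example for strides 2 and 3: with in0..in5 being consecutive input floats and l/m/r the
// left/middle/right coefficients,
//     stride 2: out.s0 = l*in0 + m*in1 + r*in2,   out.s1 = l*in2 + m*in3 + r*in4
//     stride 3: out.s0 = l*in0 + m*in1 + r*in2,   out.s1 = l*in3 + m*in4 + r*in5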

/** Apply a 3x3 convolution matrix to a single channel F32 input image and return the result.
 *
 * Convolution matrix layout:
 *
 * [ mat0, mat1, mat2 ]\n
 * [ mat3, mat4, mat5 ]\n
 * [ mat6, mat7, mat8 ]\n
 *
 * @param[in] src A pointer to source Image structure
 * @param[in] mat0 Coefficient from the convolution matrix
 * @param[in] mat1 Coefficient from the convolution matrix
 * @param[in] mat2 Coefficient from the convolution matrix
 * @param[in] mat3 Coefficient from the convolution matrix
 * @param[in] mat4 Coefficient from the convolution matrix
 * @param[in] mat5 Coefficient from the convolution matrix
 * @param[in] mat6 Coefficient from the convolution matrix
 * @param[in] mat7 Coefficient from the convolution matrix
 * @param[in] mat8 Coefficient from the convolution matrix
 *
 * @return a float2 containing 2 convolved values.
 */
inline float2 convolution3x3(
    Image      *src,
    const float mat0, const float mat1, const float mat2,
    const float mat3, const float mat4, const float mat5,
    const float mat6, const float mat7, const float mat8)
{
    float2 pixels;

    pixels = convolution1x3(offset(src, 0, 0), mat0, mat1, mat2);
    pixels += convolution1x3(offset(src, 0, 1), mat3, mat4, mat5);
    pixels += convolution1x3(offset(src, 0, 2), mat6, mat7, mat8);

    return pixels;
}
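
// Equivalently, for output column i (i = 0, 1) at the work-item's position (x, y):
//     pixels[i] = sum over r = 0..2, c = 0..2 of mat(3*r + c) * src(x + CONV_STRIDE_X*i + c, y + r)
// where the column term reduces to (x + i + c) for the stride-1 specialisation.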

/** This OpenCL kernel computes the depthwise convolution 3x3
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)

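    // Depth multiplier handling: output channel get_global_id(2) reads input channel
    // get_global_id(2) / DEPTH_MULTIPLIER. The CONVERT_TENSOR3D_TO_IMAGE_STRUCT helper above has
    // already advanced src.ptr by get_global_id(2) * src_step_z, so the line below rewinds the
    // surplus (z - z / DEPTH_MULTIPLIER) planes. For example, with DEPTH_MULTIPLIER = 2, output
    // channels 4 and 5 both read input channel 2.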
    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    uchar3 offset          = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
    float3 weights_values0 = vload3(0, (__global float *)(weights.ptr + offset.s0));
    float3 weights_values1 = vload3(0, (__global float *)(weights.ptr + offset.s1));
    float3 weights_values2 = vload3(0, (__global float *)(weights.ptr + offset.s2));

    float2 pixels = convolution3x3(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                   weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                   weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
    pixels += (float2)(*((__global float *)(biases.ptr + get_global_id(2) * biases_stride_x)));
#endif //defined(HAS_BIAS)

    vstore2(pixels, 0, (__global float *)dst.ptr);
}
#endif //defined(CONV_STRIDE_X)

#define CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, src0, weights_row0) \
    ({                                                             \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
    })

#define CONVOLUTION1x3_BIFROST4X1_STRIDE1(acc, src0, weights_row0) \
    ({                                                             \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);            \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);            \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);            \
        acc.s1 = fma(src0.s1, weights_row0.s0, acc.s1);            \
        acc.s1 = fma(src0.s2, weights_row0.s1, acc.s1);            \
        acc.s1 = fma(src0.s3, weights_row0.s2, acc.s1);            \
        acc.s2 = fma(src0.s2, weights_row0.s0, acc.s2);            \
        acc.s2 = fma(src0.s3, weights_row0.s1, acc.s2);            \
        acc.s2 = fma(src0.s4, weights_row0.s2, acc.s2);            \
        acc.s3 = fma(src0.s3, weights_row0.s0, acc.s3);            \
        acc.s3 = fma(src0.s4, weights_row0.s1, acc.s3);            \
        acc.s3 = fma(src0.s5, weights_row0.s2, acc.s3);            \
    })

#define CONVOLUTION1x3_BIFROST2X1_STRIDE2(acc, src0, src1, weights_row0) \
    ({                                                                   \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
        acc.s1 = fma(src1.s0, weights_row0.s2, acc.s1);                  \
    })

#define CONVOLUTION1x3_BIFROST4X1_STRIDE2(acc, src0, src1, weights_row0) \
    ({                                                                   \
        acc.s0 = fma(src0.s0, weights_row0.s0, acc.s0);                  \
        acc.s0 = fma(src0.s1, weights_row0.s1, acc.s0);                  \
        acc.s0 = fma(src0.s2, weights_row0.s2, acc.s0);                  \
        acc.s1 = fma(src0.s2, weights_row0.s0, acc.s1);                  \
        acc.s1 = fma(src0.s3, weights_row0.s1, acc.s1);                  \
        acc.s1 = fma(src0.s4, weights_row0.s2, acc.s1);                  \
        acc.s2 = fma(src0.s4, weights_row0.s0, acc.s2);                  \
        acc.s2 = fma(src0.s5, weights_row0.s1, acc.s2);                  \
        acc.s2 = fma(src0.s6, weights_row0.s2, acc.s2);                  \
        acc.s3 = fma(src0.s6, weights_row0.s0, acc.s3);                  \
        acc.s3 = fma(src0.s7, weights_row0.s1, acc.s3);                  \
        acc.s3 = fma(src1.s0, weights_row0.s2, acc.s3);                  \
    })

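// Note: the CONVOLUTION1x3_BIFROST<N>X1_STRIDE<S> macros above accumulate one 1x3 weight row into
// N horizontally adjacent accumulators using fma, which maps onto Bifrost's fused multiply-add
// pipelines. For example, CONVOLUTION1x3_BIFROST2X1_STRIDE1(acc, s, w) computes
//     acc.s0 += s.s0 * w.s0 + s.s1 * w.s1 + s.s2 * w.s2
//     acc.s1 += s.s1 * w.s0 + s.s2 * w.s1 + s.s3 * w.s2
// and the STRIDE2 variants take the spill-over column in the extra src1 argument.
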
/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
 * stride_x and stride_y are equal to 1
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f32(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

    float2 pixels0 = 0.0f;
    float2 pixels1 = 0.0f;
    float2 pixels2 = 0.0f;
    float2 pixels3 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 4x2 elements, we need to load 6 rows from the input tensor
    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
    float4 src50 = vload4(0, (__global float *)(src_addr + 5 * src_stride_y)); // Row5

    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src00, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src10, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels0, src20, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src10, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src20, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels1, src30, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src20, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src30, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels2, src40, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src30, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src40, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE1(pixels3, src50, weights_row2);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));

    pixels0 += (float2)bias;
    pixels1 += (float2)bias;
    pixels2 += (float2)bias;
    pixels3 += (float2)bias;
#endif /* defined(HAS_BIAS) */

    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
    vstore2(pixels2, 0, (__global float *)(dst.ptr + 2 * dst_stride_y));
    vstore2(pixels3, 0, (__global float *)(dst.ptr + 3 * dst_stride_y));
}

/** This OpenCL kernel is optimized for Bifrost architectures and computes the depthwise convolution 3x3 when both
 * stride_x and stride_y are equal to 2
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F32
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: F32
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: F32
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f32(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

    float2 pixels0 = 0.0f;
    float2 pixels1 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = src.ptr - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    float3 weights_row0 = vload3(0, (__global float *)(weights_addr + 0 * weights_stride_y));
    float3 weights_row1 = vload3(0, (__global float *)(weights_addr + 1 * weights_stride_y));
    float3 weights_row2 = vload3(0, (__global float *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 2x2 elements, we need to load 5 rows from the input tensor
    float4 src00 = vload4(0, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float2 src01 = vload2(2, (__global float *)(src_addr + 0 * src_stride_y)); // Row0
    float4 src10 = vload4(0, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float2 src11 = vload2(2, (__global float *)(src_addr + 1 * src_stride_y)); // Row1
    float4 src20 = vload4(0, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float2 src21 = vload2(2, (__global float *)(src_addr + 2 * src_stride_y)); // Row2
    float4 src30 = vload4(0, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float2 src31 = vload2(2, (__global float *)(src_addr + 3 * src_stride_y)); // Row3
    float4 src40 = vload4(0, (__global float *)(src_addr + 4 * src_stride_y)); // Row4
    float2 src41 = vload2(2, (__global float *)(src_addr + 4 * src_stride_y)); // Row4

    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src00, src01, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src10, src11, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels0, src20, src21, weights_row2);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src20, src21, weights_row0);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src30, src31, weights_row1);
    CONVOLUTION1x3_BIFROST2X1_STRIDE2(pixels1, src40, src41, weights_row2);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    float bias = *((__global float *)(vector_offset(&biases, get_global_id(2))));

    pixels0 += (float2)bias;
    pixels1 += (float2)bias;
#endif /* defined(HAS_BIAS) */

    vstore2(pixels0, 0, (__global float *)(dst.ptr + 0 * dst_stride_y));
    vstore2(pixels1, 0, (__global float *)(dst.ptr + 1 * dst_stride_y));
}
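
// Note on the load pattern above: with stride 2, the two output columns of a row consume input
// columns 0..2 and 2..4, i.e. 5 consecutive floats per row. They are fetched as a vload4 plus a
// vload2 starting at element 4 (offset 2 in float2 units), of which only the first lane is used by
// CONVOLUTION1x3_BIFROST2X1_STRIDE2.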

#endif // defined(DEPTH_MULTIPLIER)

#if defined(SRC_WIDTH) && defined(DATA_TYPE)
/** This kernel reshapes each plane of the tensor's lowest three dimensions into a single row.
 *
 * @note Datatype and source width should be given as preprocessor arguments using -DDATA_TYPE=type and -DSRC_WIDTH=width. e.g. -DSRC_WIDTH=128
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_weights_reshape(
    TENSOR3D_DECLARATION(src),
    IMAGE_DECLARATION(dst)
#ifdef HAS_BIAS
    ,
    VECTOR_DECLARATION(biases)
#endif /* HAS_BIAS */
)
{
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif /* HAS_BIAS */

    __global DATA_TYPE *input_ptr = (__global DATA_TYPE *)src.ptr;
    __global uchar *output_ptr    = dst_ptr + dst_offset_first_element_in_bytes + get_global_id(1) * SRC_WIDTH * dst_stride_x + get_global_id(2) * dst_stride_y;

    for(int i = 0; i < SRC_WIDTH; ++i, ++input_ptr)
    {
        *((__global DATA_TYPE *)(output_ptr + i * dst_stride_x)) = *input_ptr;
    }

#if defined(HAS_BIAS)
    if(get_global_id(1) == 0)
    {
        *((__global DATA_TYPE *)(output_ptr + SRC_WIDTH * get_global_size(1) * dst_stride_x)) = *((__global DATA_TYPE *)(biases.ptr + get_global_id(2) * biases_stride_x));
    }
#endif // defined(HAS_BIAS)
}
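
// Resulting layout: output row z holds the flattened weights plane of channel z, i.e.
//     [ w(0,0) ... w(0,SRC_WIDTH-1), w(1,0) ... w(H-1,SRC_WIDTH-1) ]
// where H = get_global_size(1) is the plane height, with the channel's bias value appended as one
// extra element at the end of the row when HAS_BIAS is defined.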
#endif //defined(SRC_WIDTH) && defined(DATA_TYPE)

#if defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)
/** This kernel performs a reshaping of the input tensor to a tensor used to perform depthwise convolution using vector to matrix multiplication.
 *
 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
 * @note The convolution information must be passed at compile time using -DSTRIDE_X, -DSTRIDE_Y, -DPAD_LEFT, -DPAD_TOP, -DPAD_RIGHT, -DPAD_BOTTOM, -DKERNEL_WIDTH, -DKERNEL_HEIGHT, -DSRC_WIDTH, -DSRC_HEIGHT, -DDEPTH_MULTIPLIER
 *
 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void depthwise_im2col(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
{
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

    const int src_pixel_linear = get_global_id(1) * STRIDE_X;
    const int full_length      = SRC_WIDTH + PAD_LEFT + PAD_RIGHT;
    const int max_initial_x    = STRIDE_X * (((full_length - KERNEL_WIDTH) / STRIDE_X) + 1);

    const int src_x = -PAD_LEFT + src_pixel_linear % max_initial_x;
    const int src_y = -PAD_TOP + src_pixel_linear / max_initial_x * STRIDE_Y;
    const int src_z = get_global_id(2) / DEPTH_MULTIPLIER;

    __global uchar *input_ptr      = src_ptr + src_offset_first_element_in_bytes + src_z * src_stride_z;
    __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst.ptr));

    for(int y = src_y; y < src_y + KERNEL_HEIGHT; ++y)
    {
        for(int x = src_x; x < src_x + KERNEL_WIDTH; ++x, ++output_ptr)
        {
            if(x < 0 || x >= SRC_WIDTH || y < 0 || y >= SRC_HEIGHT)
            {
                *output_ptr = PAD_VALUE;
            }
            else
            {
                *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
            }
        }
    }
#if defined(HAS_BIAS)
    *output_ptr = (DATA_TYPE)(1);
#endif // defined(HAS_BIAS)
}
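
// Output layout: each invocation writes one im2col patch, i.e. the KERNEL_WIDTH * KERNEL_HEIGHT
// input samples seen by one output position of channel get_global_id(2) (positions falling outside
// the source are filled with PAD_VALUE), followed by a constant 1 when HAS_BIAS is defined so that
// the bias can be folded into the subsequent vector-to-matrix multiplication.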

#endif //defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DATA_TYPE) && defined(PAD_VALUE) && defined(DEPTH_MULTIPLIER)

568#if defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)
569
570/** This kernel performs a reshaping of the output of the depthwise generic convolution.
571 *
572 * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
573 * @note The convolution information must be passed at compile time using -DCONV_WIDTH, -DCONV_HEIGHT, e.g -DCONV_WIDTH=32, -DCONV_HEIGHT=42
574 *
575 * @param[in] src_ptr Pointer to the source tensor. Supported data types: QS8/QS16/F16/F32
576 * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
577 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
578 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
579 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
580 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
581 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
582 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
583 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
584 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
585 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
586 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
587 */
588__kernel void depthwise_vector_to_tensor(
589 VECTOR_DECLARATION(src),
590 TENSOR3D_DECLARATION(dst))
591{
592 Vector src = CONVERT_TO_VECTOR_STRUCT(src);
593
594 const int patch_size = CONV_WIDTH * CONV_HEIGHT;
595 const int id0 = get_global_id(0);
596 const int z = id0 / patch_size;
597 const int index2D = id0 - z * patch_size;
598
599 __global uchar *out_ptr = dst_ptr + dst_offset_first_element_in_bytes + index2D % CONV_WIDTH * dst_stride_x + index2D / CONV_WIDTH * dst_stride_y + z * dst_stride_z;
600 *((__global DATA_TYPE *)out_ptr) = *((__global DATA_TYPE *)src.ptr);
601}
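
// Index mapping example: with CONV_WIDTH = 4 and CONV_HEIGHT = 3 the patch size is 12, so source
// element 14 maps to z = 1 and index2D = 2, i.e. destination element (x = 2, y = 0) of output
// channel 1.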

#endif //defined(CONV_WIDTH) && defined(CONV_HEIGHT) && defined(DATA_TYPE)

#if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)
#if defined(CONV_STRIDE_X)
#if CONV_STRIDE_X == 1
#define convolution1x3_f16 convolution1x3_stride_1_f16
#elif CONV_STRIDE_X == 2
#define convolution1x3_f16 convolution1x3_stride_2_f16
#elif CONV_STRIDE_X == 3
#define convolution1x3_f16 convolution1x3_stride_3_f16
#else /* CONV_STRIDE_X */
#error "Stride not supported"
#endif /* CONV_STRIDE_X */

/** Compute a 1D horizontal convolution of size 3 and stride 1 for 16bit floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_1_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
    half8 temp = vload8(0, (__global half *)left_pixel);

    half4 left   = CONVERT(temp.s0123, half4);
    half4 middle = CONVERT(temp.s1234, half4);
    half4 right  = CONVERT(temp.s2345, half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
}

/** Compute a 1D horizontal convolution of size 3 and stride 2 for 16bit floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_2_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
    half8 temp0 = vload8(0, (__global half *)left_pixel);
    half  temp1 = *((__global half *)(left_pixel + 8 * sizeof(half)));

    half4 left   = CONVERT(temp0.s0246, half4);
    half4 middle = CONVERT(temp0.s1357, half4);
    half4 right  = CONVERT((half4)(temp0.s246, temp1), half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
}

/** Compute a 1D horizontal convolution of size 3 and stride 3 for 16bit floating point type.
 *
 * @param[in] left_pixel Pointer to the left pixel.
 * @param[in] left_coeff Weight of the left pixel
 * @param[in] middle_coeff Weight of the middle pixel
 * @param[in] right_coeff Weight of the right pixel
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution1x3_stride_3_f16(__global const uchar *left_pixel,
                                         const half left_coeff,
                                         const half middle_coeff,
                                         const half right_coeff)
{
    half16 temp0 = vload16(0, (__global half *)left_pixel);

    half4 left   = CONVERT(temp0.s0369, half4);
    half4 middle = CONVERT(temp0.s147A, half4);
    half4 right  = CONVERT(temp0.s258B, half4);

    return left * (half4)left_coeff + middle * (half4)middle_coeff + right * (half4)right_coeff;
}

/** Apply a 3x3 convolution matrix to a single channel F16 input image and return the result.
 *
 * Convolution matrix layout:
 *
 * [ mat0, mat1, mat2 ]\n
 * [ mat3, mat4, mat5 ]\n
 * [ mat6, mat7, mat8 ]\n
 *
 * @param[in] src A pointer to source Image structure
 * @param[in] mat0 Coefficient from the convolution matrix
 * @param[in] mat1 Coefficient from the convolution matrix
 * @param[in] mat2 Coefficient from the convolution matrix
 * @param[in] mat3 Coefficient from the convolution matrix
 * @param[in] mat4 Coefficient from the convolution matrix
 * @param[in] mat5 Coefficient from the convolution matrix
 * @param[in] mat6 Coefficient from the convolution matrix
 * @param[in] mat7 Coefficient from the convolution matrix
 * @param[in] mat8 Coefficient from the convolution matrix
 *
 * @return a half4 containing 4 convolved values.
 */
inline half4 convolution3x3_f16(
    Image     *src,
    const half mat0, const half mat1, const half mat2,
    const half mat3, const half mat4, const half mat5,
    const half mat6, const half mat7, const half mat8)
{
    half4 pixels;

    pixels = convolution1x3_f16(offset(src, 0, 0), mat0, mat1, mat2);
    pixels += convolution1x3_f16(offset(src, 0, 1), mat3, mat4, mat5);
    pixels += convolution1x3_f16(offset(src, 0, 2), mat6, mat7, mat8);

    return pixels;
}

#if defined(DEPTH_MULTIPLIER)

/** This OpenCL kernel computes the depthwise convolution 3x3
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: F16/F32
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_f16(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);
#if defined(HAS_BIAS)
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
#endif //defined(HAS_BIAS)

    src.ptr -= (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    uchar3 offset         = (uchar3)(0, 1, 2) * (uchar3)weights_stride_y;
    half3 weights_values0 = vload3(0, (__global half *)(weights.ptr + offset.s0));
    half3 weights_values1 = vload3(0, (__global half *)(weights.ptr + offset.s1));
    half3 weights_values2 = vload3(0, (__global half *)(weights.ptr + offset.s2));

    half4 pixels = convolution3x3_f16(&src, weights_values0.s0, weights_values0.s1, weights_values0.s2,
                                      weights_values1.s0, weights_values1.s1, weights_values1.s2,
                                      weights_values2.s0, weights_values2.s1, weights_values2.s2);
#if defined(HAS_BIAS)
    pixels += (half4)(*((__global half *)(biases.ptr + get_global_id(2) * biases_stride_x)));
#endif //defined(HAS_BIAS)

    vstore4(pixels, 0, (__global half *)dst.ptr);
}
#endif // defined(DEPTH_MULTIPLIER)
#endif // defined(CONV_STRIDE_X)

/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
 * when both stride_x and stride_y are equal to 1
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex1_stridey1_bifrost_f16(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
#endif /* defined(HAS_BIAS) */

    half4 pixels0 = 0.0f;
    half4 pixels1 = 0.0f;
    half4 pixels2 = 0.0f;
    half4 pixels3 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 4x4 elements, we need to load 6 rows from the input tensor
    half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
    half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
    half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
    half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
    half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
    half8 src50 = vload8(0, (__global half *)(src_addr + 5 * src_stride_y)); // Row5

    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src00, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src10, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels0, src20, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src10, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src20, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels1, src30, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src20, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src30, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels2, src40, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src30, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src40, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE1(pixels3, src50, weights_row2);

#ifdef HAS_BIAS
    pixels0 += (half4)bias;
    pixels1 += (half4)bias;
    pixels2 += (half4)bias;
    pixels3 += (half4)bias;
#endif /* defined(HAS_BIAS) */

    vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
    vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
    vstore4(pixels2, 0, (__global half *)(dst.ptr + 2 * dst_stride_y));
    vstore4(pixels3, 0, (__global half *)(dst.ptr + 3 * dst_stride_y));
}

/** This OpenCL kernel is optimized for Bifrost architectures and computes the 16bit floating point depthwise convolution 3x3
 * when both stride_x and stride_y are equal to 2
 *
 * @param[in] src_ptr Pointer to the source image. Supported data types: F16
 * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: same as @p src_ptr
 * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
 * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
 * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
 * @param[in] weights_step_z weights_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
 * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: same as @p src_ptr
 * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
 * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
 */
__kernel void depthwise_convolution_3x3_stridex2_stridey2_bifrost_f16(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(weights)
#if defined(HAS_BIAS)
    ,
    VECTOR_DECLARATION(biases)
#endif //defined(HAS_BIAS)
)
{
    Image    src     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(src);
    Image    dst     = CONVERT_TENSOR3D_TO_IMAGE_STRUCT(dst);
    Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT(weights);

#ifdef HAS_BIAS
    Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);

    half bias = *((__global half *)(vector_offset(&biases, get_global_id(2))));
#endif /* defined(HAS_BIAS) */

    half4 pixels0 = 0.0f;
    half4 pixels1 = 0.0f;

    __global uchar *weights_addr = (__global uchar *)weights.ptr;
    __global uchar *src_addr     = (__global uchar *)offset(&src, 0, 0) - (get_global_id(2) - get_global_id(2) / DEPTH_MULTIPLIER) * src_step_z;

    // Load the weights
    half3 weights_row0 = vload3(0, (__global half *)(weights_addr + 0 * weights_stride_y));
    half3 weights_row1 = vload3(0, (__global half *)(weights_addr + 1 * weights_stride_y));
    half3 weights_row2 = vload3(0, (__global half *)(weights_addr + 2 * weights_stride_y));

    // Note: Since each work-item computes 2x4 elements, we need to load 5 rows from the input tensor
    half8 src00 = vload8(0, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
    half2 src01 = vload2(4, (__global half *)(src_addr + 0 * src_stride_y)); // Row0
    half8 src10 = vload8(0, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
    half2 src11 = vload2(4, (__global half *)(src_addr + 1 * src_stride_y)); // Row1
    half8 src20 = vload8(0, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
    half2 src21 = vload2(4, (__global half *)(src_addr + 2 * src_stride_y)); // Row2
    half8 src30 = vload8(0, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
    half2 src31 = vload2(4, (__global half *)(src_addr + 3 * src_stride_y)); // Row3
    half8 src40 = vload8(0, (__global half *)(src_addr + 4 * src_stride_y)); // Row4
    half2 src41 = vload2(4, (__global half *)(src_addr + 4 * src_stride_y)); // Row4

    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src00, src01, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src10, src11, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels0, src20, src21, weights_row2);
    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src20, src21, weights_row0);
    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src30, src31, weights_row1);
    CONVOLUTION1x3_BIFROST4X1_STRIDE2(pixels1, src40, src41, weights_row2);

#ifdef HAS_BIAS
    pixels0 += (half4)bias;
    pixels1 += (half4)bias;
#endif /* defined(HAS_BIAS) */

    vstore4(pixels0, 0, (__global half *)(dst.ptr + 0 * dst_stride_y));
    vstore4(pixels1, 0, (__global half *)(dst.ptr + 1 * dst_stride_y));
}
#endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(DEPTH_MULTIPLIER)