COMPMID-1017: Implement dilated convolution in NEON, OpenCL, and GC

Change-Id: If4626ec9e215e14dffe22e80812da5bac84a52e2
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/125734
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/im2col.cl
index 75d99bd..1e85e1b 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/im2col.cl
@@ -680,6 +680,7 @@
  * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
  * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
  * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation factors along X and Y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
  * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
  *
  * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: QS8/QASYMM8/QS16/F16/F32
@@ -722,10 +723,12 @@
     __global DATA_TYPE *output_ptr = ((__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + yo * dst_stride_y + batch * dst_stride_w)) + xo;
 
     // Linearize convolution elements
-    for(int y = yi, y_e = yi + KERNEL_HEIGHT; y < y_e; ++y)
+    for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
     {
-        for(int x = xi, x_e = xi + KERNEL_WIDTH; x < x_e; ++x, ++output_ptr)
+        int y = yi + yk * DILATION_Y;
+        for(int xk = 0; xk < KERNEL_WIDTH; ++xk, ++output_ptr)
         {
+            int x = xi + xk * DILATION_X;
 #if PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
             *output_ptr = *((__global DATA_TYPE *)(input_ptr + x * src_stride_x + y * src_stride_y));
 #else  // PAD_LEFT == 0 && PAD_TOP == 0 && PAD_RIGHT == 0 && PAD_BOTTOM == 0
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 9bc4787..cc19d3c 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -41,11 +41,12 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QASYMM8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -63,12 +64,12 @@
 {
 }
 
-void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), has_bias, dilation));
 
     _input       = input;
     _output      = output;
@@ -107,7 +108,7 @@
 
         _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
                                             kernel_dims.width, kernel_dims.height,
-                                            conv_info);
+                                            conv_info, dilation);
 
         build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width));
         build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height));
@@ -122,77 +123,82 @@
         build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom()));
         build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
         build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1)));
+        build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
+        build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
         build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(input->info()->quantization_info().offset), "-DPAD_VALUE=0");
 
         const bool squared_im2col = kernel_dims.width == kernel_dims.height;
 
-        if(squared_im2col && !is_data_type_fixed_point(data_type))
+        if(dilation == Size2D(1U, 1U))
         {
-            // Check if we can run an optimized im2col
-            switch(kernel_dims.width)
+            if(squared_im2col && !is_data_type_fixed_point(data_type))
             {
-                case 1:
-                    // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
-                    if(conv_info.stride().first == 1 && !conv_info.has_padding())
-                    {
-                        // Set hint for LWS
+                // Check if we can run an optimized im2col
+                switch(kernel_dims.width)
+                {
+                    case 1:
+                        // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
+                        if(conv_info.stride().first == 1 && !conv_info.has_padding())
+                        {
+                            // Set hint for LWS
+                            _lws_hint                          = cl::NDRange(1, 1, 8);
+                            _num_elems_processed_per_iteration = 4;
+                            is_optimized_path                  = true;
+                            kernel_name                        = "im2col1x1_stridex1_dchw";
+                        }
+                        break;
+                    case 3:
                         _lws_hint                          = cl::NDRange(1, 1, 8);
-                        _num_elems_processed_per_iteration = 4;
-                        is_optimized_path                  = true;
-                        kernel_name                        = "im2col1x1_stridex1_dchw";
-                    }
-                    break;
-                case 3:
-                    _lws_hint                          = cl::NDRange(1, 1, 8);
-                    _num_elems_processed_per_iteration = 1;
-                    is_optimized_path                  = true;
-                    kernel_name                        = "im2col3x3_dchw";
-                    break;
-                case 5:
-                    _num_elems_processed_per_iteration = 1;
-                    is_optimized_path                  = true;
-                    kernel_name                        = "im2col5x5_dchw";
-                    break;
-                case 11:
-                    // Optimized im2col11x11 if pad_x = pad_y = 0
-                    if(!conv_info.has_padding())
-                    {
                         _num_elems_processed_per_iteration = 1;
                         is_optimized_path                  = true;
-                        kernel_name                        = "im2col11x11_padx0_pady0_dchw";
-                    }
-                    break;
-                default:
-                    is_optimized_path = false;
-                    break;
+                        kernel_name                        = "im2col3x3_dchw";
+                        break;
+                    case 5:
+                        _num_elems_processed_per_iteration = 1;
+                        is_optimized_path                  = true;
+                        kernel_name                        = "im2col5x5_dchw";
+                        break;
+                    case 11:
+                        // Optimized im2col11x11 if pad_x = pad_y = 0
+                        if(!conv_info.has_padding())
+                        {
+                            _num_elems_processed_per_iteration = 1;
+                            is_optimized_path                  = true;
+                            kernel_name                        = "im2col11x11_padx0_pady0_dchw";
+                        }
+                        break;
+                    default:
+                        is_optimized_path = false;
+                        break;
+                }
             }
-        }
-        else if(kernel_dims.width > 1 && !conv_info.has_padding())
-        {
-            _num_elems_processed_per_iteration = 1;
-            kernel_name                        = "im2col_generic_padx0_pady0_dchw";
+            else if(kernel_dims.width > 1 && !conv_info.has_padding())
+            {
+                _num_elems_processed_per_iteration = 1;
+                kernel_name                        = "im2col_generic_padx0_pady0_dchw";
 
-            // Optimized im2col is performed using one or more vector operations with the specified vector size
-            // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
-            // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
-            // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
-            // Using the vector size of 8, however, may be faster.
-            size_t vector_size = 4;
-            // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
-            // is used instead.)
-            if(kernel_dims.width < vector_size)
-            {
-                vector_size = kernel_dims.width;
+                // Optimized im2col is performed using one or more vector operations with the specified vector size
+                // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4
+                // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3.
+                // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3.
+                // Using the vector size of 8, however, may be faster.
+                size_t vector_size = 4;
+                // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0
+                // is used instead.)
+                if(kernel_dims.width < vector_size)
+                {
+                    vector_size = kernel_dims.width;
+                }
+                // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
+                if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && kernel_dims.width == 11)
+                {
+                    _lws_hint   = cl::NDRange(1, 1, 1);
+                    vector_size = 8;
+                }
+                const size_t width_mod_vector_size = kernel_dims.width % vector_size;
+                build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+                build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
             }
-            // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost.
-            if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && kernel_dims.width == 11)
-            {
-                _lws_hint   = cl::NDRange(1, 1, 1);
-                vector_size = 8;
-            }
-            const size_t width_mod_vector_size = kernel_dims.width % vector_size;
-            build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
-            build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size));
         }
         _run_func = &CLIm2ColKernel::run_generic;
     }
@@ -206,7 +212,7 @@
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
-    // Configure  kernel window
+    // Configure kernel window
     Window win;
     if(is_optimized_path)
     {
@@ -250,12 +256,12 @@
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
-Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+Status CLIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation)
 {
     ARM_COMPUTE_UNUSED(kernel_dims);
     ARM_COMPUTE_UNUSED(conv_info);
     ARM_COMPUTE_UNUSED(has_bias);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, has_bias, dilation));
     return Status{};
 }