COMPMID-3708 Remove OpenCL padding: CLCopyKernel [Patch2]

* Remove the user-supplied padding from CLCopyKernel
  Note that this padding is not the internal "padding" targeted by the
  original task: it is an explicit padding argument supplied by the
  caller, not padding added internally.
  The user-supplied padding interface is removed simply because it has
  been superseded by the more capable CLPadLayerKernel and is not used
  anywhere else.

Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: Ib53e76efd7d043ee79dcd47ca734c6dc685da43e
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4194
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/CL/kernels/CLCopyKernel.h b/arm_compute/core/CL/kernels/CLCopyKernel.h
index 11a6d54..5c91e27 100644
--- a/arm_compute/core/CL/kernels/CLCopyKernel.h
+++ b/arm_compute/core/CL/kernels/CLCopyKernel.h
@@ -49,29 +49,26 @@
      *
      * @param[in]  input         Source tensor. Data types supported: All.
      * @param[out] output        Destination tensor. Data types supported: same as @p input.
-     * @param[in]  padding       (Optional) Padding to be applied to the input tensor
      * @param[in]  output_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr);
+    void configure(const ICLTensor *input, ICLTensor *output, Window *output_window = nullptr);
     /** Initialize the kernel's input, output.
      *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  input           Source tensor. Data types supported: All.
      * @param[out] output          Destination tensor. Data types supported: same as @p input.
-     * @param[in]  padding         (Optional) Padding to be applied to the input tensor
      * @param[in]  output_window   (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr);
+    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Window *output_window = nullptr);
     /** Static function to check if given info will lead to a valid configuration of @ref CLCopyKernel
      *
      * @param[in] input         Source tensor info. Data types supported: All.
      * @param[in] output        Destination tensor info. Data types supported: same as @p input.
-     * @param[in] padding       (Optional) Padding to be applied to the input tensor
      * @param[in] output_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, Window *output_window = nullptr);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 0d0b7f6..8c5607e 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -108,7 +108,6 @@
     { "convolution_separable1x9_static", "convolution9x9.cl" },
     { "convolution_separable9x1_static", "convolution9x9.cl" },
     { "copy_tensor", "copy_tensor.cl" },
-    { "copy_pad_tensor", "copy_tensor.cl" },
     { "copy_plane", "channel_extract.cl" },
     { "copy_planes_3p", "channel_combine.cl" },
     { "copy_to_keypoint", "fast_corners.cl" },
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/copy_tensor.cl
index 95da9a3..9c90969 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/copy_tensor.cl
@@ -23,60 +23,6 @@
  */
 #include "helpers.h"
 
-#if defined(PAD00) && defined(PAD10) && defined(PAD20) && defined(PAD21) && defined(PAD30) && defined(DATA_TYPE) && defined(VEC_SIZE) // Compile time constants
-
-/** Perform a padded copy of input tensor to the output tensor. Padding values are defined at compile time
- *
- * @attention The following variables must be passed at compile time:
- * -# -DPAD{d}{0,1} = padding before{0} and after{1} dimension d (d < 4)
- * -# -DDEPTH = The third dimension (depth) of the tensor (it is needed only if d == 3)
- * -# -DDATA_TYPE = Input and output datatypes.
- *
- * @param[in]  in_ptr                            Pointer to the source tensor. Supported data types: All
- * @param[in]  in_stride_x                       Stride of the source tensor in X dimension (in bytes)
- * @param[in]  in_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  in_stride_y                       Stride of the source tensor in Y dimension (in bytes)
- * @param[in]  in_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  in_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  in_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  in_offset_first_element_in_bytes  The offset of the first element in the source tensor
- * @param[out] out_ptr                           Pointer to the destination tensor. Supported data types: same as @p in_ptr
- * @param[in]  out_stride_x                      Stride of the destination tensor in X dimension (in bytes)
- * @param[in]  out_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  out_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
- * @param[in]  out_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  out_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  out_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  out_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void copy_pad_tensor(
-    TENSOR3D_DECLARATION(in),
-    TENSOR3D_DECLARATION(out))
-
-{
-    Tensor3D in  = CONVERT_TO_TENSOR3D_STRUCT(in);
-    Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
-
-    const int offset_x = PAD00;
-    const int offset_y = PAD10;
-    const int offset_z = PAD20;
-
-#if PAD30 > 0
-    const size_t in_batch    = get_global_id(2) / DEPTH;
-    const int    total_depth = DEPTH + PAD20 + PAD21;
-    const int    offset_w    = PAD30 * total_depth + in_batch * (PAD20 + PAD21);
-#else  // PAD30 == 0
-    const int offset_w = 0;
-#endif // PAD30
-
-    VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-    data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
-
-    VSTORE(VEC_SIZE)
-    (data, 0, (__global DATA_TYPE *)tensor3D_offset(&out, offset_x, offset_y, offset_z + offset_w));
-}
-#endif // Compile time constants
-
 #if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
 /** Performs a copy of input tensor to the output tensor.
  *
diff --git a/src/core/CL/kernels/CLCopyKernel.cpp b/src/core/CL/kernels/CLCopyKernel.cpp
index 769f15d..184b80c 100644
--- a/src/core/CL/kernels/CLCopyKernel.cpp
+++ b/src/core/CL/kernels/CLCopyKernel.cpp
@@ -26,11 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/AccessWindowStatic.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -39,100 +35,28 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding = PaddingList(), Window *output_window = nullptr)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, Window *output_window = nullptr)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_ON(!padding.empty() && output_window != nullptr);
-    ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
 
     // Validate output if initialized
     if(output->total_size() != 0)
     {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         if(output_window == nullptr)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding), output->tensor_shape());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output->tensor_shape());
         }
         else
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(input->tensor_shape(), output_window->shape());
         }
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
     return Status{};
 }
 
-std::pair<Status, Window> configure_window(ITensorInfo *input, ITensorInfo *output)
-{
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, *input);
-
-    // Configure window
-    const unsigned int vec_size_x = adjust_vec_size(16 / input->element_size(), input->dimension(0));
-
-    const Window win = calculate_max_window(*input, Steps(vec_size_x));
-    return std::make_pair(Status{}, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window_with_padding(ITensorInfo *input, ITensorInfo *output, const PaddingList &padding)
-{
-    TensorShape input_shape  = input->tensor_shape();
-    TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input_shape, padding);
-
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(padded_shape));
-
-    // Configure window
-    const unsigned int num_elems_processed_per_iteration = 16 / input->element_size();
-
-    Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-
-    // Pad on the x dimension accounting for the padding offset along the same dimension
-    AccessWindowHorizontal output_access(output, padding[0].first, num_elems_processed_per_iteration);
-    AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-    bool                   window_changed = update_window_and_padding(win, input_access, output_access);
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-/** Generate the string "-DPAD= @p dim @p index @p padding"
- *
- * @param[in] dim     The dimension index
- * @param[in] index   Can be 0 for the start dimension and 1 for the end dimension
- * @param[in] padding The value to pad for that index/dimension pair
- *
- * @return The correct concatenated string
- */
-std::string generate_pad_string(const size_t dim, const size_t index, const size_t padding)
-{
-    return "-DPAD" + support::cpp11::to_string(dim) + support::cpp11::to_string(index) + "=" + support::cpp11::to_string(padding);
-}
-
-/** Pass the padding as build option to the kernel.
- *
- * @param[in]  tensor     The padded tensor
- * @param[in]  padding    The list of the padding for each dimension
- * @param[out] build_opts The build option to which adding the padding
- */
-void add_padding_as_build_options(const PaddingList &padding, CLBuildOptions &build_opts)
-{
-    size_t dim = 0;
-    for(dim = 0; dim < padding.size(); dim++)
-    {
-        build_opts.add_option(generate_pad_string(dim, 0, padding[dim].first));
-        build_opts.add_option(generate_pad_string(dim, 1, padding[dim].second));
-    }
-
-    while(dim < TensorShape::num_max_dimensions)
-    {
-        build_opts.add_option(generate_pad_string(dim, 0, 0));
-        build_opts.add_option(generate_pad_string(dim, 1, 0));
-        dim++;
-    }
-}
-
 } // namespace
 
 CLCopyKernel::CLCopyKernel()
@@ -140,15 +64,15 @@
 {
 }
 
-void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, Window *output_window)
+void CLCopyKernel::configure(const ICLTensor *input, ICLTensor *output, Window *output_window)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, output_window);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, output_window);
 }
 
-void CLCopyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, Window *output_window)
+void CLCopyKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, Window *output_window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, output_window));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), output_window));
 
     auto padding_info = get_padding_info({ input, output });
 
@@ -159,80 +83,51 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
 
-    std::pair<Status, Window> win_config;
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*(output->info()), *(input->info()));
 
-    const unsigned int vec_size_x = 16 / input->info()->element_size();
+    // Configure window
+    const unsigned int vec_size_x = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
 
-    if(padding.empty())
+    const Window win_config = calculate_max_window(*(input->info()), Steps(vec_size_x));
+
+    if(output_window != nullptr)
     {
-        // Configure window
-        win_config = configure_window(input->info(), output->info());
+        _has_output_window             = true;
+        _output_window                 = Window(*output_window);
+        const int  width_x             = output_window->num_iterations(0);
+        const int  vec_size_x_leftover = width_x % vec_size_x;
+        const bool multi_access_x      = width_x >= static_cast<int32_t>(vec_size_x);
 
-        if(output_window != nullptr)
+        if(multi_access_x)
         {
-            _has_output_window             = true;
-            _output_window                 = Window(*output_window);
-            const int  width_x             = output_window->num_iterations(0);
-            const int  vec_size_x_leftover = width_x % vec_size_x;
-            const bool multi_access_x      = width_x >= static_cast<int32_t>(vec_size_x);
-
-            if(multi_access_x)
-            {
-                _output_window.set(Window::DimX, Window::Dimension(output_window->x().start(), ceil_to_multiple(output_window->x().end(), vec_size_x), vec_size_x));
-            }
-
-            build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
-        }
-        else
-        {
-            const int width_x             = input->info()->tensor_shape().x();
-            const int vec_size_x_leftover = width_x % vec_size_x;
-
-            build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
+            _output_window.set(Window::DimX, Window::Dimension(output_window->x().start(), ceil_to_multiple(output_window->x().end(), vec_size_x), vec_size_x));
         }
 
-        build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-
-        // Build kernel
-        _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options());
+        build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
     }
     else
     {
-        build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+        const int width_x             = input->info()->tensor_shape().x();
+        const int vec_size_x_leftover = width_x % vec_size_x;
 
-        // Add compile time options
-        add_padding_as_build_options(padding, build_opts);
-
-        // If we are padding in the fourth dimension the kernel needs to know the depth of the
-        // different cubes
-        if(padding.size() == 4)
-        {
-            const size_t depth = input->info()->tensor_shape()[2];
-            build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(depth));
-        }
-
-        // Build kernel
-        _kernel = create_kernel(compile_context, "copy_pad_tensor", build_opts.options());
-
-        // Configure window
-        win_config = validate_and_configure_window_with_padding(input->info(), output->info(), padding);
+        build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
     }
 
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+
+    // Build kernel
+    _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options());
+
     // Validate and set the window
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    ICLKernel::configure_internal(win_config.second);
+    ICLKernel::configure_internal(win_config);
 
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, const PaddingList &padding, Window *output_window)
+Status CLCopyKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, Window *output_window)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, output_window));
-
-    if(!padding.empty())
-    {
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(input->clone().get(), output->clone().get(), padding).first);
-    }
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, output_window));
 
     return Status{};
 }
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index 6167e9d..4cf9f13 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -146,7 +146,7 @@
         win.set(3, Window::Dimension(num_box, num_box + 1, 1));
 
         auto copy_kernel = support::cpp14::make_unique<CLCopyKernel>();
-        copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, PaddingList(), &win);
+        copy_kernel->configure(compile_context, _scaled_results[num_box].get(), _output, &win);
         _copy.emplace_back(std::move(copy_kernel));
 
         _crop_results[num_box]->allocator()->allocate();
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 12a51f1..fb6078cc 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -67,9 +67,7 @@
     }
     else
     {
-        Window copy_window = Window();
-        copy_window.use_tensor_dimensions(output->tensor_shape());
-        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output, PaddingList(), &copy_window));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(input, output));
     }
     return Status{};
 }