Remove padding from ClPool2dKernel NCHW

- Simplify NCHW kernel structure by removing old optimized paths
- Merge the quantized kernels with the floating-point kernels

Resolve COMPMID-4722

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: I79016b119619aed6a6193295601cd6517f14b88c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6183
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index 5cd969e..c05bb96 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -328,10 +328,6 @@
     { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" },
     { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" },
     { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" },
-    { "pooling_layer_2", "common/pooling_layer.cl" },
-    { "pooling_layer_3", "common/pooling_layer.cl" },
-    { "pooling_layer_optimized_3", "common/pooling_layer.cl" },
-    { "pooling_layer_7", "common/pooling_layer.cl" },
     { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" },
     { "quantization_layer", "common/quantization_layer.cl" },
     { "range", "common/range.cl" },
@@ -385,9 +381,7 @@
     { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" },
     { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" },
     { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" },
-    { "pooling_layer_2_nchw_indices_fp32", "nchw/pooling_layer.cl" },
-    { "pooling_layer_2_nchw_indices_fp16", "nchw/pooling_layer.cl" },
-    { "pooling_layer_MxN_quantized_nchw", "nchw/pooling_layer_quantized.cl" },
+    { "pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl" },
     { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" },
     { "remap_nearest_neighbour_nchw", "nchw/remap.cl" },
     { "remap_bilinear_nchw", "nchw/remap.cl" },
@@ -668,10 +662,6 @@
 #include "./cl_kernels/common/pixelwise_mul_int.clembed"
     },
     {
-        "common/pooling_layer.cl",
-#include "./cl_kernels/common/pooling_layer.clembed"
-    },
-    {
         "common/qlstm_layer_normalization.cl",
 #include "./cl_kernels/common/qlstm_layer_normalization.clembed"
     },
@@ -805,10 +795,6 @@
 #include "./cl_kernels/nchw/pooling_layer.clembed"
     },
     {
-        "nchw/pooling_layer_quantized.cl",
-#include "./cl_kernels/nchw/pooling_layer_quantized.clembed"
-    },
-    {
         "nchw/prior_box_layer.cl",
 #include "./cl_kernels/nchw/prior_box_layer.clembed"
     },
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp
index 04f2b14..5e53799 100644
--- a/src/gpu/cl/kernels/ClPool2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp
@@ -23,18 +23,13 @@
  */
 #include "src/gpu/cl/kernels/ClPool2dKernel.h"
 
-#include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
-#include "support/StringSupport.h"
 
 namespace arm_compute
 {
@@ -46,19 +41,6 @@
 
 namespace
 {
-// Internal window config info
-using ClPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size
-
-void auto_init(const ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, PoolingLayerInfo pool_info)
-{
-    TensorShape out_shape = compute_pool_shape(*src, pool_info);
-    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
-    if(indices)
-    {
-        auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
-    }
-}
-
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
@@ -104,102 +86,6 @@
 
     return Status{};
 }
-
-std::tuple<Status, Window, ClPoolingConfig> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-
-    // Get data layout
-    const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
-    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    unsigned int        pooled_w        = 0;
-    unsigned int        pooled_h        = 0;
-    int                 pool_size_x     = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
-    int                 pool_size_y     = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
-    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
-    std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-    const int  pool_pad_right  = pad_stride_info.pad_right();
-    const int  pool_pad_top    = pad_stride_info.pad_top();
-    const int  pool_pad_left   = pad_stride_info.pad_left();
-    const int  pool_pad_bottom = pad_stride_info.pad_bottom();
-    BorderSize border_size     = BorderSize();
-
-    auto_init(src, dst, indices, pool_info);
-    pooled_w = dst->tensor_shape()[idx_width];
-    pooled_h = dst->tensor_shape()[idx_height];
-
-    const DataType data_type = src->data_type();
-
-    const int src_width  = src->dimension(idx_width);
-    const int src_height = src->dimension(idx_height);
-
-    unsigned int num_elems_processed_per_iteration = 0;
-    bool         window_changed                    = false;
-    Window       win{};
-    switch(data_layout)
-    {
-        case DataLayout::NCHW:
-        {
-            // Initialize border size
-            border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left);
-            // Change the number of elements processed per iteration
-            // for pooling 3x3 with stride less equal than 3
-            const bool can_optimize                         = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type);
-            num_elems_processed_per_iteration               = can_optimize ? 4 : 1;
-            const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x;
-
-            // Number of iterations in X dimension
-            const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration;
-
-            // Upper limit for the number of right/bottom border elements that are accessed
-            const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width;
-            const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height;
-
-            border_size.right  = std::max(upper_bound_w, pool_pad_right);
-            border_size.bottom = std::max(upper_bound_h, pool_pad_bottom);
-
-            win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
-
-            AccessWindowRectangle src_access(src, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y,
-                                             pool_stride_x, pool_stride_y);
-            AccessWindowHorizontal dst_access(dst, 0, num_elems_processed_per_iteration);
-
-            // Update indices window
-            if(indices)
-            {
-                AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration);
-                window_changed = update_window_and_padding(win, src_access, dst_access, indices_access);
-                indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape()));
-            }
-            else
-            {
-                window_changed = update_window_and_padding(win, src_access, dst_access);
-            }
-
-            dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
-            break;
-        }
-        case DataLayout::NHWC:
-        {
-            const size_t vec_size = dst->data_type() == DataType::F32 ? 2 : 4;
-
-            // Initialize border size
-            border_size                       = BorderSize();
-            num_elems_processed_per_iteration = adjust_vec_size(vec_size, dst->dimension(0));
-            win                               = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not implemented");
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_tuple(err, win, ClPoolingConfig(num_elems_processed_per_iteration, border_size));
-}
 } // namespace
 
 ClPool2dKernel::ClPool2dKernel()
@@ -207,20 +93,27 @@
     _type = CLKernelType::POOL;
 }
 
-BorderSize ClPool2dKernel::border_size() const
-{
-    return _border_size;
-}
-
 void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices));
 
     auto padding_info = get_padding_info({ src, dst, indices });
 
+    // Auto init if empty
+    TensorShape out_shape = compute_pool_shape(*src, pool_info);
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
+    if(indices)
+    {
+        auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
+    }
+
     // Set instance variables
-    _pool_info                          = pool_info;
-    _data_layout                        = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+    _pool_info                         = pool_info;
+    _data_layout                       = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+    _num_elems_processed_per_iteration = (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4);
+    _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
+
     int                 pool_stride_x   = 0;
     int                 pool_stride_y   = 0;
     const PoolingType   pool_type       = pool_info.pool_type;
@@ -233,53 +126,13 @@
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
     const bool          exclude_padding = pool_info.exclude_padding;
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-    const int pool_pad_top  = pad_stride_info.pad_top();
-    const int pool_pad_left = pad_stride_info.pad_left();
+    const int      pool_pad_top  = pad_stride_info.pad_top();
+    const int      pool_pad_left = pad_stride_info.pad_left();
+    const DataType data_type     = src->data_type();
 
     // Set build options
     CLBuildOptions build_opts;
-    const DataType data_type = src->data_type();
-
-    // Configure kernel window
-    auto win_config = validate_and_configure_window(src, dst, pool_info, indices);
-
-    ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config));
-    ICLKernel::configure_internal(std::get<1>(win_config));
-
-    ClPoolingConfig pooling_config     = std::get<2>(win_config);
-    _num_elems_processed_per_iteration = pooling_config.first;
-    _border_size                       = pooling_config.second;
-
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration));
-
-    // Tensor paddings are used to calculate the indicies for MAX pooling
-    if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
-    {
-        build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(src->padding().left));
-        build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(src->padding().right));
-        build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(src->padding().top));
-        build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(src->padding().bottom));
-        build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(src->dimension(idx_channel)));
-        build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
-        build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
-    }
-
-    if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info())
-    {
-        const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
-
-        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
-        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
-        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
-        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
-    }
-
-    // Check dst dimensions
-    auto_init(src, dst, indices, pool_info);
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices));
-
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
     build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
@@ -288,6 +141,32 @@
     build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top));
     build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x));
     build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
+    build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
+    build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+    build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+
+    // SRC_BATCH is only passed for float 2x2 MAX pooling when an indices tensor is requested
+    if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
+    {
+        build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(src->tensor_shape().total_size_lower(3)));
+    }
+
+    if(is_data_type_quantized_asymmetric(data_type))
+    {
+        build_opts.add_option("-DQUANTIZED");
+
+        if(src->quantization_info() != dst->quantization_info())
+        {
+            const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+            const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+            build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset));
+            build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+            build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale));
+            build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+        }
+    }
 
     // Set the initial value for the pooling operation accordingly with the data type
     if(pool_type == PoolingType::MAX)
@@ -309,9 +188,6 @@
         build_opts.add_option("-DINITIAL_VALUE=0");
     }
 
-    build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
-    build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
-
     // Create kernel
     switch(_data_layout)
     {
@@ -319,7 +195,7 @@
         {
             const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
             const auto use_wider_accumulator  = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
-            const auto acc_data_type          = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type);
+            const auto acc_data_type          = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : (is_data_type_quantized(data_type) ? DataType::S32 : data_type));
             build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
             build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
 
@@ -328,33 +204,15 @@
                 build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
             }
 
-            if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type))
-            {
-                // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where
-                // each thread computes 4 dst elements
-                const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3);
-
-                std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
-                                          + support::cpp11::to_string(pool_size_x);
-                _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-            }
-            else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
+            if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
             {
                 // For max pooling with pool2x2, store indicies which will be used in max unpooling
-                if(data_type == DataType::F32)
-                {
-                    std::string kernel_name = "pooling_layer_2_nchw_indices_fp32";
-                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
-                }
-                else if(data_type == DataType::F16)
-                {
-                    std::string kernel_name = "pooling_layer_2_nchw_indices_fp16";
-                    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
-                }
+                std::string kernel_name = "pooling_layer_2_nchw_indices";
+                _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
             }
             else // Run general case
             {
-                std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw";
+                std::string kernel_name = "pooling_layer_MxN_nchw";
                 _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
             }
             break;
@@ -405,6 +263,10 @@
             ARM_COMPUTE_ERROR("Not implemented");
     }
 
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
     // Set config_id for enabling LWS tuning
     _config_id = "pooling_layer_";
     _config_id += lower_string(string_from_data_type(data_type));
@@ -419,14 +281,12 @@
     _config_id += "_";
     _config_id += lower_string(string_from_data_layout(src->data_layout()));
 
-    ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
 Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info)));
-
     return Status{};
 }
 
@@ -453,18 +313,9 @@
             Window slice = window_collapsed.first_slice_window_3D();
             do
             {
-                // Upsample src by pool size
-                Window in_slice(slice);
-                in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(),
-                                                             (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x,
-                                                             pool_stride_x * _num_elems_processed_per_iteration));
-                in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(),
-                                                             (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y,
-                                                             pool_stride_y));
-
                 // Set srcs
                 unsigned int idx = 0;
-                add_3D_tensor_argument(idx, src, in_slice);
+                add_3D_tensor_argument(idx, src, slice);
                 add_3D_tensor_argument(idx, dst, slice);
                 if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
                 {
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.h b/src/gpu/cl/kernels/ClPool2dKernel.h
index 61d204d..f5bb068 100644
--- a/src/gpu/cl/kernels/ClPool2dKernel.h
+++ b/src/gpu/cl/kernels/ClPool2dKernel.h
@@ -61,12 +61,10 @@
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
-    BorderSize border_size() const override;
 
 public:
     PoolingLayerInfo _pool_info{};
     DataLayout       _data_layout{ DataLayout::UNKNOWN };
-    BorderSize       _border_size{ 0 };
     unsigned int     _num_elems_processed_per_iteration{ 1 };
 };
 } // namespace kernels
diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp
index fdadd19..a5b18a2 100644
--- a/src/gpu/cl/operators/ClPool2d.cpp
+++ b/src/gpu/cl/operators/ClPool2d.cpp
@@ -25,7 +25,6 @@
 
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClPool2dKernel.h"
 
@@ -40,62 +39,15 @@
     auto k = std::make_unique<kernels::ClPool2dKernel>();
     k->set_target(CLScheduler::get().target());
     k->configure(compile_context, src, dst, info, indices);
-    _pooling = std::move(k);
-
-    const DataType data_type = src->data_type();
-
-    // Configure border depending on operation required (quantize border in case of asymmetric data_type)
-    BorderMode border_mode{};
-    PixelValue pixel_value(0.f);
-    if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding)
-    {
-        pixel_value = PixelValue(0, data_type, src->quantization_info());
-    }
-
-    // Data layout
-    const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-
-    switch(data_layout)
-    {
-        case DataLayout::NCHW:
-            border_mode = (PoolingType::MAX == info.pool_type) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
-            break;
-        case DataLayout::NHWC:
-            border_mode = BorderMode::CONSTANT;
-            if(PoolingType::MAX == info.pool_type)
-            {
-                if(is_data_type_quantized(data_type))
-                {
-                    std::tie(pixel_value, std::ignore) = get_min_max(data_type);
-                }
-                else
-                {
-                    pixel_value = PixelValue(std::numeric_limits<float>::lowest());
-                }
-            }
-            break;
-        default:
-            ARM_COMPUTE_ERROR("Data layout not supported");
-    }
-    auto b = std::make_unique<CLFillBorderKernel>();
-    b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value);
-    _border_handler = std::move(b);
+    _kernel = std::move(k);
 
     // Tune kernels
-    CLScheduler::get().tune_kernel_static(*_pooling);
+    CLScheduler::get().tune_kernel_static(*_kernel);
 }
 
 Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
 {
     return kernels::ClPool2dKernel::validate(src, dst, info, indices);
 }
-
-void ClPool2d::run(ITensorPack &tensors)
-{
-    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-
-    CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
-    CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false);
-}
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h
index a041053..f353ba2 100644
--- a/src/gpu/cl/operators/ClPool2d.h
+++ b/src/gpu/cl/operators/ClPool2d.h
@@ -35,7 +35,6 @@
 {
 /** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
  *
- * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
  * -# @ref opencl::ClPool2d
  */
 class ClPool2d : public IClOperator
@@ -59,13 +58,6 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr);
-
-    // Inherited method overridden
-    void run(ITensorPack &tensors) override;
-
-private:
-    std::unique_ptr<ICLKernel> _pooling{ nullptr };
-    std::unique_ptr<ICLKernel> _border_handler{ nullptr };
 };
 } // namespace opencl
 } // namespace arm_compute