COMPMID-3145: Remove padding from NEScaleKernel

Change-Id: I530b12c6270d7dbeb3ef7af62484842ebcb65925
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4000
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
index a2328b1..b35bb72 100644
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -83,34 +83,44 @@
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
-    BorderSize border_size() const override;
 
 private:
-    /** function to perform scale using nearest interpolation on the given window */
-    void scale_nearest_nchw(const Window &window);
-    /** function to perform scale using bilinear interpolation on the given window */
-    void scale_bilinear_nchw(const Window &window);
     /** function to perform scale using area interpolation on the given window
      *
      *  @note Used only in case down-sampling.
      */
-    void scale_area_nchw(const Window &window);
-    /** function to perform scale on the given window */
-    void scale_nhwc(const Window &window);
-    /** Scale function to use for the particular interpolation type passed to configure() */
-    void (NEScaleKernel::*_func)(const Window &window);
+    void scale_area_nchw_u8(const Window &window);
 
+    /** function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_nchw(const Window &window);
+    /** function to perform scale using bilinear interpolation on the given window */
+    template <typename T>
+    void scale_bilinear_nhwc(const Window &window);
+    /** function to perform scale using bilinear interpolation on the given window for quantized data types */
+    template <typename T>
+    void scale_bilinear_qasymm(const Window &window);
+
+    /** function to perform scale using nearest neighbour on the given window */
+    template <typename T>
+    void scale_nearest_nchw(const Window &window);
+    /** function to perform scale using nearest neighbour on the given window */
+    template <typename T>
+    void scale_nearest_nhwc(const Window &window);
+
+    /** Pointer type of the scale function selected at configure() time */
+    using ScaleFunctionPtr = void (NEScaleKernel::*)(const Window &window);
+
+    ScaleFunctionPtr    _func;
     const ITensor      *_offsets;
     const ITensor      *_dx;
     const ITensor      *_dy;
     const ITensor      *_input;
     ITensor            *_output;
     InterpolationPolicy _policy;
-    BorderSize          _border_size;
     BorderMode          _border_mode;
     PixelValue          _constant_border_value;
     float               _sampling_offset;
-    bool                _use_padding;
     bool                _align_corners;
 };
 } // namespace arm_compute
diff --git a/arm_compute/runtime/NEON/functions/NEScale.h b/arm_compute/runtime/NEON/functions/NEScale.h
index f149e3b..4063e55 100644
--- a/arm_compute/runtime/NEON/functions/NEScale.h
+++ b/arm_compute/runtime/NEON/functions/NEScale.h
@@ -24,20 +24,17 @@
 #ifndef ARM_COMPUTE_NESCALEIMAGE_H
 #define ARM_COMPUTE_NESCALEIMAGE_H
 
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEScaleKernel.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
 #include "arm_compute/runtime/Tensor.h"
 
-#include <cstdint>
-
 namespace arm_compute
 {
 class ITensor;
 
 /** Basic function to run @ref NEScaleKernel */
-class NEScale : public IFunction
+class NEScale : public INESimpleFunctionNoBorder
 {
 public:
     /** Constructor
@@ -47,20 +44,6 @@
     NEScale();
     /** Initialize the function's source, destination, interpolation type and border_mode.
      *
-     * @param[in, out] input                 Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
-     * @param[out]     output                Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in]      policy                The interpolation type.
-     * @param[in]      border_mode           Strategy to use for borders.
-     * @param[in]      constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     * @param[in]      sampling_policy       (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in]      use_padding           (Optional) Is padding in use or not. Defaults to true.
-     * @param[in]      align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.08)
-    void configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value = PixelValue(),
-                   SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
-    /** Initialize the function's source, destination, interpolation type and border_mode.
-     *
      * @param[in, out] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[out]     output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]      info   @ref ScaleKernelInfo to be used for configuration
@@ -68,22 +51,6 @@
     void configure(ITensor *input, ITensor *output, const ScaleKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration of @ref NEScale
      *
-     * @param[in] input                 Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
-     * @param[in] output                Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
-     * @param[in] policy                The interpolation type.
-     * @param[in] border_mode           Strategy to use for borders.
-     * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
-     * @param[in] sampling_policy       (Optional) Sampling policy used by the interpolation. Defaults to @ref SamplingPolicy::CENTER
-     * @param[in] use_padding           (Optional) Is padding in use or not. Defaults to true.
-     * @param[in] align_corners         (Optional) Align corners of input and output, only affecting bilinear policy with TOP_LEFT sampling policy. Defaults to false.
-     *
-     * @return a status
-     */
-    ARM_COMPUTE_DEPRECATED_REL(20.08)
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy, BorderMode border_mode,
-                           PixelValue constant_border_value = PixelValue(), SamplingPolicy sampling_policy = SamplingPolicy::CENTER, bool use_padding = true, bool align_corners = false);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEScale
-     *
      * @param[in] input  Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/U8/S16/F16/F32. (Written to only for @p border_mode != UNDEFINED)
      * @param[in] output Destination tensor. Data type supported: Same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in] info   @ref ScaleKernelInfo to be used for validation
@@ -92,16 +59,10 @@
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info);
 
-    // Inherited methods overridden:
-    void run() override;
-
 private:
-    Tensor             _offsets;        /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
-    Tensor             _dx;             /**< Element's distance between the X real coordinate and the smallest X following integer */
-    Tensor             _dy;             /**< Element's distance between the Y real coordinate and the smallest Y following integer */
-    NEScaleKernel      _scale_kernel;   /**< Kernel to perform the scaling */
-    NEFillBorderKernel _border_handler; /**< kernel to handle tensor borders */
-    bool               _use_padding;    /**< Is padding used on the tensors */
+    Tensor _offsets; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
+    Tensor _dx;      /**< Element's distance between the X real coordinate and the smallest X following integer */
+    Tensor _dy;      /**< Element's distance between the Y real coordinate and the smallest Y following integer */
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NESCALEIMAGE_H */
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 94fcfe2..1a85352 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -34,11 +34,24 @@
 #include "src/core/utils/ScaleUtils.h"
 
 #include <arm_neon.h>
+#include <map>
 
 namespace arm_compute
 {
 namespace
 {
+inline float compute_bilinear(float a00, float a01, float a10, float a11, float dx_val, float dy_val)
+{
+    const float dx1_val = 1.0f - dx_val;
+    const float dy1_val = 1.0f - dy_val;
+
+    const float w1 = dx1_val * dy1_val;
+    const float w2 = dx_val * dy1_val;
+    const float w3 = dx1_val * dy_val;
+    const float w4 = dx_val * dy_val;
+    return a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4;
+}
+
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
                           const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
 {
@@ -48,8 +61,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output == input);
     ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
-    ARM_COMPUTE_RETURN_ERROR_ON(!info.use_padding && info.border_mode != BorderMode::CONSTANT);
     ARM_COMPUTE_UNUSED(info.constant_border_value);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported");
 
     const DataLayout data_layout   = input->data_layout();
     const auto       width_index   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -71,7 +84,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+    ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
 
     if(info.interpolation_policy == InterpolationPolicy::AREA)
     {
@@ -81,267 +94,14 @@
 
     return Status{};
 }
-
-std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output,
-                                                             const ScaleKernelInfo &info, BorderSize border_size)
-{
-    bool   window_changed{ false };
-    Window win{};
-
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
-
-    // Configure kernel window
-    win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
-    const ValidRegion &input_valid_region = input->valid_region();
-
-    if(offsets != nullptr)
-    {
-        AccessWindowHorizontal offsets_access(offsets, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, offsets_access);
-    }
-    if(dx != nullptr && dy != nullptr)
-    {
-        AccessWindowHorizontal dx_access(dx, 0, num_elems_processed_per_iteration);
-        AccessWindowHorizontal dy_access(dy, 0, num_elems_processed_per_iteration);
-        window_changed = window_changed || update_window_and_padding(win, dx_access, dy_access);
-    }
-
-    // Reads can occur within the valid region of the input
-    AccessWindowStatic input_access(input, input_valid_region.anchor[0] - border_size.left,
-                                    input_valid_region.anchor[1] - border_size.top,
-                                    input_valid_region.anchor[0] + input_valid_region.shape[0] + border_size.right,
-                                    input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size.bottom);
-    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-    window_changed = window_changed || update_window_and_padding(win, input_access, output_access);
-    output_access.set_valid_region(win, calculate_valid_region_scale(*input, output->tensor_shape(),
-                                                                     info.interpolation_policy, info.sampling_policy, info.border_mode == BorderMode::UNDEFINED));
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window_nhwc(ITensorInfo *input, ITensorInfo *output, const ScaleKernelInfo &info, BorderSize border_size)
-{
-    bool   window_changed{ false };
-    Window win{};
-
-    const unsigned int num_elems_processed_per_iteration = (info.use_padding && info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR) ? 16 / input->element_size() : 1;
-
-    // Configure kernel window
-    win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
-
-    if(info.use_padding)
-    {
-        AccessWindowStatic     input_access(input, 0, -border_size.top, ceil_to_multiple(input->tensor_shape()[0], num_elems_processed_per_iteration), input->tensor_shape()[1]);
-        AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-        window_changed = update_window_and_padding(win, input_access, output_access);
-        output->set_valid_region(calculate_valid_region_scale(*input, output->tensor_shape(), info.interpolation_policy, info.sampling_policy, info.border_mode == BorderMode::UNDEFINED));
-    }
-
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
-    return std::make_pair(err, win);
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *dx, ITensorInfo *dy, ITensorInfo *offsets, ITensorInfo *output,
-                                                        const ScaleKernelInfo &info, BorderSize border_size)
-{
-    std::pair<Status, Window> win_config;
-    switch(input->data_layout())
-    {
-        case DataLayout::NCHW:
-            if(!info.use_padding)
-            {
-                return std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Padding required for NCHW"), Window{});
-            }
-            win_config = validate_and_configure_window_nchw(input, dx, dy, offsets, output, info, border_size);
-            break;
-        case DataLayout::NHWC:
-            win_config = validate_and_configure_window_nhwc(input, output, info, border_size);
-            break;
-        default:
-            win_config = std::make_pair(ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported data layout!"), Window{});
-    }
-
-    return win_config;
-}
-
-template <typename T>
-inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets, ITensor *output,
-                                    float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, float sampling_offset, bool align_corners)
-{
-    const int  window_step_x  = 16 / sizeof(T);
-    const auto window_start_x = static_cast<int32_t>(window.x().start());
-    const auto window_end_x   = static_cast<int32_t>(window.x().end());
-
-    window.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator in(input, win_in);
-    Iterator out(output, window);
-
-    const size_t offsets_stride = stride_w / sizeof(T);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
-        const auto    in_yi      = static_cast<int>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
-        const int     offset_row = in_yi * stride_h;
-        int32_t       x          = window_start_x;
-        for(; x < window_end_x - window_step_x; x += window_step_x)
-        {
-            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
-                            wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + offset * offsets_stride + offset_row + x * stride_c)));
-        }
-        for(; x < window_end_x; ++x)
-        {
-            *(reinterpret_cast<T *>(out.ptr()) + x) =
-                *(reinterpret_cast<const T *>(in.ptr() + offset * offsets_stride + offset_row + x * stride_c));
-        }
-    },
-    in, out);
-}
-
-template <typename T, typename ConstType>
-inline void scale_bilinear_nhwc_core(const ITensor *input, const ITensor *offsets, const ITensor *dx, const ITensor *dy, ITensor *output,
-                                     float hr, float sampling_offset, Window window, const Window &win_in, size_t stride_w, size_t stride_h,
-                                     size_t stride_c, BorderMode border_mode, PixelValue constant_border_value, bool use_padding)
-{
-    Iterator in(input, win_in);
-    Iterator out(output, window);
-
-    const size_t stride_w_elems = stride_w / sizeof(T);
-    const size_t stride_h_elems = stride_h / sizeof(T);
-
-    const int input_width  = input->info()->dimension(1);
-    const int input_height = input->info()->dimension(2);
-
-    T border_value;
-    if(use_padding && border_mode != BorderMode::REPLICATE)
-    {
-        // configure() sets top border to 0 for BorderMode::REPLICATE and border_value is not needed in execute_window_loop() for REPLICATE
-        border_value = *reinterpret_cast<T *>(input->buffer() + input->info()->offset_first_element_in_bytes() - stride_w);
-    }
-    else
-    {
-        border_value = static_cast<T>(constant_border_value.get<ConstType>());
-    }
-
-    auto is_valid = [](int64_t x, int64_t low_x, int64_t high_x, int64_t y, int64_t low_y, int64_t high_y)
-    {
-        return !(x < low_x || x > high_x || y < low_y || y > high_y);
-    };
-
-    int border_size = (border_mode == BorderMode::UNDEFINED) ? 0 : 1;
-
-    const UniformQuantizationInfo iq_info = input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo oq_info = output->info()->quantization_info().uniform();
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto offset     = (*reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())))) / static_cast<int>(sizeof(T));
-        const auto dx_scale   = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
-        const auto dy_scale   = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
-        const int  in_yi      = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-        const int  offset_row = in_yi * stride_h + id.x() * stride_c;
-        const T   *in_ptr     = reinterpret_cast<T *>(in.ptr() + offset * stride_w + offset_row);
-
-        if(is_valid(offset, -border_size, input_width - 1 + border_size, in_yi, -border_size, input_height - 1 + border_size))
-        {
-            T a00 = 0;
-            T a01 = 0;
-            T a10 = 0;
-            T a11 = 0;
-
-            if(border_mode == BorderMode::CONSTANT)
-            {
-                a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : border_value;
-                a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : border_value;
-                a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : border_value;
-                a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : border_value;
-            }
-            else if(border_mode == BorderMode::REPLICATE)
-            {
-                auto clamped_x  = utility::clamp<int>(offset, 0, input_width - 1);
-                auto clamped_x1 = utility::clamp<int>(offset + 1, 0, input_width - 1);
-                auto clamped_y  = utility::clamp<int>(in_yi, 0, input_height - 1);
-                auto clamped_y1 = utility::clamp<int>(in_yi + 1, 0, input_height - 1);
-
-                a00 = *reinterpret_cast<T *>(in.ptr() + clamped_x * stride_w + clamped_y * stride_h + id.x() * stride_c);
-                a01 = *reinterpret_cast<T *>(in.ptr() + clamped_x1 * stride_w + clamped_y * stride_h + id.x() * stride_c);
-                a10 = *reinterpret_cast<T *>(in.ptr() + clamped_x * stride_w + clamped_y1 * stride_h + id.x() * stride_c);
-                a11 = *reinterpret_cast<T *>(in.ptr() + clamped_x1 * stride_w + clamped_y1 * stride_h + id.x() * stride_c);
-            }
-            else
-            {
-                a00 = is_valid(offset, 0, input_width - 1, in_yi, 0, input_height - 1) ? *in_ptr : 0;
-                a01 = is_valid(offset + 1, 0, input_width - 1, in_yi, 0, input_height - 1) ? *(in_ptr + stride_w_elems) : 0;
-                a10 = is_valid(offset, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems) : 0;
-                a11 = is_valid(offset + 1, 0, input_width - 1, in_yi + 1, 0, input_height - 1) ? *(in_ptr + stride_h_elems + stride_w_elems) : 0;
-            }
-
-            // Perform interpolation
-            const float dx1 = 1.0f - dx_scale;
-            const float dy1 = 1.0f - dy_scale;
-
-            const float w1 = dx1 * dy1;
-            const float w2 = dx_scale * dy1;
-            const float w3 = dx1 * dy_scale;
-            const float w4 = dx_scale * dy_scale;
-
-            T res = 0;
-            //dequantize quantized input
-            if(input->info()->data_type() == DataType::QASYMM8)
-            {
-                float inp00 = dequantize_qasymm8(a00, iq_info);
-                float inp01 = dequantize_qasymm8(a01, iq_info);
-                float inp10 = dequantize_qasymm8(a10, iq_info);
-                float inp11 = dequantize_qasymm8(a11, iq_info);
-                res         = static_cast<T>(quantize_qasymm8((inp00 * w1 + inp01 * w2 + inp10 * w3 + inp11 * w4), oq_info));
-            }
-            else if(input->info()->data_type() == DataType::QASYMM8_SIGNED)
-            {
-                float inp00 = dequantize_qasymm8_signed(a00, iq_info);
-                float inp01 = dequantize_qasymm8_signed(a01, iq_info);
-                float inp10 = dequantize_qasymm8_signed(a10, iq_info);
-                float inp11 = dequantize_qasymm8_signed(a11, iq_info);
-                res         = static_cast<T>(quantize_qasymm8_signed((inp00 * w1 + inp01 * w2 + inp10 * w3 + inp11 * w4), oq_info));
-            }
-            else
-            {
-                res = static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
-            }
-            // Store result
-            *reinterpret_cast<T *>(out.ptr()) = res;
-        }
-        else
-        {
-            if(border_mode == BorderMode::CONSTANT)
-            {
-                *reinterpret_cast<T *>(out.ptr()) = border_value;
-            }
-            else if(border_mode == BorderMode::REPLICATE)
-            {
-                auto clamped_x                    = utility::clamp<int>(offset, 0, input_width - 1);
-                auto clamped_y                    = utility::clamp<int>(in_yi, 0, input_height - 1);
-                *reinterpret_cast<T *>(out.ptr()) = *reinterpret_cast<T *>(in.ptr() + clamped_x * stride_w + clamped_y * stride_h + id.x() * stride_c);
-            }
-        }
-    },
-    in, out);
-}
 } // namespace
 
 NEScaleKernel::NEScaleKernel()
-    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode(), _constant_border_value(PixelValue()),
-      _sampling_offset(0), _use_padding(true), _align_corners(false)
+    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_mode(), _constant_border_value(PixelValue()), _sampling_offset(0),
+      _align_corners(false)
 {
 }
 
-BorderSize NEScaleKernel::border_size() const
-{
-    return _border_size;
-}
-
 void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets,
                               ITensor *output, const ScaleKernelInfo &info)
 {
@@ -365,10 +125,8 @@
     _dx                    = dx;
     _dy                    = dy;
     _policy                = info.interpolation_policy;
-    _border_size           = BorderSize(1);
     _border_mode           = info.border_mode;
     _constant_border_value = info.constant_border_value;
-    _use_padding           = info.use_padding;
     _align_corners         = info.align_corners;
 
     if(info.sampling_policy == SamplingPolicy::CENTER)
@@ -377,58 +135,85 @@
     }
 
     // Compute the ratio between source width/height and destination width/height
-    const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners);
-    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners);
-
-    // Add constant border only on top in case of NHWC layout
-    if(data_layout == DataLayout::NHWC)
-    {
-        _border_size = (info.border_mode != BorderMode::REPLICATE && info.interpolation_policy == InterpolationPolicy::BILINEAR && info.use_padding) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
-    }
+    const auto wr = scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), _align_corners);
+    const auto hr = scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), _align_corners);
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
     const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy;
 
-    // Select interpolation function
-    switch(policy_to_use)
+    if(_border_mode == BorderMode::UNDEFINED)
     {
-        case InterpolationPolicy::NEAREST_NEIGHBOR:
-        {
-            _func = (data_layout == DataLayout::NCHW) ? &NEScaleKernel::scale_nearest_nchw : &NEScaleKernel::scale_nhwc;
-            break;
-        }
-        case InterpolationPolicy::BILINEAR:
-        {
-            _func = (data_layout == DataLayout::NCHW) ? &NEScaleKernel::scale_bilinear_nchw : &NEScaleKernel::scale_nhwc;
-            break;
-        }
-        case InterpolationPolicy::AREA:
-        {
-            _func = &NEScaleKernel::scale_area_nchw;
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Unsupported interpolation mode");
+        _border_mode           = BorderMode::CONSTANT;
+        _constant_border_value = PixelValue();
+    }
+    std::string function_to_call("scale_");
+    function_to_call += string_from_data_type(_input->info()->data_type()) + "_";
+    function_to_call += string_from_data_layout(_input->info()->data_layout()) + "_";
+    function_to_call += string_from_interpolation_policy(policy_to_use);
+
+    static std::map<std::string, ScaleFunctionPtr> map_function =
+    {
+        { "scale_U8_NCHW_AREA_CONSTANT", &NEScaleKernel::scale_area_nchw_u8 },
+
+        { "scale_U8_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<uint8_t> },
+        { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint8_t> },
+
+        { "scale_U8_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<uint8_t> },
+        { "scale_U8_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint8_t> },
+
+        { "scale_QASYMM8_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<uint8_t> },
+        { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint8_t> },
+
+        { "scale_QASYMM8_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<uint8_t> },
+        { "scale_QASYMM8_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint8_t> },
+
+        { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<int8_t> },
+        { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint8_t> },
+
+        { "scale_QASYMM8_SIGNED_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_qasymm<int8_t> },
+        { "scale_QASYMM8_SIGNED_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint8_t> },
+
+        { "scale_S16_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<int16_t> },
+        { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint16_t> },
+
+        { "scale_S16_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<int16_t> },
+        { "scale_S16_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint16_t> },
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        { "scale_F16_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<float16_t> },
+        { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<uint16_t> },
+
+        { "scale_F16_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<float16_t> },
+        { "scale_F16_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<uint16_t> },
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+
+        { "scale_F32_NCHW_BILINEAR", &NEScaleKernel::scale_bilinear_nchw<float> },
+        { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nchw<float> },
+
+        { "scale_F32_NHWC_BILINEAR", &NEScaleKernel::scale_bilinear_nhwc<float> },
+        { "scale_F32_NHWC_NEAREST_NEIGHBOUR", &NEScaleKernel::scale_nearest_nhwc<float> },
+    };
+    auto it = map_function.find(function_to_call);
+    if(it != map_function.end())
+    {
+        _func = it->second;
     }
 
     // Configure window
-    std::pair<Status, Window> win_config = validate_and_configure_window(input->info(),
-                                                                         dx != nullptr ? dx->info() : nullptr,
-                                                                         dy != nullptr ? dy->info() : nullptr,
-                                                                         offsets != nullptr ? offsets->info() : nullptr,
-                                                                         output->info(),
-                                                                         info, border_size());
-
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    Window      win = calculate_max_window(*output->info(), Steps());
+    Coordinates coord;
+    coord.set_num_dimensions(output->info()->num_dimensions());
+    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+    INEKernel::configure(win);
 }
 
+template <typename T>
 void NEScaleKernel::scale_nearest_nchw(const Window &window)
 {
-    const size_t input_stride = _input->info()->strides_in_bytes()[1];
+    const size_t in_dim_x = _input->info()->dimension(0);
 
     // Compute the ratio between source height and destination height
-    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
 
     // Don't increment in X and Y direction for the input tensor
     // A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -449,216 +234,24 @@
     Iterator in(_input, win_in);
     Iterator out(_output, window);
     Iterator offsets(_offsets, win_off);
-
-    switch(_input->info()->data_type())
+    execute_window_loop(window, [&](const Coordinates & id)
     {
-        case DataType::QASYMM8_SIGNED:
-        {
-            int8x16_t tmp = vdupq_n_s8(0);
-
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto           offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const uint8_t *const in_ptr      = in.ptr();
-
-                const auto in_yi         = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
-                const int  in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
-                ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast<int>(_input->info()->dimension(1)));
-                const int offset_row = in_yi_clamped * input_stride;
-
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[2] + offset_row], tmp, 2);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[3] + offset_row], tmp, 3);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[4] + offset_row], tmp, 4);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[5] + offset_row], tmp, 5);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[6] + offset_row], tmp, 6);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[7] + offset_row], tmp, 7);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[8] + offset_row], tmp, 8);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[9] + offset_row], tmp, 9);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[10] + offset_row], tmp, 10);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[11] + offset_row], tmp, 11);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[12] + offset_row], tmp, 12);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[13] + offset_row], tmp, 13);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[14] + offset_row], tmp, 14);
-                tmp = vsetq_lane_s8(in_ptr[offsets_ptr[15] + offset_row], tmp, 15);
-
-                vst1q_s8(reinterpret_cast<int8_t *>(out.ptr()), tmp);
-            },
-            in, offsets, out);
-            break;
-        }
-        case DataType::QASYMM8:
-        case DataType::U8:
-        {
-            uint8x16_t tmp = vdupq_n_u8(0);
-
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto           offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const uint8_t *const in_ptr      = in.ptr();
-
-                const auto in_yi         = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
-                const int  in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
-                ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast<int>(_input->info()->dimension(1)));
-                const int offset_row = in_yi_clamped * input_stride;
-
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[2] + offset_row], tmp, 2);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[3] + offset_row], tmp, 3);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[4] + offset_row], tmp, 4);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[5] + offset_row], tmp, 5);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[6] + offset_row], tmp, 6);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[7] + offset_row], tmp, 7);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[8] + offset_row], tmp, 8);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[9] + offset_row], tmp, 9);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[10] + offset_row], tmp, 10);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[11] + offset_row], tmp, 11);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[12] + offset_row], tmp, 12);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[13] + offset_row], tmp, 13);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[14] + offset_row], tmp, 14);
-                tmp = vsetq_lane_u8(in_ptr[offsets_ptr[15] + offset_row], tmp, 15);
-
-                vst1q_u8(out.ptr(), tmp);
-            },
-            in, offsets, out);
-            break;
-        }
-        case DataType::S16:
-        {
-            int16x8x2_t tmp =
-            {
-                {
-                    vdupq_n_s16(0),
-                    vdupq_n_s16(0)
-                }
-            };
-
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto in_yi       = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
-                const int  offset_row  = in_yi * input_stride;
-
-                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
-                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
-                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2);
-                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3);
-                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4);
-                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5);
-                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6);
-                tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7);
-
-                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
-                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1);
-                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2);
-                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3);
-                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4);
-                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5);
-                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6);
-                tmp.val[1] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7);
-
-                vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
-            },
-            in, offsets, out);
-            break;
-        }
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            float16x8x2_t tmp =
-            {
-                {
-                    vdupq_n_f16(0),
-                    vdupq_n_f16(0)
-                }
-            };
-
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto in_yi       = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
-                const int  offset_row  = in_yi * input_stride;
-
-                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
-                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1);
-                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2);
-                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3);
-                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4);
-                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5);
-                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6);
-                tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7);
-
-                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
-                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1);
-                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2);
-                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3);
-                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4);
-                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5);
-                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6);
-                tmp.val[1] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7);
-
-                vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp);
-            },
-            in, offsets, out);
-            break;
-        }
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::F32:
-        {
-            float32x4x4_t tmp =
-            {
-                {
-                    vdupq_n_f32(0),
-                    vdupq_n_f32(0),
-                    vdupq_n_f32(0),
-                    vdupq_n_f32(0)
-                }
-            };
-
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto in_yi       = static_cast<int>(_align_corners ? arm_compute::utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
-                const int  offset_row  = in_yi * input_stride;
-
-                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
-                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 1);
-                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 2);
-                tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 3);
-
-                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0);
-                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 1);
-                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 2);
-                tmp.val[1] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 3);
-
-                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[2], 0);
-                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[2], 1);
-                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[2], 2);
-                tmp.val[2] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[2], 3);
-
-                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[3], 0);
-                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[3], 1);
-                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[3], 2);
-                tmp.val[3] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[3], 3);
-
-                vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
-            },
-            in, offsets, out);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-    }
+        const auto    offsets_ptr         = reinterpret_cast<const int32_t *>(offsets.ptr());
+        const auto    in_yi               = static_cast<int32_t>(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((id.y() + _sampling_offset) * hr));
+        const int32_t offset_row          = in_yi * in_dim_x;
+        *reinterpret_cast<T *>(out.ptr()) = *(reinterpret_cast<const T *>(in.ptr()) + offsets_ptr[0] + offset_row);
+    },
+    in, offsets, out);
 }
 
+template <typename T>
 void NEScaleKernel::scale_bilinear_nchw(const Window &window)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::F32);
-
     // Compute the ratio between source height and destination height
-    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    Window     win_off;
+    win_off.set(Window::DimX, window.x());
+    win_off.set(Window::DimY, window.y());
 
     // Don't increment in X and Y direction for the input tensor
     // A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -666,10 +259,6 @@
     win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
 
-    Window win_off;
-    win_off.set(Window::DimX, window.x());
-    win_off.set(Window::DimY, window.y());
-
     for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
     {
         win_off.set(d, Window::Dimension(0, 0, 0));
@@ -681,271 +270,71 @@
     Iterator dx(_dx, win_off);
     Iterator dy(_dy, win_off);
 
-    /* Input image stride */
-    const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
-    const size_t in_stride         = in_stide_in_bytes / _input->info()->element_size();
+    const int32_t in_dim_w = _input->info()->dimension(0);
+    const int32_t in_dim_h = _input->info()->dimension(1);
 
-    const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
-    const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
-
-    switch(_input->info()->data_type())
+    if(_border_mode == BorderMode::CONSTANT)
     {
-        case DataType::QASYMM8_SIGNED:
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-                const auto in_ptr      = reinterpret_cast<const int8_t *>(in.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-                const int offset_row = in_yi * in_stide_in_bytes;
-
-                int8x8_t tmp0 = vdup_n_s8(0);
-
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0], iq_info, oq_info), tmp0, 0);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1], iq_info, oq_info), tmp0, 1);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2], iq_info, oq_info), tmp0, 2);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3], iq_info, oq_info), tmp0, 3);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4], iq_info, oq_info), tmp0, 4);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5], iq_info, oq_info), tmp0, 5);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6], iq_info, oq_info), tmp0, 6);
-                tmp0 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7], iq_info, oq_info), tmp0, 7);
-
-                int8x8_t tmp1 = vdup_n_s8(0);
-
-                tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8], iq_info, oq_info), tmp1, 0);
-                tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9], iq_info, oq_info), tmp1, 1);
-                tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10], iq_info, oq_info), tmp1, 2);
-                tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11], iq_info, oq_info), tmp1, 3);
-                tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12], iq_info, oq_info), tmp1, 4);
-                tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13], iq_info, oq_info), tmp1, 5);
-                tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14], iq_info, oq_info), tmp1, 6);
-                tmp1 = vset_lane_s8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15], iq_info, oq_info), tmp1, 7);
-
-                vst1q_s8(reinterpret_cast<int8_t *>(out.ptr()), vcombine_s8(tmp0, tmp1));
-            },
-            in, offsets, dx, dy, out);
-            break;
-        }
-        case DataType::QASYMM8:
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-                const auto in_ptr      = reinterpret_cast<const uint8_t *>(in.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-                const int offset_row = in_yi * in_stide_in_bytes;
-
-                uint8x8_t tmp0 = vdup_n_u8(0);
-
-                tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0], iq_info, oq_info), tmp0, 0);
-                tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1], iq_info, oq_info), tmp0, 1);
-                tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2], iq_info, oq_info), tmp0, 2);
-                tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3], iq_info, oq_info), tmp0, 3);
-                tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4], iq_info, oq_info), tmp0, 4);
-                tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5], iq_info, oq_info), tmp0, 5);
-                tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6], iq_info, oq_info), tmp0, 6);
-                tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7], iq_info, oq_info), tmp0, 7);
-
-                uint8x8_t tmp1 = vdup_n_u8(0);
-
-                tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8], iq_info, oq_info), tmp1, 0);
-                tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9], iq_info, oq_info), tmp1, 1);
-                tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10], iq_info, oq_info), tmp1, 2);
-                tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11], iq_info, oq_info), tmp1, 3);
-                tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12], iq_info, oq_info), tmp1, 4);
-                tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13], iq_info, oq_info), tmp1, 5);
-                tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14], iq_info, oq_info), tmp1, 6);
-                tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15], iq_info, oq_info), tmp1, 7);
-
-                vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
-            },
-            in, offsets, dx, dy, out);
-            break;
-        }
-        case DataType::U8:
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-                const auto in_ptr      = reinterpret_cast<const uint8_t *>(in.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-                const int offset_row = in_yi * in_stide_in_bytes;
-
-                uint8x8_t tmp0 = vdup_n_u8(0);
-
-                tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
-                tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
-                tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
-                tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
-                tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
-                tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
-                tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
-                tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
-
-                uint8x8_t tmp1 = vdup_n_u8(0);
-
-                tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
-                tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
-                tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
-                tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
-                tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
-                tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
-                tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
-                tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
-
-                vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
-            },
-            in, offsets, dx, dy, out);
-            break;
-        }
-        case DataType::S16:
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-                const int offset_row = in_yi * in_stide_in_bytes;
-
-                int16x8x2_t tmp =
-                {
-                    {
-                        vdupq_n_s16(0),
-                        vdupq_n_s16(0)
-                    }
-                };
-
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
-                tmp.val[0] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
-
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
-                tmp.val[1] = vsetq_lane_s16(delta_bilinear_c1(reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
-
-                vst2q_s16(reinterpret_cast<int16_t *>(out.ptr()), tmp);
-            },
-            in, offsets, dx, dy, out);
-            break;
-        }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
-
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-                const int offset_row = in_yi * in_stide_in_bytes;
-
-                float16x8x2_t tmp =
-                {
-                    {
-                        vdupq_n_f16(0),
-                        vdupq_n_f16(0)
-                    }
-                };
-
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[0], 1);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 2);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[0], 3);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 4);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[0], 5);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 6);
-                tmp.val[0] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[0], 7);
-
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[1], 1);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 2);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[1], 3);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 4);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[1], 5);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 6);
-                tmp.val[1] = vsetq_lane_f16(delta_bilinear_c1(reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[1], 7);
-
-                vst2q_f16(reinterpret_cast<__fp16 *>(out.ptr()), tmp);
-            },
-            in, offsets, dx, dy, out);
-            break;
-        }
+        using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        using ConstType = T;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::F32:
+        const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            execute_window_loop(window, [&](const Coordinates & id)
-            {
-                const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
-                const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
-                const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
+            const int32_t index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+            const auto    index_w       = *(reinterpret_cast<const int32_t *>(offsets.ptr()));
+            const auto    dx_val        = *(reinterpret_cast<const float *>(dx.ptr()));
+            const auto    dy_val        = *(reinterpret_cast<const float *>(dy.ptr()));
+            const auto    pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
 
-                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-                const int offset_row = in_yi * in_stide_in_bytes;
+            const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_dim_w)) : const_border_value;
+            const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_dim_w)) : const_border_value;
+            const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h
+                              && index_h < in_dim_h - 1) ?
+                             (*(pixel_row_ptr + index_w + index_h * in_dim_w + in_dim_w)) :
+                             const_border_value;
+            const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h
+                              && index_h < in_dim_h - 1) ?
+                             (*(pixel_row_ptr + index_w + 1 + index_h * in_dim_w + in_dim_w)) :
+                             const_border_value;
 
-                float32x4x4_t tmp =
-                {
-                    {
-                        vdupq_n_f32(0),
-                        vdupq_n_f32(0),
-                        vdupq_n_f32(0),
-                        vdupq_n_f32(0)
-                    }
-                };
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, offsets, dx, dy, out);
+    }
+    else if(_border_mode == BorderMode::REPLICATE)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int  index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+            const auto index_w       = *(reinterpret_cast<const int32_t *>(offsets.ptr()));
+            const auto dx_val        = *(reinterpret_cast<const float *>(dx.ptr()));
+            const auto dy_val        = *(reinterpret_cast<const float *>(dy.ptr()));
+            const auto pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
 
-                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), in_stride, dx_ptr[0], dy_ptr[0]), tmp.val[0], 0);
-                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[4] + offset_row), in_stride, dx_ptr[4], dy_ptr[4]), tmp.val[0], 1);
-                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[8] + offset_row), in_stride, dx_ptr[8], dy_ptr[8]), tmp.val[0], 2);
-                tmp.val[0] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[12] + offset_row), in_stride, dx_ptr[12], dy_ptr[12]), tmp.val[0], 3);
+            auto clamped_x  = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+            auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+            auto clamped_y  = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+            auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
 
-                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[1] + offset_row), in_stride, dx_ptr[1], dy_ptr[1]), tmp.val[1], 0);
-                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[5] + offset_row), in_stride, dx_ptr[5], dy_ptr[5]), tmp.val[1], 1);
-                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[9] + offset_row), in_stride, dx_ptr[9], dy_ptr[9]), tmp.val[1], 2);
-                tmp.val[1] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[13] + offset_row), in_stride, dx_ptr[13], dy_ptr[13]), tmp.val[1], 3);
+            const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_dim_w);
+            const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_dim_w);
+            const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_dim_w);
+            const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_dim_w);
 
-                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[2] + offset_row), in_stride, dx_ptr[2], dy_ptr[2]), tmp.val[2], 0);
-                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[6] + offset_row), in_stride, dx_ptr[6], dy_ptr[6]), tmp.val[2], 1);
-                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[10] + offset_row), in_stride, dx_ptr[10], dy_ptr[10]), tmp.val[2], 2);
-                tmp.val[2] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[14] + offset_row), in_stride, dx_ptr[14], dy_ptr[14]), tmp.val[2], 3);
-
-                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[3] + offset_row), in_stride, dx_ptr[3], dy_ptr[3]), tmp.val[3], 0);
-                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[7] + offset_row), in_stride, dx_ptr[7], dy_ptr[7]), tmp.val[3], 1);
-                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[11] + offset_row), in_stride, dx_ptr[11], dy_ptr[11]), tmp.val[3], 2);
-                tmp.val[3] = vsetq_lane_f32(delta_bilinear_c1(reinterpret_cast<const float *>(in.ptr() + offsets_ptr[15] + offset_row), in_stride, dx_ptr[15], dy_ptr[15]), tmp.val[3], 3);
-
-                vst4q_f32(reinterpret_cast<float *>(out.ptr()), tmp);
-            },
-            in, offsets, dx, dy, out);
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, offsets, dx, dy, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
     }
 }
 
-void NEScaleKernel::scale_area_nchw(const Window &window)
+void NEScaleKernel::scale_area_nchw_u8(const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8);
 
@@ -959,8 +348,8 @@
     Iterator in(_input, win_in);
     Iterator out(_output, window);
 
-    const auto   wr        = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(0), _output->info()->dimension(0), _align_corners);
-    const auto   hr        = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
+    const auto   wr        = scale_utils::calculate_resize_ratio(_input->info()->dimension(0), _output->info()->dimension(0), _align_corners);
+    const auto   hr        = scale_utils::calculate_resize_ratio(_input->info()->dimension(1), _output->info()->dimension(1), _align_corners);
     const auto   w         = _input->info()->dimension(0);
     const auto   h         = _input->info()->dimension(1);
     const size_t in_stride = _input->info()->strides_in_bytes()[1];
@@ -994,123 +383,234 @@
     in, out);
 }
 
-void NEScaleKernel::scale_nhwc(const Window &window)
+template <typename T>
+void NEScaleKernel::scale_nearest_nhwc(const Window &window)
 {
-    // Get data layout and width/height indices
-    const DataLayout data_layout  = DataLayout::NHWC;
-    const int        idx_channels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    const int        idx_width    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-
-    const size_t input_stride_w = _input->info()->strides_in_bytes()[idx_width];
-    const size_t input_stride_h = _input->info()->strides_in_bytes()[idx_height];
-    const size_t input_stride_c = _input->info()->strides_in_bytes()[idx_channels];
+    const size_t in_dim_w  = _input->info()->dimension(1);
+    const size_t in_dim_h  = _input->info()->dimension(2);
+    const size_t in_dim_c  = _input->info()->dimension(0);
+    const size_t in_dim_wc = in_dim_w * in_dim_c;
 
     // Compute the ratio between source height and destination height
-    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(_input->info()->dimension(idx_height), _output->info()->dimension(idx_height), _align_corners);
+    const auto hr             = scale_utils::calculate_resize_ratio(in_dim_h, _output->info()->dimension(2), _align_corners);
+    const auto window_start_x = static_cast<int32_t>(window.x().start());
+    const auto window_end_x   = static_cast<int32_t>(window.x().end());
+    const int  window_step_x  = 16 / sizeof(T);
 
-    // Don't increment in width/height/channels for the input tensor
+    Window win(window);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    // Don't increment in X, Y and Z direction for the input tensor
     // A pointer to the start of this plane is needed as base for the precomputed offsets
     Window win_in(window);
     win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
     win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    Iterator in(_input, win_in);
+    Iterator out(_output, win);
 
-    switch(_input->info()->data_type())
+    execute_window_loop(win, [&](const Coordinates & id)
     {
-        case DataType::QASYMM8_SIGNED:
+        const int32_t offset     = *reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_dim_c;
+        const auto    in_hi      = static_cast<int>(_align_corners ? utils::rounding::round_half_away_from_zero((id.z() + _sampling_offset) * hr) : std::floor((id.z() + _sampling_offset) * hr));
+        const int     offset_row = in_hi * in_dim_wc;
+        int32_t       x          = window_start_x;
+        for(; x <= window_end_x - window_step_x; x += window_step_x)
         {
-            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
-            {
-                scale_nearest_nhwc_core<int8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
-            }
-            else
-            {
-                scale_bilinear_nhwc_core<int8_t, int8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                         window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
-            }
-            break;
+            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x,
+                            wrapper::vloadq(reinterpret_cast<const T *>(in.ptr()) + offset + offset_row + x));
         }
-        case DataType::QASYMM8:
-        case DataType::U8:
+        for(; x < window_end_x; ++x)
         {
-            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
-            {
-                scale_nearest_nhwc_core<uint8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
-            }
-            else
-            {
-                scale_bilinear_nhwc_core<uint8_t, uint8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                           window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
-            }
-            break;
+            *(reinterpret_cast<T *>(out.ptr()) + x) = *(reinterpret_cast<const T *>(in.ptr()) + offset + offset_row + x);
         }
-        case DataType::S16:
-        {
-            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
-            {
-                scale_nearest_nhwc_core<int16_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
-            }
-            else
-            {
-                scale_bilinear_nhwc_core<int16_t, int16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                           window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
-            }
-            break;
-        }
+    },
+    in, out);
+}
+
+template <typename T>
+void NEScaleKernel::scale_bilinear_nhwc(const Window &window)
+{
+    // Compute the ratio between source height and destination height
+    const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(2), _output->info()->dimension(2), _align_corners);
+
+    Iterator  out(_output, window);
+    const int in_dim_c = _input->info()->dimension(0);
+    const int in_dim_w = _input->info()->dimension(1);
+    const int in_dim_h = _input->info()->dimension(2);
+    const int input_wc = in_dim_c * in_dim_w;
+
+    // Don't increment in Y and Z direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    Iterator in(_input, win_in);
+
+    if(_border_mode == BorderMode::CONSTANT)
+    {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-        {
-            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
-            {
-                scale_nearest_nhwc_core<float16_t>(_input, _offsets, _output, hr,
-                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
-            }
-            else
-            {
-                scale_bilinear_nhwc_core<float16_t, half>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                          window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
-            }
-            break;
-        }
+        using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        using ConstType = T;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-        case DataType::F32:
+        const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
-            {
-                scale_nearest_nhwc_core<float>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset, _align_corners);
-            }
-            else
-            {
-                scale_bilinear_nhwc_core<float, float>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
-                                                       window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode, _constant_border_value, _use_padding);
-            }
-            break;
-        }
-        default:
-            ARM_COMPUTE_ERROR("Not supported");
-            break;
+            const auto    offset = *reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto    dx_val = *reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto    dy_val = *reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id.y(), id.z())));
+            const int32_t in_hi  = std::floor((id.z() + _sampling_offset) * hr - _sampling_offset);
+            const T      *in_ptr = reinterpret_cast<const T *>(in.ptr()) + offset * in_dim_c + in_hi * input_wc;
+
+            const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+            const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_dim_c) : const_border_value;
+            const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + input_wc) : const_border_value;
+            const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_dim_c + input_wc) : const_border_value;
+
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, out);
+    }
+    else if(_border_mode == BorderMode::REPLICATE)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const auto offset = *reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto dx_val = *reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto dy_val = *reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id.y(), id.z())));
+            const int  in_hi  = std::floor((id.z() + _sampling_offset) * hr - _sampling_offset);
+
+            auto clamped_w  = utility::clamp<int>(offset, 0, in_dim_w - 1);
+            auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+            auto clamped_h  = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+            auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+            const auto a00 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_dim_c + clamped_h * input_wc);
+            const auto a01 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_dim_c + clamped_h * input_wc);
+            const auto a10 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w * in_dim_c + clamped_h1 * input_wc);
+            const auto a11 = *(reinterpret_cast<const T *>(in.ptr()) + clamped_w1 * in_dim_c + clamped_h1 * input_wc);
+
+            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(compute_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+
+template <typename T>
+void NEScaleKernel::scale_bilinear_qasymm(const Window &window)
+{
+    // Get data layout and width/height indices
+    const DataLayout data_layout = _input->info()->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+
+    // Compute the ratio between source height and destination height
+    const auto hr = scale_utils::calculate_resize_ratio(_input->info()->dimension(idx_height), _output->info()->dimension(idx_height), _align_corners);
+    Window     win_off;
+    win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
+
+    // Don't increment in the width and height dimensions for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(idx_width, Window::Dimension(0, 0, 0));
+    win_in.set(idx_height, Window::Dimension(0, 0, 0));
+
+    for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d)
+    {
+        win_off.set(d, Window::Dimension(0, 0, 0));
+    }
+
+    Iterator in(_input, win_in);
+    Iterator out(_output, window);
+
+    const int32_t in_dim_w = _input->info()->dimension(idx_width);
+    const int32_t in_dim_h = _input->info()->dimension(idx_height);
+    const int32_t stride_w = _input->info()->strides_in_bytes()[idx_width];
+    const int32_t stride_h = _input->info()->strides_in_bytes()[idx_height];
+
+    const UniformQuantizationInfo iq_info = _input->info()->quantization_info().uniform();
+    const UniformQuantizationInfo oq_info = _output->info()->quantization_info().uniform();
+
+    if(_border_mode == BorderMode::CONSTANT)
+    {
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
+#else  /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        using ConstType = T;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int32_t index_h       = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
+            const int32_t index_w       = *(reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    dx_val        = *(reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    dy_val        = *(reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
+
+            const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
+                             (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
+                             const_border_value;
+            const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
+                             (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
+                             const_border_value;
+            const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
+                             (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
+                             const_border_value;
+            const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
+                             (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
+                             const_border_value;
+
+            const float inp00                 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+            const float inp01                 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+            const float inp10                 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+            const float inp11                 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+            *reinterpret_cast<T *>(out.ptr()) = Qasymm8QuantizationHelper<T>::quantize(compute_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+        },
+        in, out);
+    }
+    else if(_border_mode == BorderMode::REPLICATE)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const int     index_h       = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
+            const int32_t index_w       = *(reinterpret_cast<const int32_t *>(_offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    dx_val        = *(reinterpret_cast<const float *>(_dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    dy_val        = *(reinterpret_cast<const float *>(_dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+            const auto    pixel_row_ptr = reinterpret_cast<const T *>(in.ptr());
+
+            auto clamped_w  = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+            auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+            auto clamped_h  = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+            auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
+
+            const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
+            const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
+            const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
+            const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
+
+            const float inp00                 = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+            const float inp01                 = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+            const float inp10                 = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+            const float inp11                 = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+            *reinterpret_cast<T *>(out.ptr()) = Qasymm8QuantizationHelper<T>::quantize(compute_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+        },
+        in, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
     }
 }
 
 Status NEScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
                                const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
 {
-    BorderSize border_size(1);
-    if(input->data_layout() == DataLayout::NHWC)
-    {
-        border_size = (info.border_mode == BorderMode::CONSTANT && info.interpolation_policy == InterpolationPolicy::BILINEAR) ? BorderSize(1, 0, 0, 0) : BorderSize(0);
-    }
-
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
-                                                              dx != nullptr ? dx->clone().get() : nullptr,
-                                                              dy != nullptr ? dy->clone().get() : nullptr,
-                                                              offsets != nullptr ? offsets->clone().get() : nullptr,
-                                                              output->clone().get(),
-                                                              info, border_size)
-                                .first);
-
     return Status{};
 }
 
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index ae883bc..e4e20e0 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -168,7 +168,7 @@
             _gaus5x5[i].configure(_pyramid->get_pyramid_level(i), _tmp.get_pyramid_level(i), border_mode, constant_border_value);
 
             /* Configure scale */
-            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED });
+            _scale_nearest[i].configure(_tmp.get_pyramid_level(i), _pyramid->get_pyramid_level(i + 1), ScaleKernelInfo{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED, PixelValue(), SamplingPolicy::CENTER, false });
         }
 
         _tmp.allocate();
diff --git a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
index 24755fc..aa5f8a2 100644
--- a/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
+++ b/src/runtime/NEON/functions/NELaplacianReconstruct.cpp
@@ -73,7 +73,7 @@
     // Scale levels n-1 to 1, and add levels n-2 to 0
     for(size_t l = 0; l < last_level; ++l)
     {
-        _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value });
+        _scalef[l].configure(_tmp_pyr.get_pyramid_level(l + 1), _tmp_pyr.get_pyramid_level(l), ScaleKernelInfo{ arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, border_mode, constant_border_value, SamplingPolicy::CENTER, false });
         _addf[l].configure(_tmp_pyr.get_pyramid_level(l), pyramid->get_pyramid_level(l), _tmp_pyr.get_pyramid_level(l), ConvertPolicy::SATURATE);
     }
 
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 424049f..2278f07 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -44,7 +44,7 @@
 {
 namespace
 {
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, size_t input_element_size, SamplingPolicy sampling_policy, bool align_corners)
+void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
     ARM_COMPUTE_UNUSED(sampling_policy);
@@ -72,7 +72,7 @@
             const int   in_xi = std::floor(in_x);
             const int   in_yi = std::floor(in_y);
 
-            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * static_cast<int>(input_element_size);
+            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
             *reinterpret_cast<float *>(dx_it.ptr())        = in_x - in_xi;
             *reinterpret_cast<float *>(dy_it.ptr())        = in_y - in_yi;
         },
@@ -85,23 +85,17 @@
 
         execute_window_loop(win, [&](const Coordinates & id)
         {
-            const float float_in_xi = (id.x() + sampling_offset) * wr;
-            const auto  in_xi       = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
-
-            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
+            const float float_in_xi                        = (id.x() + sampling_offset) * wr;
+            const auto  in_xi                              = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
+            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
         },
         offsets_it);
     }
 }
 } // namespace
 
-NEScale::NEScale() // NOLINT
-    : _offsets(),
-      _dx(),
-      _dy(),
-      _scale_kernel(),
-      _border_handler(),
-      _use_padding(true)
+NEScale::NEScale()
+    : _offsets(), _dx(), _dy()
 {
 }
 
@@ -110,7 +104,6 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(NEScale::validate(input->info(), output->info(), info));
 
-    _use_padding                     = info.use_padding;
     const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
 
     // Get data layout and width/height indices
@@ -119,18 +112,17 @@
     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Get the tensor shape
-    const TensorShape shape(output->info()->dimension(idx_width), output->info()->dimension(idx_height));
+    TensorShape shape(output->info()->dimension(idx_width));
+    shape.set(1, output->info()->dimension(idx_height), false);
 
     // Compute the ratio between source width/height and destination width/height
     const auto wr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
     const auto hr = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
 
-    // Get the element size of the input image
-    const size_t input_element_size = input->info()->element_size();
-
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
     const auto policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
 
+    auto scale_kernel = arm_compute::support::cpp14::make_unique<NEScaleKernel>();
     switch(policy_to_use)
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
@@ -138,13 +130,13 @@
             TensorInfo tensor_info_offsets(shape, Format::S32);
             _offsets.allocator()->init(tensor_info_offsets);
 
-            _scale_kernel.configure(input, nullptr, nullptr, &_offsets, output, info);
+            scale_kernel->configure(input, nullptr, nullptr, &_offsets, output, info);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
 
             // Pre-compute offsets for nearest interpolation
-            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, input_element_size, info.sampling_policy, is_align_corners_used);
+            precompute_dx_dy_offsets(nullptr, nullptr, &_offsets, wr, hr, info.sampling_policy, is_align_corners_used);
             break;
         }
         case InterpolationPolicy::BILINEAR:
@@ -156,7 +148,7 @@
             _dx.allocator()->init(tensor_info_dxdy);
             _dy.allocator()->init(tensor_info_dxdy);
 
-            _scale_kernel.configure(input, &_dx, &_dy, &_offsets, output, info);
+            scale_kernel->configure(input, &_dx, &_dy, &_offsets, output, info);
 
             // Allocate once the configure methods have been called
             _offsets.allocator()->allocate();
@@ -164,27 +156,18 @@
             _dy.allocator()->allocate();
 
             // Pre-compute dx, dy and offsets for bilinear interpolation
-            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, input_element_size, info.sampling_policy, is_align_corners_used);
+            precompute_dx_dy_offsets(&_dx, &_dy, &_offsets, wr, hr, info.sampling_policy, is_align_corners_used);
             break;
         }
         case InterpolationPolicy::AREA:
         {
-            _scale_kernel.configure(input, nullptr, nullptr, nullptr, output, info);
+            scale_kernel->configure(input, nullptr, nullptr, nullptr, output, info);
             break;
         }
         default:
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
     }
-    if(info.use_padding)
-    {
-        _border_handler.configure(input, _scale_kernel.border_size(), info.border_mode, info.constant_border_value);
-    }
-}
-
-void NEScale::configure(ITensor *input, ITensor *output, InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding,
-                        bool align_corners)
-{
-    configure(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners });
+    _kernel = std::move(scale_kernel);
 }
 
 Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, const ScaleKernelInfo &info)
@@ -225,20 +208,4 @@
     ARM_COMPUTE_RETURN_ON_ERROR(NEScaleKernel::validate(input->clone().get(), dx, dy, offsets, output->clone().get(), info));
     return Status{};
 }
-
-Status NEScale::validate(const ITensorInfo *input, const ITensorInfo *output, InterpolationPolicy policy,
-                         BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy, bool use_padding, bool align_corners)
-{
-    ARM_COMPUTE_RETURN_ON_ERROR(NEScale::validate(input, output, ScaleKernelInfo{ policy, border_mode, constant_border_value, sampling_policy, use_padding, align_corners }));
-    return Status{};
-}
-
-void NEScale::run()
-{
-    if(_use_padding)
-    {
-        NEScheduler::get().schedule(&_border_handler, Window::DimZ);
-    }
-    NEScheduler::get().schedule(&_scale_kernel, Window::DimY);
-}
 } // namespace arm_compute
diff --git a/tests/datasets/BorderModeDataset.h b/tests/datasets/BorderModeDataset.h
index 84a7a4c..bb90ad2 100644
--- a/tests/datasets/BorderModeDataset.h
+++ b/tests/datasets/BorderModeDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,8 +27,6 @@
 #include "arm_compute/core/Types.h"
 #include "tests/framework/datasets/ContainerDataset.h"
 
-#include "utils/TypePrinter.h"
-
 namespace arm_compute
 {
 namespace test
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index 1b59faa..9a1e9b0 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -117,9 +117,8 @@
 constexpr auto default_data_type            = DataType::U8;
 constexpr auto default_data_layout          = DataLayout::NHWC;
 constexpr auto default_interpolation_policy = InterpolationPolicy::NEAREST_NEIGHBOR;
-constexpr auto default_border_mode          = BorderMode::UNDEFINED;
+constexpr auto default_border_mode          = BorderMode::CONSTANT;
 constexpr auto default_sampling_policy      = SamplingPolicy::CENTER;
-constexpr bool default_use_padding          = false;
 
 TEST_CASE(NullPtr, framework::DatasetMode::ALL)
 {
@@ -128,11 +127,11 @@
     Status     result{};
 
     // nullptr is given as input
-    result = NEScale::validate(nullptr, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    result = NEScale::validate(nullptr, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 
     // nullptr is given as output
-    result = NEScale::validate(&input, nullptr, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    result = NEScale::validate(&input, nullptr, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
@@ -170,7 +169,7 @@
         const auto input  = TensorInfo{ input_shape, 1, kv.first, default_data_layout };
         const auto output = TensorInfo{ output_shape, 1, kv.first, default_data_layout };
 
-        result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+        result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
         ARM_COMPUTE_EXPECT(bool(result) == kv.second, framework::LogLevel::ERRORS);
     }
 }
@@ -183,7 +182,7 @@
     const auto output = TensorInfo{ output_shape, 1, non_default_data_type, default_data_layout };
     Status     result{};
 
-    result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode });
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
@@ -193,9 +192,9 @@
     const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout };
     Status     result{};
 
-    // When use padding is false, border mode should be constant
-    constexpr auto border_mode = BorderMode::UNDEFINED;
-    constexpr bool use_padding = false;
+    // Padding is not supported anymore
+    constexpr auto border_mode = BorderMode::CONSTANT;
+    constexpr bool use_padding = true;
 
     result = NEScale::validate(&input, &output, ScaleKernelInfo{ default_interpolation_policy, border_mode, PixelValue(), default_sampling_policy, use_padding });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
@@ -211,7 +210,7 @@
     const auto output = TensorInfo{ output_shape, 1, default_data_type, data_layout };
     Status     result{};
 
-    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode });
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
@@ -226,7 +225,7 @@
     const auto output = TensorInfo{ output_shape, 1, data_type, data_layout };
     Status     result{};
 
-    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode });
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), SamplingPolicy::CENTER, false });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 
@@ -241,11 +240,80 @@
     const auto output = TensorInfo{ output_shape, 1, default_data_type, default_data_layout };
     Status     result{};
 
-    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), sampling_policy, default_use_padding, align_corners });
+    result = NEScale::validate(&input, &output, ScaleKernelInfo{ interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false, align_corners });
     ARM_COMPUTE_EXPECT(bool(result) == false, framework::LogLevel::ERRORS);
 }
 TEST_SUITE_END() // Validate
 
+DATA_TEST_CASE(CheckNoPadding, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::Medium4DShapes(),
+                                                                                            framework::dataset::make("DataType", { DataType::F32, DataType::QASYMM8 })),
+                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::BILINEAR, InterpolationPolicy::NEAREST_NEIGHBOR })),
+                                                                            framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER, SamplingPolicy::TOP_LEFT })),
+                                                                    framework::dataset::make("DataLayout", { DataLayout::NHWC, DataLayout::NCHW })),
+               shape, data_type, interpolation_policy, sampling_policy, data_layout)
+{
+    constexpr auto  default_border_mode = BorderMode::CONSTANT;
+    ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false);
+
+    // Create tensors
+    Tensor src = create_tensor<Tensor>(shape, data_type);
+    src.info()->set_data_layout(data_layout);
+
+    const float scale_x = 0.5f;
+    const float scale_y = 0.5f;
+    TensorShape shape_scaled(shape);
+    const int   idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int   idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    shape_scaled.set(idx_width, shape[idx_width] * scale_x, /* apply_dim_correction = */ false);
+    shape_scaled.set(idx_height, shape[idx_height] * scale_y, /* apply_dim_correction = */ false);
+    Tensor dst = create_tensor<Tensor>(shape_scaled, data_type);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    NEScale scale;
+    scale.configure(&src, &dst, info);
+
+    validate(src.info()->padding(), PaddingSize(0, 0, 0, 0));
+    validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0));
+}
+
+DATA_TEST_CASE(CheckNoPaddingInterpAREA, framework::DatasetMode::ALL, combine(combine(combine(combine(datasets::Medium4DShapes(),
+                                                                                                      framework::dataset::make("DataType", { DataType::U8 })),
+                                                                                              framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::AREA })),
+                                                                                      framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER, SamplingPolicy::TOP_LEFT })),
+                                                                              framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+               shape, data_type, interpolation_policy, sampling_policy, data_layout)
+{
+    constexpr auto  default_border_mode = BorderMode::CONSTANT;
+    ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false);
+
+    // Create tensors
+    Tensor src = create_tensor<Tensor>(shape, data_type);
+    src.info()->set_data_layout(data_layout);
+
+    const float scale_x = 0.5f;
+    const float scale_y = 0.5f;
+    TensorShape shape_scaled(shape);
+    const int   idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int   idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    shape_scaled.set(idx_width, shape[idx_width] * scale_x, /* apply_dim_correction = */ false);
+    shape_scaled.set(idx_height, shape[idx_height] * scale_y, /* apply_dim_correction = */ false);
+
+    Tensor dst = create_tensor<Tensor>(shape_scaled, data_type);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    NEScale scale;
+    scale.configure(&src, &dst, info);
+
+    validate(src.info()->padding(), PaddingSize(0, 0, 0, 0));
+    validate(dst.info()->padding(), PaddingSize(0, 0, 0, 0));
+}
+
 template <typename T>
 using NEScaleFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T>;
 template <typename T>
diff --git a/tests/validation/fixtures/ScaleFixture.h b/tests/validation/fixtures/ScaleFixture.h
index e2ed3ab..1e66306 100644
--- a/tests/validation/fixtures/ScaleFixture.h
+++ b/tests/validation/fixtures/ScaleFixture.h
@@ -137,7 +137,7 @@
         // Create and configure function
         FunctionType scale;
 
-        scale.configure(&src, &dst, ScaleKernelInfo{ _policy, _border_mode, _constant_border_value, _sampling_policy, /* use_padding */ true, _align_corners });
+        scale.configure(&src, &dst, ScaleKernelInfo{ _policy, _border_mode, _constant_border_value, _sampling_policy, /* use_padding */ false, _align_corners });
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);