COMPMID-1814 : NEScale add support for TOP_LEFT and QASYMM8

Added support for TOP_LEFT sampling policy and QASYMM8 data type.

Change-Id: Id9135bb4b6ebd93f1d6fb70b06e83684a167eb94
Reviewed-on: https://review.mlplatform.org/533
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index 8f4220f..91d85be 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -144,6 +144,40 @@
     return static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
 }
 
+/** Computes bilinear interpolation for a quantized, single channel input and returns a quantized output value.
+ * The four neighbouring pixels are dequantized with iq_info, blended in floating point and re-quantized with oq_info.
+ *
+ * @param[in] pixel_ptr Pointer to the top-left pixel value of a single channel input. Checked with ARM_COMPUTE_ERROR_ON.
+ * @param[in] stride    Offset, in uint8_t elements, added to pixel_ptr to reach the bottom-left pixel (bottom-right is one element further)
+ * @param[in] dx        Horizontal interpolation weight: 0 selects the left pixels, 1 selects the right pixels
+ * @param[in] dy        Vertical interpolation weight: 0 selects the top pixels, 1 selects the bottom pixels
+ * @param[in] iq_info   Input QuantizationInfo, used to dequantize the four neighbouring pixels
+ * @param[in] oq_info   Output QuantizationInfo, used to quantize the interpolated value
+ *
+ * @note dx and dy must be in the range [0, 1.0]
+ *
+ * @return The bilinear interpolated pixel value, quantized with oq_info using RoundingPolicy::TO_NEAREST_UP
+ */
+inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy, QuantizationInfo iq_info, QuantizationInfo oq_info)
+{
+    ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
+
+    const float dx1 = 1.0f - dx; // weight of the left pixels
+    const float dy1 = 1.0f - dy; // weight of the top pixels
+
+    const float a00 = iq_info.dequantize(*pixel_ptr);                // top-left
+    const float a01 = iq_info.dequantize(*(pixel_ptr + 1));          // top-right
+    const float a10 = iq_info.dequantize(*(pixel_ptr + stride));     // bottom-left
+    const float a11 = iq_info.dequantize(*(pixel_ptr + stride + 1)); // bottom-right
+
+    const float w1  = dx1 * dy1; // applied to a00
+    const float w2  = dx * dy1;  // applied to a01
+    const float w3  = dx1 * dy;  // applied to a10
+    const float w4  = dx * dy;   // applied to a11
+    float       res = a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4; // weighted sum of the dequantized values
+    return static_cast<uint8_t>(oq_info.quantize(res, RoundingPolicy::TO_NEAREST_UP));
+}
+
 /** Computes linear interpolation using the pointer to the top pixel and the pixel's distance between
  * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
  *
diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h
index c851b3d..83d9964 100644
--- a/arm_compute/core/NEON/kernels/NEScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -110,6 +110,7 @@
     InterpolationPolicy _policy;
     BorderSize          _border_size;
     BorderMode          _border_mode;
+    float               _sampling_offset;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NESCALEKERNEL_H__ */
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 5fef4f9..3d300ef 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -48,11 +48,11 @@
                           BorderMode border_mode, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(output == input);
-    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
     ARM_COMPUTE_UNUSED(border_mode);
 
     const DataLayout data_layout = input->data_layout();
@@ -74,6 +74,7 @@
     if(policy == InterpolationPolicy::AREA)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
     }
 
     return Status{};
@@ -184,7 +185,7 @@
 
 template <typename T>
 inline void scale_bilinear_nhwc_core(const ITensor *input, const ITensor *offsets, const ITensor *dx, const ITensor *dy, ITensor *output,
-                                     float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, BorderMode border_mode)
+                                     float hr, float sampling_offset, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, BorderMode border_mode)
 {
     Iterator in(input, win_in);
     Iterator out(output, window);
@@ -204,12 +205,16 @@
 
     int border_size = (border_mode == BorderMode::UNDEFINED) ? 0 : 1;
 
+    const bool             is_quantized = (input->info()->data_type() == DataType::QASYMM8);
+    const QuantizationInfo iq_info      = input->info()->quantization_info();
+    const QuantizationInfo oq_info      = output->info()->quantization_info();
+
     execute_window_loop(window, [&](const Coordinates & id)
     {
         const auto offset     = (*reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())))) / static_cast<int>(sizeof(T));
         const auto dx_scale   = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
         const auto dy_scale   = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
-        const int  in_yi      = std::floor((id.z() + 0.5f) * hr - 0.5f);
+        const int  in_yi      = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
         const int  offset_row = in_yi * stride_h + id.x() * stride_c;
         const T   *in_ptr     = reinterpret_cast<T *>(in.ptr() + offset * stride_w + offset_row);
 
@@ -253,8 +258,22 @@
             const float w3 = dx1 * dy_scale;
             const float w4 = dx_scale * dy_scale;
 
+            T res = 0;
+            //dequantize quantized input
+            if(is_quantized)
+            {
+                float inp00 = iq_info.dequantize(a00);
+                float inp01 = iq_info.dequantize(a01);
+                float inp10 = iq_info.dequantize(a10);
+                float inp11 = iq_info.dequantize(a11);
+                res         = static_cast<T>(oq_info.quantize((inp00 * w1 + inp01 * w2 + inp10 * w3 + inp11 * w4), RoundingPolicy::TO_NEAREST_UP));
+            }
+            else
+            {
+                res = static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+            }
             // Store result
-            *reinterpret_cast<T *>(out.ptr()) = static_cast<T>(a00 * w1 + a01 * w2 + a10 * w3 + a11 * w4);
+            *reinterpret_cast<T *>(out.ptr()) = res;
         }
         else
         {
@@ -275,7 +294,7 @@
 } // namespace
 
 NEScaleKernel::NEScaleKernel()
-    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode()
+    : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr), _policy(), _border_size(1), _border_mode(), _sampling_offset(0)
 {
 }
 
@@ -311,6 +330,11 @@
     _border_size = BorderSize(1);
     _border_mode = border_mode;
 
+    if(sampling_policy == SamplingPolicy::CENTER)
+    {
+        _sampling_offset = 0.5f;
+    }
+
     // Compute the ratio between source width/height and destination width/height
     const auto wr = static_cast<float>(input->info()->dimension(idx_width)) / static_cast<float>(output->info()->dimension(idx_width));
     const auto hr = static_cast<float>(input->info()->dimension(idx_height)) / static_cast<float>(output->info()->dimension(idx_height));
@@ -389,6 +413,7 @@
 
     switch(_input->info()->data_type())
     {
+        case DataType::QASYMM8:
         case DataType::U8:
         {
             uint8x16_t tmp = vdupq_n_u8(0);
@@ -559,7 +584,7 @@
 
 void NEScaleKernel::scale_bilinear_nchw(const Window &window)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8, DataType::QASYMM8, DataType::S16, DataType::F16, DataType::F32);
 
     // Compute the ratio between source height and destination height
     const auto hr = static_cast<float>(_input->info()->dimension(1)) / static_cast<float>(_output->info()->dimension(1));
@@ -589,8 +614,13 @@
     const size_t in_stide_in_bytes = _input->info()->strides_in_bytes()[1];
     const size_t in_stride         = in_stide_in_bytes / _input->info()->element_size();
 
+    const bool             is_quantized = (_input->info()->data_type() == DataType::QASYMM8);
+    const QuantizationInfo iq_info      = _input->info()->quantization_info();
+    const QuantizationInfo oq_info      = _output->info()->quantization_info();
+
     switch(_input->info()->data_type())
     {
+        case DataType::QASYMM8:
         case DataType::U8:
         {
             execute_window_loop(window, [&](const Coordinates & id)
@@ -600,29 +630,55 @@
                 const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
                 const auto in_ptr      = reinterpret_cast<const uint8_t *>(in.ptr());
 
-                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
                 const int offset_row = in_yi * in_stide_in_bytes;
 
                 uint8x8_t tmp0 = vdup_n_u8(0);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
-                tmp0           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
-
+                if(is_quantized)
+                {
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0], iq_info, oq_info), tmp0, 0);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1], iq_info, oq_info), tmp0, 1);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2], iq_info, oq_info), tmp0, 2);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3], iq_info, oq_info), tmp0, 3);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4], iq_info, oq_info), tmp0, 4);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5], iq_info, oq_info), tmp0, 5);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6], iq_info, oq_info), tmp0, 6);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7], iq_info, oq_info), tmp0, 7);
+                }
+                else
+                {
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6);
+                    tmp0 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7);
+                }
                 uint8x8_t tmp1 = vdup_n_u8(0);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
-                tmp1           = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
-
+                if(is_quantized)
+                {
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8], iq_info, oq_info), tmp1, 0);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9], iq_info, oq_info), tmp1, 1);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10], iq_info, oq_info), tmp1, 2);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11], iq_info, oq_info), tmp1, 3);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12], iq_info, oq_info), tmp1, 4);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13], iq_info, oq_info), tmp1, 5);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14], iq_info, oq_info), tmp1, 6);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1_quantized(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15], iq_info, oq_info), tmp1, 7);
+                }
+                else
+                {
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6);
+                    tmp1 = vset_lane_u8(delta_bilinear_c1(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7);
+                }
                 vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1));
             },
             in, offsets, dx, dy, out);
@@ -636,7 +692,7 @@
                 const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
                 const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
 
-                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
                 const int offset_row = in_yi * in_stide_in_bytes;
 
                 int16x8x2_t tmp =
@@ -679,7 +735,7 @@
                 const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
                 const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
 
-                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
                 const int offset_row = in_yi * in_stide_in_bytes;
 
                 float16x8x2_t tmp =
@@ -722,7 +778,7 @@
                 const auto dx_ptr      = reinterpret_cast<const float *>(dx.ptr());
                 const auto dy_ptr      = reinterpret_cast<const float *>(dy.ptr());
 
-                const int in_yi      = std::floor((id.y() + 0.5f) * hr - 0.5f);
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
                 const int offset_row = in_yi * in_stide_in_bytes;
 
                 float32x4x4_t tmp =
@@ -839,6 +895,7 @@
 
     switch(_input->info()->data_type())
     {
+        case DataType::QASYMM8:
         case DataType::U8:
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
@@ -847,7 +904,7 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr,
+                scale_bilinear_nhwc_core<uint8_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
             }
             break;
@@ -860,7 +917,7 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr,
+                scale_bilinear_nhwc_core<int16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
             }
             break;
@@ -875,7 +932,7 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<float16_t>(_input, _offsets, _dx, _dy, _output, hr,
+                scale_bilinear_nhwc_core<float16_t>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
                                                     window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
             }
             break;
@@ -889,7 +946,7 @@
             }
             else
             {
-                scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr,
+                scale_bilinear_nhwc_core<float>(_input, _offsets, _dx, _dy, _output, hr, _sampling_offset,
                                                 window, win_in, input_stride_w, input_stride_h, input_stride_c, _border_mode);
             }
             break;
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 169b9bb..483aa4c 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -46,6 +46,11 @@
 {
     ARM_COMPUTE_ERROR_ON(nullptr == offsets);
     ARM_COMPUTE_UNUSED(sampling_policy);
+    float sampling_offset = 0.0f;
+    if(sampling_policy == SamplingPolicy::CENTER)
+    {
+        sampling_offset = 0.5f;
+    }
 
     Window win;
     win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
@@ -60,8 +65,8 @@
 
         execute_window_loop(win, [&](const Coordinates & id)
         {
-            const float in_x  = (id.x() + 0.5f) * wr - 0.5f;
-            const float in_y  = (id.y() + 0.5f) * hr - 0.5f;
+            const float in_x  = (id.x() + sampling_offset) * wr - sampling_offset;
+            const float in_y  = (id.y() + sampling_offset) * hr - sampling_offset;
             const int   in_xi = std::floor(in_x);
             const int   in_yi = std::floor(in_y);
 
@@ -174,7 +179,7 @@
                          BorderMode border_mode, PixelValue constant_border_value, SamplingPolicy sampling_policy)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER);
+    ARM_COMPUTE_RETURN_ERROR_ON(sampling_policy != SamplingPolicy::CENTER && sampling_policy != SamplingPolicy::TOP_LEFT);
     ARM_COMPUTE_UNUSED(border_mode, constant_border_value);
 
     ITensorInfo *offsets = nullptr;
diff --git a/tests/validation/NEON/Scale.cpp b/tests/validation/NEON/Scale.cpp
index 127a552..c05b8ac 100644
--- a/tests/validation/NEON/Scale.cpp
+++ b/tests/validation/NEON/Scale.cpp
@@ -81,19 +81,16 @@
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
         framework::dataset::make("InputInfo", { TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::U8),  // Mismatching data type
-                                                TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Unsupported sampling point
                                                 TensorInfo(TensorShape(4U, 27U, 13U), 1, DataType::F32), // Invalid policy
                                                 TensorInfo(TensorShape(27U, 13U, 2U), 1, DataType::F32), // Insufficient padding
                                                 TensorInfo(TensorShape(4U, 27U, 13U), 1, DataType::F32),
                                               }),
         framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(132U, 25U, 2U), 1, DataType::F32),
-                                                TensorInfo(TensorShape(132U, 25U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(4U, 132U, 25U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(132U, 25U, 2U), 1, DataType::F32),
                                                 TensorInfo(TensorShape(4U, 132U, 25U), 1, DataType::F32),
                                               })),
         framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR,
-                                                          InterpolationPolicy::NEAREST_NEIGHBOR,
                                                           InterpolationPolicy::AREA,
                                                           InterpolationPolicy::AREA,
                                                           InterpolationPolicy::NEAREST_NEIGHBOR,
@@ -101,22 +98,19 @@
         framework::dataset::make("BorderMode",  { BorderMode::UNDEFINED,
                                                   BorderMode::UNDEFINED,
                                                   BorderMode::UNDEFINED,
-                                                  BorderMode::UNDEFINED,
                                                   BorderMode::REPLICATE,
                                                 })),
         framework::dataset::make("SamplingPolicy",  { SamplingPolicy::CENTER,
-                                                      SamplingPolicy::TOP_LEFT,
                                                       SamplingPolicy::CENTER,
                                                       SamplingPolicy::CENTER,
                                                       SamplingPolicy::CENTER,
                                                     })),
         framework::dataset::make("DataLayout",  { DataLayout::NCHW,
-                                                  DataLayout::NCHW,
                                                   DataLayout::NHWC,
                                                   DataLayout::NCHW,
                                                   DataLayout::NHWC,
                                                 })),
-        framework::dataset::make("Expected", { false, false, false, false ,true })),
+        framework::dataset::make("Expected", { false, false, false ,true })),
         input_info, output_info, policy,border_mode, sampling_policy, data_layout, expected)
 {
     const PixelValue constant_border(5);
@@ -201,6 +195,8 @@
 
 template <typename T>
 using NEScaleFixture = ScaleValidationFixture<Tensor, Accessor, NEScale, T>;
+template <typename T>
+using NEScaleQuantizedFixture = ScaleValidationQuantizedFixture<Tensor, Accessor, NEScale, T>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
@@ -209,7 +205,7 @@
                                                                                                                      framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                              framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                      datasets::BorderModes()),
-                                                                                             framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
+                                                                                             framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -223,7 +219,7 @@
                                                                                                                  framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                  framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                          datasets::BorderModes()),
-                                                                                                 framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
+                                                                                                 framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -240,7 +236,7 @@
                                                                                                                     framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                             framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                     datasets::BorderModes()),
-                                                                                            framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
+                                                                                            framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -254,7 +250,7 @@
                                                                                                                         framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                 framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                         datasets::BorderModes()),
-                                                                                                framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
+                                                                                                framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo        src_info(_shape, 1, _data_type);
@@ -274,7 +270,7 @@
                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                        datasets::BorderModes()),
-                                                                                               framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
+                                                                                               framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -288,7 +284,7 @@
                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                            datasets::BorderModes()),
-                                                                                                   framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
+                                                                                                   framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -304,7 +300,7 @@
                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                        datasets::BorderModes()),
-                                                                                               framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
+                                                                                               framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -318,7 +314,7 @@
                                                                                                                    framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
                                                                                                                    framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
                                                                                                            datasets::BorderModes()),
-                                                                                                   framework::dataset::make("SamplingPolicy", { SamplingPolicy::CENTER })))
+                                                                                                   framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })))
 {
     //Create valid region
     TensorInfo  src_info(_shape, 1, _data_type);
@@ -330,6 +326,26 @@
 TEST_SUITE_END() // S16
 TEST_SUITE_END() // Integer
 
+TEST_SUITE(Quantized) // Scale validation cases driven by NEScaleQuantizedFixture
+TEST_SUITE(QASYMM8) // DataType::QASYMM8 inputs, exercised with uint8_t elements
+FIXTURE_DATA_TEST_CASE(RunSmall, NEScaleQuantizedFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(combine(combine(combine(datasets::SmallShapes(),
+                                                                                                                        framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                                        framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, -10) })),
+                                                                                                                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                                                                                        framework::dataset::make("InterpolationPolicy", { InterpolationPolicy::NEAREST_NEIGHBOR, InterpolationPolicy::BILINEAR })),
+                                                                                                                datasets::BorderModes()),
+                                                                                                        framework::dataset::make("SamplingPolicy", { SamplingPolicy::TOP_LEFT, SamplingPolicy::CENTER })))
+{
+    // Build the valid region of the scaled output from the source info, the reference shape, both policies, and a flag that is true when the border mode is UNDEFINED
+    TensorInfo  src_info(_shape, 1, _data_type);
+    ValidRegion valid_region = calculate_valid_region_scale(src_info, _reference.shape(), _policy, _sampling_policy, (_border_mode == BorderMode::UNDEFINED));
+
+    // Compare the target output against the reference within valid_region, using the tolerance_u8 tolerance
+    validate(Accessor(_target), _reference, valid_region, tolerance_u8);
+}
+TEST_SUITE_END() // QASYMM8
+TEST_SUITE_END() // Quantized
+
 TEST_SUITE_END() // Scale
 TEST_SUITE_END() // NEON
 } // namespace validation