COMPMID-2447: Align TFlite nearest neighbor NE/CL functions with ACL

Change-Id: Idd7b23247491d6e2e31d19b2a8aa522470ca174c
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1500
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/scale.cl
index 5ac6443..499f9ea 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/scale.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -33,10 +33,19 @@
  */
 inline const float8 transform_nearest(const float2 coord, const float2 scale)
 {
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+    const float4 new_x       = in_x_coords * (float4)(scale.s0);
+    const float4 new_y       = (float4)(coord.s1 * scale.s1);
+    return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
     const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
     const float4 new_x       = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
     const float4 new_y       = (float4)((coord.s1 + 0.5f) * scale.s1);
     return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
 }
 
 /** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
@@ -172,8 +181,15 @@
     Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(in, 0);
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT(out, DEPTH_OUT);
 
-    const float new_x     = (get_global_id(1) + 0.5f) * scale_x;
-    const float new_y     = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
+#ifdef SAMPLING_POLICY_TOP_LEFT
+    const float new_x = get_global_id(1) * scale_x;
+    const float new_y = (get_global_id(2) % DEPTH_OUT) * scale_y;
+#elif SAMPLING_POLICY_CENTER
+    const float new_x = (get_global_id(1) + 0.5f) * scale_x;
+    const float new_y = ((get_global_id(2) % DEPTH_OUT) + 0.5f) * scale_y;
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
     const float clamped_x = clamp(new_x, 0.0f, input_width - 1);
     const float clamped_y = clamp(new_y, 0.0f, input_height - 1);
 
diff --git a/src/core/GLES_COMPUTE/cs_shaders/scale.cs b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
index b72c339..8a1d3e4 100644
--- a/src/core/GLES_COMPUTE/cs_shaders/scale.cs
+++ b/src/core/GLES_COMPUTE/cs_shaders/scale.cs
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,8 +58,15 @@
     vec4 in_x_coords = vec4(coord.x, 1.f + coord.x, 2.f + coord.x, 3.f + coord.x);
 
     vec4[2] t;
+#if defined(SAMPLING_POLICY_CENTER) /* SAMPLING_POLICY_CENTER */
     t[0] = (in_x_coords + (vec4(0.5f))) * scale.x;
     t[1] = vec4((coord.y + 0.5f) * scale.y);
+#elif defined(SAMPLING_POLICY_TOP_LEFT) /* SAMPLING_POLICY_TOP_LEFT */
+    t[0] = in_x_coords * scale.x;
+    t[1] = vec4(coord.y) * scale.y;
+#else                                   /* Unsupported sampling policy */
+#error Unsupported sampling policy
+#endif
 
     return t;
 }
diff --git a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
index f87615a..1de0852 100644
--- a/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCScaleKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -80,6 +80,14 @@
 
     build_opts.emplace("#define DATA_TYPE_FP16");
     build_opts.emplace("#define BORDER_SIZE " + support::cpp11::to_string(border.right));
+    if(sampling_policy == SamplingPolicy::TOP_LEFT)
+    {
+        build_opts.emplace("#define SAMPLING_POLICY_TOP_LEFT");
+    }
+    else
+    {
+        build_opts.emplace("#define SAMPLING_POLICY_CENTER");
+    }
 
     // Configure kernel window
     unsigned int num_elems_processed_per_iteration = 4;
diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp
index 4a51627..8e9a346 100644
--- a/src/core/NEON/kernels/NEScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEScaleKernel.cpp
@@ -169,7 +169,7 @@
 
 template <typename T>
 inline void scale_nearest_nhwc_core(const ITensor *input, const ITensor *offsets, ITensor *output,
-                                    float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c)
+                                    float hr, Window window, const Window &win_in, size_t stride_w, size_t stride_h, size_t stride_c, float sampling_offset)
 {
     const int  window_step_x  = 16 / sizeof(T);
     const auto window_start_x = static_cast<int32_t>(window.x().start());
@@ -185,7 +185,7 @@
     execute_window_loop(window, [&](const Coordinates & id)
     {
         const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
-        const int     in_yi      = (id.z() + 0.5f) * hr;
+        const int     in_yi      = std::floor((id.z() + sampling_offset) * hr);
         const int     offset_row = in_yi * stride_h;
         int32_t       x          = window_start_x;
         for(; x < window_end_x - window_step_x; x += window_step_x)
@@ -459,7 +459,7 @@
                 const auto           offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
                 const uint8_t *const in_ptr      = in.ptr();
 
-                const int in_yi         = std::floor((id.y() + 0.5f) * hr);
+                const int in_yi         = std::floor((id.y() + _sampling_offset) * hr);
                 const int in_yi_clamped = std::min(static_cast<int>(_input->info()->dimension(1)), std::max(in_yi, -1));
                 ARM_COMPUTE_ERROR_ON(in_yi_clamped < -1 || in_yi_clamped > static_cast<int>(_input->info()->dimension(1)));
                 const int offset_row = in_yi_clamped * input_stride;
@@ -500,7 +500,7 @@
             {
                 const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
 
-                const int in_yi      = (id.y() + 0.5f) * hr;
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr);
                 const int offset_row = in_yi * input_stride;
 
                 tmp.val[0] = vsetq_lane_s16(*reinterpret_cast<const int16_t *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
@@ -541,7 +541,7 @@
             {
                 const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
 
-                const int in_yi      = (id.y() + 0.5f) * hr;
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr);
                 const int offset_row = in_yi * input_stride;
 
                 tmp.val[0] = vsetq_lane_f16(*reinterpret_cast<const __fp16 *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
@@ -584,7 +584,7 @@
             {
                 const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets.ptr());
 
-                const int in_yi      = (id.y() + 0.5f) * hr;
+                const int in_yi      = std::floor((id.y() + _sampling_offset) * hr);
                 const int offset_row = in_yi * input_stride;
 
                 tmp.val[0] = vsetq_lane_f32(*reinterpret_cast<const float *>(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0);
@@ -614,7 +614,6 @@
         }
         default:
             ARM_COMPUTE_ERROR("Not supported");
-            break;
     }
 }
 
@@ -936,7 +935,7 @@
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<uint8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c);
+                scale_nearest_nhwc_core<uint8_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
             }
             else
             {
@@ -949,7 +948,7 @@
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<int16_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c);
+                scale_nearest_nhwc_core<int16_t>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
             }
             else
             {
@@ -964,7 +963,7 @@
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
                 scale_nearest_nhwc_core<float16_t>(_input, _offsets, _output, hr,
-                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c);
+                                                   window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
             }
             else
             {
@@ -978,7 +977,7 @@
         {
             if(_policy == InterpolationPolicy::NEAREST_NEIGHBOR)
             {
-                scale_nearest_nhwc_core<float>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c);
+                scale_nearest_nhwc_core<float>(_input, _offsets, _output, hr, window, win_in, input_stride_w, input_stride_h, input_stride_c, _sampling_offset);
             }
             else
             {
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 425ee6c..be643b3 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -83,7 +83,7 @@
 
         execute_window_loop(win, [&](const Coordinates & id)
         {
-            const size_t in_xi = (id.x() + 0.5f) * wr;
+            const size_t in_xi = std::floor((id.x() + sampling_offset) * wr);
 
             *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi * input_element_size;
         },
diff --git a/tests/validation/reference/CropResize.cpp b/tests/validation/reference/CropResize.cpp
index 8cfce97..f25a031 100644
--- a/tests/validation/reference/CropResize.cpp
+++ b/tests/validation/reference/CropResize.cpp
@@ -59,8 +59,8 @@
             case InterpolationPolicy::NEAREST_NEIGHBOR:
             {
                 //Calculate the source coords without -0.5f is equivalent to round the x_scr/y_src coords
-                float x_src = (idw + 0.5f) * wr;
-                float y_src = (idh + 0.5f) * hr;
+                float x_src = std::floor(idw * wr);
+                float y_src = std::floor(idh * hr);
                 in_id.set(1, x_src);
                 in_id.set(2, y_src);
 
diff --git a/tests/validation/reference/Scale.cpp b/tests/validation/reference/Scale.cpp
index 84f4fb8..63a2853 100644
--- a/tests/validation/reference/Scale.cpp
+++ b/tests/validation/reference/Scale.cpp
@@ -71,28 +71,25 @@
         float       x_src = 0;
         float       y_src = 0;
 
-        switch(sampling_policy)
-        {
-            case SamplingPolicy::TOP_LEFT:
-                x_src = idx * wr;
-                y_src = idy * hr;
-                break;
-            case SamplingPolicy::CENTER:
-                x_src = (idx + 0.5f) * wr - 0.5f;
-                y_src = (idy + 0.5f) * hr - 0.5f;
-                break;
-            default:
-                ARM_COMPUTE_ERROR("Unsupported sampling policy.");
-                break;
-        }
-
         switch(policy)
         {
             case InterpolationPolicy::NEAREST_NEIGHBOR:
             {
-                //Calculate the source coords without -0.5f is equivalent to round the x_scr/y_src coords
-                x_src = (idx + 0.5f) * wr;
-                y_src = (idy + 0.5f) * hr;
+                switch(sampling_policy)
+                {
+                    case SamplingPolicy::TOP_LEFT:
+                        x_src = std::floor(idx * wr);
+                        y_src = std::floor(idy * hr);
+                        break;
+                    case SamplingPolicy::CENTER:
+                        //Calculate the source coords without -0.5f is equivalent to round the x_scr/y_src coords
+                        x_src = (idx + 0.5f) * wr;
+                        y_src = (idy + 0.5f) * hr;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported sampling policy.");
+                }
+
                 id.set(0, x_src);
                 id.set(1, y_src);
 
@@ -105,6 +102,20 @@
             }
             case InterpolationPolicy::BILINEAR:
             {
+                switch(sampling_policy)
+                {
+                    case SamplingPolicy::TOP_LEFT:
+                        x_src = idx * wr;
+                        y_src = idy * hr;
+                        break;
+                    case SamplingPolicy::CENTER:
+                        x_src = (idx + 0.5f) * wr - 0.5f;
+                        y_src = (idy + 0.5f) * hr - 0.5f;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported sampling policy.");
+                }
+
                 id.set(0, std::floor(x_src));
                 id.set(1, std::floor(y_src));
                 if(is_valid_pixel_index(x_src, y_src, width, height, border_size))