Introduce Context opaque object of the new interface

An AclContext is introduced as part of the new interface.
This object is responsible for any constructural services that the
operators and other objects might need.

Main options that can be passed to a context object are:
- a target: for which all the subsequent object should bind with
- capabilities: which are the isa/target features to enable
- a mode: for which different strategies can be selected in the backend

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I315549e55d4d064cbe94dfa29d070dc281b447de
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5088
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp
index 27605e1..d98694f 100644
--- a/src/core/NEON/kernels/NELogicalKernel.cpp
+++ b/src/core/NEON/kernels/NELogicalKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/common/Validate.h"
+#include "src/common/utils/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
index 4108d79..c105ada 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
@@ -27,7 +27,6 @@
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
index 1fdc5bd..4a90a21 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
@@ -27,7 +27,6 @@
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
index 5638dce..3e3e81d 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
@@ -25,7 +25,6 @@
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/SVEMath.h"
-#include "src/core/common/Validate.h"
 
 #include <cmath>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
index 51397ac..b0d4cbb 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
@@ -25,7 +25,6 @@
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/SVEMath.h"
-#include "src/core/common/Validate.h"
 
 #include <cmath>
 #include <cstddef>
diff --git a/src/core/NEON/kernels/scale/impl/NEON/fp16.cpp b/src/core/NEON/kernels/scale/impl/NEON/fp16.cpp
new file mode 100644
index 0000000..0ad66ca
--- /dev/null
+++ b/src/core/NEON/kernels/scale/impl/NEON/fp16.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Rounding.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+
+namespace arm_compute
+{
+namespace
+{
+void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
+                             float sampling_offset, bool align_corners, const Window &window)
+{
+    const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+    const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+    const size_t in_stride_wc = in_stride_w * in_stride_c;
+    const size_t in_dim_h     = src->info()->dimension(2);
+
+    // Compute the ratio between source height and destination height
+    const auto hr             = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+    const auto window_start_x = static_cast<int32_t>(window.x().start());
+    const auto window_end_x   = static_cast<int32_t>(window.x().end());
+    const int  window_step_x  = 8;
+
+    Window win(window);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    Iterator out(dst, win);
+
+    const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
+    const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const int32_t    offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+        const auto       in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
+        const int        offset_row = in_hi * in_stride_wc;
+        int32_t          x          = window_start_x;
+        const float16_t *in_ptr     = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+        for(; x <= window_end_x - window_step_x; x += window_step_x)
+        {
+            wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
+                            wrapper::vloadq(in_ptr + offset + offset_row + x));
+        }
+        for(; x < window_end_x; ++x)
+        {
+            *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+        }
+    },
+    out);
+}
+
+void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
+                              BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
+                              bool align_corners, const Window &window)
+{
+    // Compute the ratio between source height and destination height
+    const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+    Iterator  out(dst, window);
+    const int in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+    const int in_dim_w     = src->info()->dimension(1);
+    const int in_dim_h     = src->info()->dimension(2);
+    const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
+
+    // Don't increment in Y and Z direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    Iterator in(src, win_in);
+
+    if(border_mode == BorderMode::CONSTANT)
+    {
+        using ConstType = typename std::conditional<std::is_same<float16_t, float16_t>::value, half, float16_t>::type;
+
+        const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>());
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const auto       offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto       dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto       dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+            const int32_t    in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+            const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+            const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+            const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
+            const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
+            const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
+
+            *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, out);
+    }
+    else if(border_mode == BorderMode::REPLICATE)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+            const int  in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+
+            auto clamped_w  = utility::clamp<int>(offset, 0, in_dim_w - 1);
+            auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+            auto clamped_h  = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+            auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+            const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
+            const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
+            const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
+            const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
+
+            *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+}
+namespace cpu
+{
+void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
+                     InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
+                     bool align_corners, const Window &window)
+{
+    if(policy == InterpolationPolicy::BILINEAR)
+    {
+        fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+    }
+    else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    {
+        fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+    }
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
diff --git a/src/core/NEON/kernels/scale/impl/NEON/integer.cpp b/src/core/NEON/kernels/scale/impl/NEON/integer.cpp
new file mode 100644
index 0000000..a2359aa
--- /dev/null
+++ b/src/core/NEON/kernels/scale/impl/NEON/integer.cpp
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/Window.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/utils/ScaleUtils.h"
+#include "support/Rounding.h"
+
+#include <arm_neon.h>
+#include <cmath>
+#include <cstddef>
+
+namespace arm_compute
+{
+namespace
+{
+void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
+                           float sampling_offset, bool align_corners, const Window &window)
+{
+    const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+    const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+    const size_t in_stride_wc = in_stride_w * in_stride_c;
+    const size_t in_dim_h     = src->info()->dimension(2);
+
+    // Compute the ratio between source height and destination height
+    const auto hr             = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+    const auto window_start_x = static_cast<int32_t>(window.x().start());
+    const auto window_end_x   = static_cast<int32_t>(window.x().end());
+    const int  window_step_x  = 16;
+
+    Window win(window);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    Iterator out(dst, win);
+
+    const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
+    const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const int32_t  offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+        const auto     in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
+        const int      offset_row = in_hi * in_stride_wc;
+        int32_t        x          = window_start_x;
+        const uint8_t *in_ptr     = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+        for(; x <= window_end_x - window_step_x; x += window_step_x)
+        {
+            wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x,
+                            wrapper::vloadq(in_ptr + offset + offset_row + x));
+        }
+        for(; x < window_end_x; ++x)
+        {
+            *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+        }
+    },
+    out);
+}
+
+void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
+                            BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
+                            bool align_corners, const Window &window)
+{
+    // Compute the ratio between source height and destination height
+    const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+    Iterator  out(dst, window);
+    const int in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+    const int in_dim_w     = src->info()->dimension(1);
+    const int in_dim_h     = src->info()->dimension(2);
+    const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
+
+    // Don't increment in Y and Z direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    Iterator in(src, win_in);
+
+    if(border_mode == BorderMode::CONSTANT)
+    {
+        const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const auto     offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto     dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto     dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+            const int32_t  in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+            const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+            const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+            const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
+            const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
+            const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
+
+            *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, out);
+    }
+    else if(border_mode == BorderMode::REPLICATE)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+            const int  in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+
+            auto clamped_w  = utility::clamp<int>(offset, 0, in_dim_w - 1);
+            auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+            auto clamped_h  = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+            auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+            const auto a00 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
+            const auto a01 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
+            const auto a10 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
+            const auto a11 = *(reinterpret_cast<const uint8_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
+
+            *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+
+void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
+                            float sampling_offset, bool align_corners, const Window &window)
+{
+    const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+    const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
+    const size_t in_stride_wc = in_stride_w * in_stride_c;
+    const size_t in_dim_h     = src->info()->dimension(2);
+
+    // Compute the ratio between source height and destination height
+    const auto hr             = scale_utils::calculate_resize_ratio(in_dim_h, dst->info()->dimension(2), align_corners);
+    const auto window_start_x = static_cast<int32_t>(window.x().start());
+    const auto window_end_x   = static_cast<int32_t>(window.x().end());
+    const int  window_step_x  = 8;
+
+    Window win(window);
+    win.set(Window::DimX, Window::Dimension(0, 1, 1));
+    Iterator out(dst, win);
+
+    const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
+    const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const int32_t  offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+        const auto     in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
+        const int      offset_row = in_hi * in_stride_wc;
+        int32_t        x          = window_start_x;
+        const int16_t *in_ptr     = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+
+        for(; x <= window_end_x - window_step_x; x += window_step_x)
+        {
+            wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x,
+                            wrapper::vloadq(in_ptr + offset + offset_row + x));
+        }
+        for(; x < window_end_x; ++x)
+        {
+            *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+        }
+    },
+    out);
+}
+
+void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
+                             BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
+                             bool align_corners, const Window &window)
+{
+    // Compute the ratio between source height and destination height
+    const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+
+    Iterator  out(dst, window);
+    const int in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+    const int in_dim_w     = src->info()->dimension(1);
+    const int in_dim_h     = src->info()->dimension(2);
+    const int in_stride_wc = in_stride_c * (in_dim_w + src->info()->padding().top + src->info()->padding().bottom);
+
+    // Don't increment in Y and Z direction for the input tensor
+    // A pointer to the start of this plane is needed as base for the precomputed offsets
+    Window win_in(window);
+    win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
+    Iterator in(src, win_in);
+
+    if(border_mode == BorderMode::CONSTANT)
+    {
+        const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>());
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const auto     offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto     dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto     dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+            const int32_t  in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+            const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+
+            const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+            const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
+            const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
+            const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
+
+            *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, out);
+    }
+    else if(border_mode == BorderMode::REPLICATE)
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+            const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+            const int  in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+
+            auto clamped_w  = utility::clamp<int>(offset, 0, in_dim_w - 1);
+            auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+            auto clamped_h  = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+            auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+
+            const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
+            const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
+            const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
+            const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
+
+            *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+        },
+        in, out);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not implemented");
+    }
+}
+}
+namespace cpu
+{
+void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
+                   InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
+                   bool align_corners, const Window &window)
+{
+    if(policy == InterpolationPolicy::BILINEAR)
+    {
+        u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+    }
+    else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    {
+        u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+    }
+}
+
+void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
+                    InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
+                    bool align_corners, const Window &window)
+{
+    if(policy == InterpolationPolicy::BILINEAR)
+    {
+        s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+    }
+    else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    {
+        s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
+    }
+}
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/scale/impl/NEON/list.h b/src/core/NEON/kernels/scale/impl/NEON/list.h
index 11cac26..c91242f 100644
--- a/src/core/NEON/kernels/scale/impl/NEON/list.h
+++ b/src/core/NEON/kernels/scale/impl/NEON/list.h
@@ -29,7 +29,6 @@
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 #include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
diff --git a/src/core/NEON/kernels/scale/impl/NEON/qasymm8_signed.cpp b/src/core/NEON/kernels/scale/impl/NEON/qasymm8_signed.cpp
index 7b18608..149cdf4 100644
--- a/src/core/NEON/kernels/scale/impl/NEON/qasymm8_signed.cpp
+++ b/src/core/NEON/kernels/scale/impl/NEON/qasymm8_signed.cpp
@@ -21,7 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-
 #include "src/core/NEON/kernels/scale/impl/NEON/list.h"
 
 namespace arm_compute
diff --git a/src/core/NEON/kernels/scale/impl/SVE/fp16.cpp b/src/core/NEON/kernels/scale/impl/SVE/fp16.cpp
index 91c3dc3..99f08db 100644
--- a/src/core/NEON/kernels/scale/impl/SVE/fp16.cpp
+++ b/src/core/NEON/kernels/scale/impl/SVE/fp16.cpp
@@ -26,7 +26,6 @@
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 #include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
diff --git a/src/core/NEON/kernels/scale/impl/SVE/fp32.cpp b/src/core/NEON/kernels/scale/impl/SVE/fp32.cpp
index abb4faa..94055ae 100644
--- a/src/core/NEON/kernels/scale/impl/SVE/fp32.cpp
+++ b/src/core/NEON/kernels/scale/impl/SVE/fp32.cpp
@@ -26,7 +26,6 @@
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 #include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
diff --git a/src/core/NEON/kernels/scale/impl/SVE/integer.cpp b/src/core/NEON/kernels/scale/impl/SVE/integer.cpp
index 5f5263c..2a724ec 100644
--- a/src/core/NEON/kernels/scale/impl/SVE/integer.cpp
+++ b/src/core/NEON/kernels/scale/impl/SVE/integer.cpp
@@ -26,7 +26,6 @@
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 #include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
diff --git a/src/core/NEON/kernels/scale/impl/SVE/qasymm8.cpp b/src/core/NEON/kernels/scale/impl/SVE/qasymm8.cpp
index fc65ff4..c475ad6 100644
--- a/src/core/NEON/kernels/scale/impl/SVE/qasymm8.cpp
+++ b/src/core/NEON/kernels/scale/impl/SVE/qasymm8.cpp
@@ -26,7 +26,6 @@
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 #include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
diff --git a/src/core/NEON/kernels/scale/impl/SVE/qasymm8_signed.cpp b/src/core/NEON/kernels/scale/impl/SVE/qasymm8_signed.cpp
index 676ca94..b39b75a 100644
--- a/src/core/NEON/kernels/scale/impl/SVE/qasymm8_signed.cpp
+++ b/src/core/NEON/kernels/scale/impl/SVE/qasymm8_signed.cpp
@@ -26,7 +26,6 @@
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 #include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
diff --git a/src/core/common/Validate.h b/src/core/common/Validate.h
deleted file mode 100644
index fa24bf5..0000000
--- a/src/core/common/Validate.h
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#ifndef SRC_CORE_COMMON_VALIDATE_H
-#define SRC_CORE_COMMON_VALIDATE_H
-
-#if defined(ARM_COMPUTE_ASSERTS_ENABLED)
-
-#include <cassert>
-
-#define ARM_COMPUTE_ASSERT(cond) assert(cond)
-#define ARM_COMPUTE_ASSERT_NOT_NULLPTR(ptr) assert((ptr) != nullptr)
-
-#else /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */
-
-#define ARM_COMPUTE_ASSERT(cond)
-#define ARM_COMPUTE_ASSERT_NOT_NULLPTR(ptr)
-
-#endif /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */
-#endif /* SRC_CORE_COMMON_VALIDATE_H */
diff --git a/src/core/cpu/kernels/activation/NEON/fp16.cpp b/src/core/cpu/kernels/activation/NEON/fp16.cpp
index 7fe4ab3..0ddd43e 100644
--- a/src/core/cpu/kernels/activation/NEON/fp16.cpp
+++ b/src/core/cpu/kernels/activation/NEON/fp16.cpp
@@ -26,7 +26,6 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/cpu/kernels/activation/NEON/fp32.cpp b/src/core/cpu/kernels/activation/NEON/fp32.cpp
index f1f2753..244ca57 100644
--- a/src/core/cpu/kernels/activation/NEON/fp32.cpp
+++ b/src/core/cpu/kernels/activation/NEON/fp32.cpp
@@ -26,7 +26,6 @@
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/cpu/kernels/activation/NEON/qasymm8.cpp b/src/core/cpu/kernels/activation/NEON/qasymm8.cpp
index 7506a82..a121743 100644
--- a/src/core/cpu/kernels/activation/NEON/qasymm8.cpp
+++ b/src/core/cpu/kernels/activation/NEON/qasymm8.cpp
@@ -27,7 +27,6 @@
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp
index 8f75abe..8b40bf8 100644
--- a/src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp
+++ b/src/core/cpu/kernels/activation/NEON/qasymm8_signed.cpp
@@ -26,7 +26,6 @@
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/cpu/kernels/activation/NEON/qsymm16.cpp b/src/core/cpu/kernels/activation/NEON/qsymm16.cpp
index 9eee360..54b4182 100644
--- a/src/core/cpu/kernels/activation/NEON/qsymm16.cpp
+++ b/src/core/cpu/kernels/activation/NEON/qsymm16.cpp
@@ -28,7 +28,6 @@
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/NESymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/cpu/kernels/activation/SVE/fp16.cpp b/src/core/cpu/kernels/activation/SVE/fp16.cpp
index 8208813..bf31fd7 100644
--- a/src/core/cpu/kernels/activation/SVE/fp16.cpp
+++ b/src/core/cpu/kernels/activation/SVE/fp16.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/common/Validate.h"
 
 #include <cmath>
 #include <cstddef>
diff --git a/src/core/cpu/kernels/activation/SVE/fp32.cpp b/src/core/cpu/kernels/activation/SVE/fp32.cpp
index 55bdc99..75f9f8a 100644
--- a/src/core/cpu/kernels/activation/SVE/fp32.cpp
+++ b/src/core/cpu/kernels/activation/SVE/fp32.cpp
@@ -25,7 +25,6 @@
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/SVEMath.h"
-#include "src/core/common/Validate.h"
 
 #include <cmath>
 #include <cstddef>
diff --git a/src/core/cpu/kernels/activation/SVE/qasymm8.cpp b/src/core/cpu/kernels/activation/SVE/qasymm8.cpp
index 3d9476a..228b4ae 100644
--- a/src/core/cpu/kernels/activation/SVE/qasymm8.cpp
+++ b/src/core/cpu/kernels/activation/SVE/qasymm8.cpp
@@ -24,7 +24,6 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/common/Validate.h"
 
 #include <cmath>
 #include <cstddef>
diff --git a/src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp b/src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp
index 0b3d798..989f825 100644
--- a/src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp
+++ b/src/core/cpu/kernels/activation/SVE/qasymm8_signed.cpp
@@ -24,7 +24,6 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Validate.h"
 
 #include <cmath>
 #include <cstddef>
diff --git a/src/core/cpu/kernels/activation/SVE/qsymm16.cpp b/src/core/cpu/kernels/activation/SVE/qsymm16.cpp
index dbaf267..6697487 100644
--- a/src/core/cpu/kernels/activation/SVE/qsymm16.cpp
+++ b/src/core/cpu/kernels/activation/SVE/qsymm16.cpp
@@ -25,7 +25,6 @@
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/experimental/Types.h"
-#include "src/core/common/Validate.h"
 
 #include <cmath>
 #include <cstddef>
diff --git a/src/core/cpu/kernels/floor/NEON/fp16.cpp b/src/core/cpu/kernels/floor/NEON/fp16.cpp
index 0d31eb7..f362676 100644
--- a/src/core/cpu/kernels/floor/NEON/fp16.cpp
+++ b/src/core/cpu/kernels/floor/NEON/fp16.cpp
@@ -23,8 +23,8 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 
+#include "src/common/utils/Validate.h"
 #include "src/core/NEON/NEMath.h"
-#include "src/core/common/Validate.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/cpu/kernels/floor/NEON/fp32.cpp b/src/core/cpu/kernels/floor/NEON/fp32.cpp
index dd63f9f..f5efb2e 100644
--- a/src/core/cpu/kernels/floor/NEON/fp32.cpp
+++ b/src/core/cpu/kernels/floor/NEON/fp32.cpp
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "src/common/utils/Validate.h"
 #include "src/core/NEON/NEMath.h"
-#include "src/core/common/Validate.h"
 
 #include <arm_neon.h>
 #include <cmath>
diff --git a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
index 7d204b1..8f12eb2 100644
--- a/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
+++ b/src/core/gpu/cl/kernels/ClElementwiseKernel.cpp
@@ -25,8 +25,8 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "src/common/utils/Validate.h"
 #include "src/core/CL/CLValidate.h"
-#include "src/core/common/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"