Port Arm(R) Neon(TM) Quantization to new API

Partially resolves: COMPMID-4193

Change-Id: I91dc964d4308687e76127c305a6bedca796f8ba0
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5246
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
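
For reviewers, a minimal usage sketch of the ported function-level API (a sketch only: the shapes,
quantization parameters and the helper name below are illustrative and not part of this patch):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    // Quantize an F32 tensor to QASYMM8 through NEQuantizationLayer, which after
    // this patch forwards to the new cpu::CpuQuantization operator.
    void quantize_example()
    {
        Tensor src{}, dst{};
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(16U, 16U, 3U), 1, DataType::QASYMM8,
                                         QuantizationInfo(1.f / 255.f, 0)));

        NEQuantizationLayer quant;
        quant.configure(&src, &dst); // validates and builds the underlying operator

        src.allocator()->allocate();
        dst.allocator()->allocate();

        quant.run(); // packs ACL_SRC/ACL_DST into an ITensorPack and dispatches the kernel
    }
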
diff --git a/Android.bp b/Android.bp
index 8fd4751..a2df76e 100644
--- a/Android.bp
+++ b/Android.bp
@@ -229,7 +229,6 @@
         "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp",
         "src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp",
         "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp",
-        "src/core/NEON/kernels/NEQuantizationLayerKernel.cpp",
         "src/core/NEON/kernels/NEROIAlignLayerKernel.cpp",
         "src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp",
         "src/core/NEON/kernels/NERangeKernel.cpp",
@@ -341,6 +340,7 @@
         "src/core/cpu/kernels/CpuPermuteKernel.cpp",
         "src/core/cpu/kernels/CpuPoolingAssemblyWrapperKernel.cpp",
         "src/core/cpu/kernels/CpuPoolingKernel.cpp",
+        "src/core/cpu/kernels/CpuQuantizationKernel.cpp",
         "src/core/cpu/kernels/CpuReshapeKernel.cpp",
         "src/core/cpu/kernels/CpuSoftmaxKernel.cpp",
         "src/core/cpu/kernels/CpuSubKernel.cpp",
@@ -664,6 +664,7 @@
         "src/runtime/cpu/operators/CpuPermute.cpp",
         "src/runtime/cpu/operators/CpuPooling.cpp",
         "src/runtime/cpu/operators/CpuPoolingAssemblyDispatch.cpp",
+        "src/runtime/cpu/operators/CpuQuantization.cpp",
         "src/runtime/cpu/operators/CpuReshape.cpp",
         "src/runtime/cpu/operators/CpuSoftmax.cpp",
         "src/runtime/cpu/operators/CpuSub.cpp",
diff --git a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
index 90e2307..979b3ba 100644
--- a/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGenerateProposalsLayer.h
@@ -44,14 +44,14 @@
 
 /** Basic function to generate proposals for a RPN (Region Proposal Network)
  *
- * This function calls the following Neon kernels:
+ * This function calls the following Arm(R) Neon(TM) layers/kernels:
  * -# @ref NEComputeAllAnchorsKernel
  * -# @ref NEPermute x 2
  * -# @ref NEReshapeLayer x 2
  * -# @ref NEBoundingBoxTransform
  * -# @ref NEPadLayerKernel
  * -# @ref NEDequantizationLayerKernel x 2
- * -# @ref NEQuantizationLayerKernel
+ * -# @ref NEQuantizationLayer
  * And the following CPP kernels:
  * -# @ref CPPBoxWithNonMaximaSuppressionLimit
  */
@@ -113,7 +113,7 @@
     // Memory group manager
     MemoryGroup _memory_group;
 
-    // Neon kernels
+    // kernels/layers
     NEPermute                                  _permute_deltas;
     NEReshapeLayer                             _flatten_deltas;
     NEPermute                                  _permute_scores;
diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
index 8b0532b..54ec76b 100644
--- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
@@ -24,26 +24,37 @@
 #ifndef ARM_COMPUTE_NEQUANTIZATIONLAYER_H
 #define ARM_COMPUTE_NEQUANTIZATIONLAYER_H
 
-#include "arm_compute/runtime/IFunction.h"
-
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
-
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+
+#include <memory>
 
 namespace arm_compute
 {
 class ITensor;
 class ITensorInfo;
 
-/** Basic function to simulate a quantization layer. This function calls the following Neon kernels:
+/** Basic function to simulate a quantization layer. This function calls the following Arm(R) Neon(TM) implementation layers:
  *
  *
- * -# @ref NEQuantizationLayerKernel
+ * -# @ref cpu::CpuQuantization
  *
  */
-class NEQuantizationLayer : public INESimpleFunctionNoBorder
+class NEQuantizationLayer : public IFunction
 {
 public:
+    NEQuantizationLayer();
+    /** Default Destructor */
+    ~NEQuantizationLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEQuantizationLayer(const NEQuantizationLayer &) = delete;
+    /** Default move constructor */
+    NEQuantizationLayer(NEQuantizationLayer &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEQuantizationLayer &operator=(const NEQuantizationLayer &) = delete;
+    /** Default move assignment operator */
+    NEQuantizationLayer &operator=(NEQuantizationLayer &&) = default;
     /** Set the input and output tensors.
      *
      * @param[in]  input  Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
@@ -58,6 +69,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEQUANTIZATIONLAYER_H */
diff --git a/docs/00_introduction.dox b/docs/00_introduction.dox
index eb8256f..1fdc688 100644
--- a/docs/00_introduction.dox
+++ b/docs/00_introduction.dox
@@ -1298,7 +1298,7 @@
     - @ref NEDequantizationLayerKernel / @ref NEDequantizationLayer
     - NEFloorKernel / @ref NEFloor
     - @ref NEL2NormalizeLayerKernel / @ref NEL2NormalizeLayer
-    - @ref NEQuantizationLayerKernel @ref NEMinMaxLayerKernel / @ref NEQuantizationLayer
+    - NEQuantizationLayerKernel @ref NEMinMaxLayerKernel / @ref NEQuantizationLayer
     - @ref NEROIPoolingLayerKernel / @ref NEROIPoolingLayer
     - @ref NEReductionOperationKernel / @ref NEReductionOperation
     - NEReshapeLayerKernel / @ref NEReshapeLayer
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index b2c5592..224634f 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -76,7 +76,6 @@
 #include "src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
 #include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
-#include "src/core/NEON/kernels/NEQuantizationLayerKernel.h"
 #include "src/core/NEON/kernels/NEROIAlignLayerKernel.h"
 #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 #include "src/core/NEON/kernels/NERangeKernel.h"
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.h b/src/core/NEON/kernels/NEQuantizationLayerKernel.h
deleted file mode 100644
index 5ee0ed4..0000000
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#ifndef ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H
-#define ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H
-
-#include "src/core/NEON/INEKernel.h"
-
-namespace arm_compute
-{
-class ITensor;
-
-/** Interface for the quantization layer kernel.
- *
- * @note The implementation supports only 3D input tensors
- *
- */
-class NEQuantizationLayerKernel : public INEKernel
-{
-public:
-    const char *name() const override
-    {
-        return "NEQuantizationLayerKernel";
-    }
-    /** Default constructor */
-    NEQuantizationLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEQuantizationLayerKernel(const NEQuantizationLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEQuantizationLayerKernel &operator=(const NEQuantizationLayerKernel &) = delete;
-    /** Default Move Constructor. */
-    NEQuantizationLayerKernel(NEQuantizationLayerKernel &&) = default;
-    /** Default move assignment operator */
-    NEQuantizationLayerKernel &operator=(NEQuantizationLayerKernel &&) = default;
-    /** Default destructor */
-    ~NEQuantizationLayerKernel() = default;
-    /** Set the input, output.
-     *
-     * @param[in]  input  Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @note Output auto initialization is not supported by this kernel
-     */
-    void configure(const ITensor *input, ITensor *output);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEQuantizationLayerKernel
-     *
-     * @param[in] input  Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
-     * @param[in] output Output tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
-
-    // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
-
-private:
-    /** Common signature for all the specialised @ref NEQuantizationLayerKernel functions
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    using QuantizationFunctionExecutorPtr = void (NEQuantizationLayerKernel::*)(const Window &window);
-    /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename TIn, typename TOut>
-    void run_quantize_qasymm8(const Window &window);
-    /** Function to apply QASYMM16 quantization on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename T>
-    void run_quantize_qasymm16(const Window &window);
-
-    const ITensor *_input;
-    ITensor       *_output;
-
-    QuantizationFunctionExecutorPtr _func;
-};
-} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/cpu/kernels/CpuQuantizationKernel.cpp
similarity index 63%
rename from src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
rename to src/core/cpu/kernels/CpuQuantizationKernel.cpp
index 6beb69f..9b1e017 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/cpu/kernels/CpuQuantizationKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/NEON/kernels/NEQuantizationLayerKernel.h"
+#include "src/core/cpu/kernels/CpuQuantizationKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -41,18 +41,22 @@
 
 namespace arm_compute
 {
+namespace cpu
+{
+namespace kernels
+{
 namespace
 {
 constexpr auto window_step = 16;
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
 
     return Status{};
 }
@@ -104,43 +108,40 @@
 
 } // namespace
 
-NEQuantizationLayerKernel::NEQuantizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _func(nullptr)
+CpuQuantizationKernel::CpuQuantizationKernel()
+    : _func(nullptr)
 {
 }
 
-void NEQuantizationLayerKernel::configure(const ITensor *input, ITensor *output)
+void CpuQuantizationKernel::configure(ITensorInfo *src, ITensorInfo *dst)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
-
-    _input  = input;
-    _output = output;
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
 
     static const std::map<std::string, QuantizationFunctionExecutorPtr> quant_map =
     {
-        { "op_QASYMM8_QASYMM8", &NEQuantizationLayerKernel::run_quantize_qasymm8<uint8_t, uint8_t> },
-        { "op_QASYMM8_QASYMM8_SIGNED", &NEQuantizationLayerKernel::run_quantize_qasymm8<uint8_t, int8_t> },
-        { "op_QASYMM8_QASYMM16", &NEQuantizationLayerKernel::run_quantize_qasymm16<uint8_t> },
+        { "op_QASYMM8_QASYMM8", &CpuQuantizationKernel::run_quantize_qasymm8<uint8_t, uint8_t> },
+        { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizationKernel::run_quantize_qasymm8<uint8_t, int8_t> },
+        { "op_QASYMM8_QASYMM16", &CpuQuantizationKernel::run_quantize_qasymm16<uint8_t> },
 
-        { "op_QASYMM8_SIGNED_QASYMM8", &NEQuantizationLayerKernel::run_quantize_qasymm8<int8_t, uint8_t> },
-        { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &NEQuantizationLayerKernel::run_quantize_qasymm8<int8_t, int8_t> },
-        { "op_QASYMM8_SIGNED_QASYMM16", &NEQuantizationLayerKernel::run_quantize_qasymm16<int8_t> },
+        { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizationKernel::run_quantize_qasymm8<int8_t, uint8_t> },
+        { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizationKernel::run_quantize_qasymm8<int8_t, int8_t> },
+        { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizationKernel::run_quantize_qasymm16<int8_t> },
 
-        { "op_F32_QASYMM8", &NEQuantizationLayerKernel::run_quantize_qasymm8<float, uint8_t> },
-        { "op_F32_QASYMM8_SIGNED", &NEQuantizationLayerKernel::run_quantize_qasymm8<float, int8_t> },
-        { "op_F32_QASYMM16", &NEQuantizationLayerKernel::run_quantize_qasymm16<float> },
+        { "op_F32_QASYMM8", &CpuQuantizationKernel::run_quantize_qasymm8<float, uint8_t> },
+        { "op_F32_QASYMM8_SIGNED", &CpuQuantizationKernel::run_quantize_qasymm8<float, int8_t> },
+        { "op_F32_QASYMM16", &CpuQuantizationKernel::run_quantize_qasymm16<float> },
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        { "op_F16_QASYMM8", &NEQuantizationLayerKernel::run_quantize_qasymm8<float16_t, uint8_t> },
-        { "op_F16_QASYMM8_SIGNED", &NEQuantizationLayerKernel::run_quantize_qasymm8<float16_t, int8_t> },
-        { "op_F16_QASYMM16", &NEQuantizationLayerKernel::run_quantize_qasymm16<float16_t> },
+        { "op_F16_QASYMM8", &CpuQuantizationKernel::run_quantize_qasymm8<float16_t, uint8_t> },
+        { "op_F16_QASYMM8_SIGNED", &CpuQuantizationKernel::run_quantize_qasymm8<float16_t, int8_t> },
+        { "op_F16_QASYMM16", &CpuQuantizationKernel::run_quantize_qasymm16<float16_t> },
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
     };
 
     std::string function_to_call("op_");
-    function_to_call += string_from_data_type(_input->info()->data_type()) + "_";
-    function_to_call += string_from_data_type(_output->info()->data_type());
+    function_to_call += string_from_data_type(src->data_type()) + "_";
+    function_to_call += string_from_data_type(dst->data_type());
 
     auto it = quant_map.find(function_to_call);
 
@@ -151,26 +152,25 @@
     _func = it->second;
 
     // Configure kernel window
-    Window win_config = calculate_max_window(*input->info(), Steps());
-
-    INEKernel::configure(win_config);
+    Window win_config = calculate_max_window(*src, Steps());
+    ICpuKernel::configure(win_config);
 }
 
-Status NEQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CpuQuantizationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
     return Status{};
 }
 
 template <typename TIn, typename TOut>
-void NEQuantizationLayerKernel::run_quantize_qasymm8(const Window &window)
+void CpuQuantizationKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
 {
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
-    const UniformQuantizationInfo uqinfo_in = _input->info()->quantization_info().uniform();
-    UniformQuantizationInfo       uqinfo    = _output->info()->quantization_info().uniform();
-    if(is_data_type_quantized_asymmetric(_input->info()->data_type()))
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
+    if(is_data_type_quantized_asymmetric(src->info()->data_type()))
     {
         uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
     }
@@ -184,8 +184,8 @@
     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(_input, win_collapsed);
-    Iterator output(_output, win_collapsed);
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
     execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
         auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
@@ -206,14 +206,14 @@
 }
 
 template <typename T>
-void NEQuantizationLayerKernel::run_quantize_qasymm16(const Window &window)
+void CpuQuantizationKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
 {
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
-    const UniformQuantizationInfo uqinfo_in = _input->info()->quantization_info().uniform();
-    UniformQuantizationInfo       uqinfo    = _output->info()->quantization_info().uniform();
-    if(is_data_type_quantized_asymmetric(_input->info()->data_type()))
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
+    if(is_data_type_quantized_asymmetric(src->info()->data_type()))
     {
         uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
     }
@@ -227,8 +227,8 @@
     Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    Iterator input(_input, win_collapsed);
-    Iterator output(_output, win_collapsed);
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
     execute_window_loop(win_collapsed, [&](const Coordinates &)
     {
         auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
@@ -250,13 +250,22 @@
     input, output);
 }
 
-void NEQuantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
+void CpuQuantizationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (this->*_func)(window);
+    const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto       dst = tensors.get_tensor(TensorType::ACL_DST);
+    (this->*_func)(src, dst, window);
 }
-} // namespace arm_compute
+
+const char *CpuQuantizationKernel::name() const
+{
+    return "CpuQuantizationKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/cpu/kernels/CpuQuantizationKernel.h b/src/core/cpu/kernels/CpuQuantizationKernel.h
new file mode 100644
index 0000000..74fd31f
--- /dev/null
+++ b/src/core/cpu/kernels/CpuQuantizationKernel.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_QUANTIZATIONKERNEL_H
+#define ARM_COMPUTE_CPU_QUANTIZATIONKERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the quantization layer kernel.
+ *
+ * @note The implementation supports only 3D input tensors
+ *
+ */
+class CpuQuantizationKernel : public ICpuKernel
+{
+public:
+    /** Default constructor */
+    CpuQuantizationKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuQuantizationKernel);
+    /** Set the input, output.
+     *
+     * @param[in]  src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] dst Destination tensor info with the same dimensions as the input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this kernel
+     */
+    void configure(ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuQuantizationKernel
+     *
+     * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[in] dst Output tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
+
+private:
+    /** Common signature for all the specialised @ref CpuQuantizationKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizationFunctionExecutorPtr = void (CpuQuantizationKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
+    /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename TIn, typename TOut>
+    void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window);
+    /** Function to apply QASYMM16 quantization on a tensor.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename T>
+    void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window);
+
+    QuantizationFunctionExecutorPtr _func;
+};
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_QUANTIZATIONKERNEL_H */
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 42eb12d..58ba687 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,27 +24,43 @@
 
 #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
 
-#include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/NEON/kernels/NEQuantizationLayerKernel.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/runtime/cpu/operators/CpuQuantization.h"
 
 namespace arm_compute
 {
+struct NEQuantizationLayer::Impl
+{
+    const ITensor                        *src{ nullptr };
+    ITensor                              *dst{ nullptr };
+    std::unique_ptr<cpu::CpuQuantization> op{ nullptr };
+};
+
+NEQuantizationLayer::NEQuantizationLayer()
+    : _impl(std::make_unique<Impl>())
+{
+}
+NEQuantizationLayer::~NEQuantizationLayer() = default;
+
 Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayerKernel::validate(input, output));
-
-    return Status{};
+    return cpu::CpuQuantization::validate(input, output);
 }
 
 void NEQuantizationLayer::configure(const ITensor *input, ITensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<cpu::CpuQuantization>();
+    _impl->op->configure(input->info(), output->info());
+}
 
-    // Configure quantize kernel
-    auto k = std::make_unique<NEQuantizationLayerKernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
+void NEQuantizationLayer::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuQuantization.cpp b/src/runtime/cpu/operators/CpuQuantization.cpp
new file mode 100644
index 0000000..ede1385
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuQuantization.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "src/runtime/cpu/operators/CpuQuantization.h"
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/core/cpu/kernels/CpuQuantizationKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+Status CpuQuantization::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuQuantizationKernel::validate(src, dst));
+    return Status{};
+}
+
+void CpuQuantization::configure(ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Configure quantize kernel
+    auto k = std::make_unique<kernels::CpuQuantizationKernel>();
+    k->configure(src, dst);
+    _kernel = std::move(k);
+}
+
+void CpuQuantization::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuQuantization.h b/src/runtime/cpu/operators/CpuQuantization.h
new file mode 100644
index 0000000..97f0c5f
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuQuantization.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_QUANTIZATION_H
+#define ARM_COMPUTE_CPU_QUANTIZATION_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+/** Basic function to simulate a quantization layer. This function calls the following Arm(R) Neon(TM) kernels:
+ *
+ *
+ * -# @ref kernels::CpuQuantizationKernel
+ *
+ */
+class CpuQuantization : public ICpuOperator
+{
+public:
+    /** Default Constructor */
+    CpuQuantization() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  src Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[out] dst Destination tensor info with the same dimensions as the input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
+     */
+    void configure(ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration of @ref CpuQuantization
+     *
+     * @param[in] src Input tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16.
+     * @param[in] dst Output tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_QUANTIZATION_H */
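
For completeness, the same operation at the new operator level, which is what NEQuantizationLayer::run()
drives internally in this patch (a sketch under the assumption that src and dst are already initialised
Tensor objects as in the earlier example; CpuQuantization.h is an internal header, so this mirrors
library-internal usage rather than the public API):

    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/Tensor.h"
    #include "src/runtime/cpu/operators/CpuQuantization.h"

    using namespace arm_compute;

    void quantize_with_operator(Tensor &src, Tensor &dst)
    {
        // The operator is configured against tensor metadata only; no tensors are captured.
        cpu::CpuQuantization op;
        op.configure(src.info(), dst.info());

        // Tensors are supplied at run time through an ITensorPack, mirroring
        // NEQuantizationLayer::run() above.
        ITensorPack pack;
        pack.add_tensor(TensorType::ACL_SRC, &src);
        pack.add_tensor(TensorType::ACL_DST, &dst);
        op.run(pack);
    }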