Rework CpuQuantizeKernel to enable FP16 in multi_isa builds

Resolves: COMPMID-7054
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: I68d125b81ad7f74b2594ccda8d6ec08beef1ebd7
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11555
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez Tello <pablo.tello@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/BUILD.bazel b/src/BUILD.bazel
index 2648a96..499e564 100644
--- a/src/BUILD.bazel
+++ b/src/BUILD.bazel
@@ -818,6 +818,10 @@
 	"cpu/kernels/pool3d/neon/fp32.cpp",
 	"cpu/kernels/pool3d/neon/qasymm8.cpp",
 	"cpu/kernels/pool3d/neon/qasymm8_signed.cpp",
+	"cpu/kernels/quantize/generic/neon/fp16.cpp",
+	"cpu/kernels/quantize/generic/neon/fp32.cpp",
+	"cpu/kernels/quantize/generic/neon/integer.cpp",
+	"cpu/kernels/quantize/generic/neon/vquantize.cpp",
 	"cpu/kernels/range/generic/neon/fp16.cpp",
 	"cpu/kernels/range/generic/neon/fp32.cpp",
 	"cpu/kernels/range/generic/neon/integer.cpp",
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index e47b5cb..8d63ab5 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -809,6 +809,10 @@
 	cpu/kernels/pool3d/neon/fp32.cpp
 	cpu/kernels/pool3d/neon/qasymm8.cpp
 	cpu/kernels/pool3d/neon/qasymm8_signed.cpp
+	cpu/kernels/quantize/generic/neon/fp16.cpp
+	cpu/kernels/quantize/generic/neon/fp32.cpp
+	cpu/kernels/quantize/generic/neon/integer.cpp
+	cpu/kernels/quantize/generic/neon/vquantize.cpp
 	cpu/kernels/range/generic/neon/fp16.cpp
 	cpu/kernels/range/generic/neon/fp32.cpp
 	cpu/kernels/range/generic/neon/integer.cpp
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp
index d2ac6cf..ed4675a 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.cpp
+++ b/src/cpu/kernels/CpuQuantizeKernel.cpp
@@ -29,12 +29,12 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
+#include "src/core/common/Registrars.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/quantize/generic/neon/list.h"
 
 #include <arm_neon.h>
 #include <map>
@@ -47,7 +47,6 @@
 {
 namespace
 {
-constexpr auto window_step = 16;
 
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 {
@@ -63,59 +62,6 @@
     return Status{};
 }
 
-template <typename T>
-inline float32x4x4_t load_value(const T *input_ptr)
-{
-    using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type;
-    return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr));
-}
-
-template <>
-inline float32x4x4_t load_value(const float *input_ptr)
-{
-    return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8),
-            wrapper::vloadq(input_ptr + 12)};
-}
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template <>
-inline float32x4x4_t load_value(const float16_t *input_ptr)
-{
-    return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
-            vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
-}
-
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-
-template <typename element_type>
-using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>;
-
-template <typename quantized_type>
-vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi);
-
-template <>
-vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
-    return vquantize(qv, qi);
-}
-
-template <>
-vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
-{
-    return vquantize_signed(qv, qi);
-}
-
-template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type>
-inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
-{
-    return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper));
-}
-
-template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type>
-inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
-{
-    return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper));
-}
-
 } // namespace
 
 void CpuQuantizeKernel::configure(const ITensorInfo *src, ITensorInfo *dst)
@@ -124,38 +70,36 @@
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
 
     static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = {
-        {"op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t>},
-        {"op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t>},
-        {"op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t>},
+        {"op_QASYMM8_QASYMM8", REGISTER_INTEGER_NEON(u8_u8_run_quantize_qasymm8)},
+        {"op_QASYMM8_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(u8_i8_run_quantize_qasymm8)},
+        {"op_QASYMM8_QASYMM16", REGISTER_INTEGER_NEON(u8_run_quantize_qasymm16)},
 
-        {"op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t>},
-        {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>},
-        {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>},
+        {"op_QASYMM8_SIGNED_QASYMM8", REGISTER_INTEGER_NEON(i8_u8_run_quantize_qasymm8)},
+        {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(i8_i8_run_quantize_qasymm8)},
+        {"op_QASYMM8_SIGNED_QASYMM16", REGISTER_INTEGER_NEON(i8_run_quantize_qasymm16)},
 
         // Functions for offset only requantization
-        {"op_OFFSET_ONLY_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, uint8_t>},
-        {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_requantize_offset_only<uint8_t, int8_t>},
-        {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_requantize_offset_only<int8_t, uint8_t>},
-        {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED",
-         &CpuQuantizeKernel::run_requantize_offset_only<int8_t, int8_t>},
+        {"op_OFFSET_ONLY_QASYMM8_QASYMM8", REGISTER_INTEGER_NEON(u8_u8_run_requantize_offset_only)},
+        {"op_OFFSET_ONLY_QASYMM8_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(u8_i8_run_requantize_offset_only)},
+        {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8", REGISTER_INTEGER_NEON(i8_u8_run_requantize_offset_only)},
+        {"op_OFFSET_ONLY_QASYMM8_SIGNED_QASYMM8_SIGNED", REGISTER_INTEGER_NEON(i8_i8_run_requantize_offset_only)},
 
         // Functions for offset uint8 to int8 and vice versa quantization (no scale changes)
         {"op_OFFSET_ONLY_CONVERT_QASYMM8_SIGNED_QASYMM8",
-         &CpuQuantizeKernel::run_requantize_offset_only_convert<int8_t, uint8_t>},
+         REGISTER_INTEGER_NEON(i8_u8_run_requantize_offset_only_convert)},
         {"op_OFFSET_ONLY_CONVERT_QASYMM8_QASYMM8_SIGNED",
-         &CpuQuantizeKernel::run_requantize_offset_only_convert<uint8_t, int8_t>},
+         REGISTER_INTEGER_NEON(u8_i8_run_requantize_offset_only_convert)},
 
-        {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>},
+        {"op_F32_QSYMM8", REGISTER_FP32_NEON(fp32_i8_run_quantize_qsymm8)},
+        {"op_F32_QASYMM8", REGISTER_FP32_NEON(fp32_u8_run_quantize_qasymm8)},
+        {"op_F32_QASYMM8_SIGNED", REGISTER_FP32_NEON(fp32_i8_run_quantize_qasymm8)},
+        {"op_F32_QASYMM16", REGISTER_FP32_NEON(fp32_run_quantize_qasymm16)},
 
-        {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>},
-        {"op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t>},
-        {"op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float>},
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        {"op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t>},
-        {"op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t>},
-        {"op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t>},
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
+#ifdef ARM_COMPUTE_ENABLE_FP16
+        {"op_F16_QASYMM8", REGISTER_FP16_NEON(fp16_u8_run_quantize_qasymm8)},
+        {"op_F16_QASYMM8_SIGNED", REGISTER_FP16_NEON(fp16_i8_run_quantize_qasymm8)},
+        {"op_F16_QASYMM16", REGISTER_FP16_NEON(fp16_run_quantize_qasymm16)},
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
     };
 
     std::string function_to_call("op_");
@@ -203,242 +147,6 @@
     return Status{};
 }
 
-template <typename TIn, typename TOut>
-void CpuQuantizeKernel::run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window)
-{
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
-    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    uqinfo                                  = compute_requantization_scale_offset(uqinfo_in, uqinfo);
-
-    // Collapse window and reset first dimension to handle tail calculations manually
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(src, win_collapsed);
-    Iterator output(dst, win_collapsed);
-    execute_window_loop(
-        win_collapsed,
-        [&](const Coordinates &)
-        {
-            auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
-            auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
-            int  x          = window_start_x;
-            for (; x <= (window_end_x - window_step); x += window_step)
-            {
-                wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
-            }
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-                output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
-            }
-        },
-        input, output);
-}
-
-template <typename TIn, typename TOut>
-void CpuQuantizeKernel::run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
-{
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    // Calculate output offset difference.
-    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
-    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    uqinfo                                  = compute_requantization_scale_offset(uqinfo_in, uqinfo);
-
-    // Collapse window and reset first dimension to handle tail calculations manually
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Duplicate offset in signed vector format
-    const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
-
-    Iterator input(src, win_collapsed);
-    Iterator output(dst, win_collapsed);
-    execute_window_loop(
-        win_collapsed,
-        [&](const Coordinates &)
-        {
-            auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
-            auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
-            int  x          = window_start_x;
-            for (; x <= (window_end_x - window_step); x += window_step)
-            {
-                const wrapper::traits::neon_vector_t<TIn, window_step> qv =
-                    wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
-
-                // Signed addition.
-                auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset);
-
-                // Output is dependent on datatype.
-                wrapper::vstore(&output_ptr[x],
-                                reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res));
-            }
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-                auto result   = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
-                output_ptr[x] = static_cast<TOut>(result);
-            }
-        },
-        input, output);
-}
-
-template <typename TIn, typename TOut>
-void CpuQuantizeKernel::run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
-{
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
-    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    uqinfo                                  = compute_requantization_scale_offset(uqinfo_in, uqinfo);
-
-    // Collapse window and reset first dimension to handle tail calculations manually
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    // Duplicate offset in signed vector format
-    const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
-
-    const int32_t low_bound   = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128;
-    const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 255 : 127;
-
-    Iterator input(src, win_collapsed);
-    Iterator output(dst, win_collapsed);
-    execute_window_loop(
-        win_collapsed,
-        [&](const Coordinates &)
-        {
-            auto  input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
-            TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr());
-
-            int x = window_start_x;
-            for (; x <= (window_end_x - window_step); x += window_step)
-            {
-                const auto qv    = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
-                int16x8_t  lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv)));
-                int16x8_t  upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv)));
-
-                // Signed addition.
-                lower = wrapper::vqadd(lower, offset);
-                upper = wrapper::vqadd(upper, offset);
-
-                // Output is dependent on datatype.
-                auto res = recombine_8_16<TOut>(lower, upper);
-                wrapper::vstore(&output_ptr[x], res);
-            }
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-                // Add offset and clamp result to within the range of the output datatype.
-                int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
-                result         = utility::clamp<int32_t>(result, low_bound, upper_bound);
-
-                // Cast result to output datatype.
-                output_ptr[x] = static_cast<TOut>(result);
-            }
-        },
-        input, output);
-}
-
-template <typename TIn, typename TOut>
-void CpuQuantizeKernel::run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
-{
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
-    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    if (is_data_type_quantized_asymmetric(src->info()->data_type()))
-    {
-        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
-    }
-#ifdef __aarch64__
-    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else  //__aarch64__
-    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
-    // Collapse window and reset first dimension to handle tail calculations manually
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(src, win_collapsed);
-    Iterator output(dst, win_collapsed);
-    execute_window_loop(
-        win_collapsed,
-        [&](const Coordinates &)
-        {
-            auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
-            auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
-
-            int x = window_start_x;
-            for (; x <= (window_end_x - window_step); x += window_step)
-            {
-                wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
-            }
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-                output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
-            }
-        },
-        input, output);
-}
-
-template <typename T>
-void CpuQuantizeKernel::run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
-{
-    const auto window_start_x = static_cast<int>(window.x().start());
-    const auto window_end_x   = static_cast<int>(window.x().end());
-
-    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
-    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    if (is_data_type_quantized_asymmetric(src->info()->data_type()))
-    {
-        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
-    }
-#ifdef __aarch64__
-    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
-#else  //__aarch64__
-    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
-#endif //__aarch64__
-
-    // Collapse window and reset first dimension to handle tail calculations manually
-    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
-    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
-
-    Iterator input(src, win_collapsed);
-    Iterator output(dst, win_collapsed);
-    execute_window_loop(
-        win_collapsed,
-        [&](const Coordinates &)
-        {
-            auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
-            auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
-
-            int x = window_start_x;
-            for (; x <= (window_end_x - window_step); x += window_step)
-            {
-                uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
-                vst1q_u16(&output_ptr[x], tmp.val[0]);
-                vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
-            }
-            // Compute left-over elements
-            for (; x < window_end_x; ++x)
-            {
-                output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
-            }
-        },
-        input, output);
-}
-
 void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
@@ -448,7 +156,7 @@
 
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
-    (this->*_func)(src, dst, window);
+    (*_func)(src, dst, window);
 }
 
 const char *CpuQuantizeKernel::name() const
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
index c2f7ac6..750310c 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.h
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -76,31 +76,7 @@
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src,
-                                                                    ITensor       *dst,
-                                                                    const Window  &window);
-    /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename TIn, typename TOut>
-    void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window);
-    /** Function to apply QASYMM16 quantization on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <typename T>
-    void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window);
-
-    template <typename TIn, typename TOut>
-    void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window);
-
-    template <typename TIn, typename TOut>
-    void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window);
-
-    template <typename TIn, typename TOut>
-    void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window);
-
+    using QuantizeFunctionExecutorPtr = void (*)(const ITensor *src, ITensor *dst, const Window &window);
     QuantizeFunctionExecutorPtr _func{nullptr};
     size_t                      _split_dimension{Window::DimY};
 };
diff --git a/src/cpu/kernels/quantize/generic/neon/fp16.cpp b/src/cpu/kernels/quantize/generic/neon/fp16.cpp
new file mode 100644
index 0000000..456a3bd
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/fp16.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/quantize/generic/neon/impl_fp16.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp16_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm8<float16_t, uint8_t>(src, dst, window);
+}
+void fp16_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm8<float16_t, int8_t>(src, dst, window);
+}
+void fp16_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm16<float16_t>(src, dst, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/quantize/generic/neon/fp32.cpp b/src/cpu/kernels/quantize/generic/neon/fp32.cpp
new file mode 100644
index 0000000..15f52b2
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/fp32.cpp
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/quantize/generic/neon/impl_fp32.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void fp32_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm8<float, uint8_t>(src, dst, window);
+}
+void fp32_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm8<float, int8_t>(src, dst, window);
+}
+void fp32_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm16<float>(src, dst, window);
+}
+
+void fp32_i8_run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qsymm8<float, int8_t>(src, dst, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/quantize/generic/neon/impl.h b/src/cpu/kernels/quantize/generic/neon/impl.h
new file mode 100644
index 0000000..1861fca
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/impl.h
@@ -0,0 +1,302 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H
+#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+constexpr auto window_step = 16;
+
+template <typename T>
+inline float32x4x4_t load_value(const T *input_ptr)
+{
+    using Tx16_t = typename wrapper::traits::neon_vector<T, 16>::type;
+    return arm_compute::convert_to_float32x4x4<Tx16_t>(wrapper::vloadq(input_ptr));
+}
+
+template <typename element_type>
+using vector_type = wrapper::traits::neon_vector_t<element_type, window_step>;
+
+template <typename quantized_type>
+vector_type<quantized_type> vquantize_qasymm8(const float32x4x4_t &qv, const UniformQuantizationInfo &qi);
+
+template <typename TOut, typename = typename std::enable_if<std::is_signed<TOut>::value, bool>::type>
+inline int8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{
+    return wrapper::vcombine(wrapper::vqmovn(lower), wrapper::vqmovn(upper));
+}
+
+template <typename TOut, typename = typename std::enable_if<std::is_unsigned<TOut>::value, bool>::type>
+inline uint8x16_t recombine_8_16(int16x8_t lower, int16x8_t upper)
+{
+    return wrapper::vcombine(wrapper::vqmovun(lower), wrapper::vqmovun(upper));
+}
+
+template <typename TIn, typename TOut>
+void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
+    uqinfo                                  = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
+            auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+            int  x          = window_start_x;
+            for (; x <= (window_end_x - window_step); x += window_step)
+            {
+                wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+            }
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
+            }
+        },
+        input, output);
+}
+
+template <typename TIn, typename TOut>
+void run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
+{
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    // Calculate output offset difference.
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
+    uqinfo                                  = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Duplicate offset in signed vector format
+    const int8x16_t offset = wrapper::vdup_n(static_cast<int8_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
+            auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+            int  x          = window_start_x;
+            for (; x <= (window_end_x - window_step); x += window_step)
+            {
+                const wrapper::traits::neon_vector_t<TIn, window_step> qv =
+                    wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+
+                // Signed addition.
+                auto res = vaddq_s8(reinterpret_cast<int8x16_t>(qv), offset);
+
+                // Output is dependent on datatype.
+                wrapper::vstore(&output_ptr[x],
+                                reinterpret_cast<wrapper::traits::neon_vector_t<TOut, window_step>>(res));
+            }
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                auto result   = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+                output_ptr[x] = static_cast<TOut>(result);
+            }
+        },
+        input, output);
+}
+
+template <typename TIn, typename TOut>
+void run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
+    uqinfo                                  = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Duplicate offset in signed vector format
+    const int16x8_t offset = wrapper::vdup_n(static_cast<int16_t>(uqinfo.offset), wrapper::traits::vector_128_tag{});
+
+    const int32_t low_bound   = (dst->info()->data_type() == DataType::QASYMM8) ? 0 : -128;
+    const int32_t upper_bound = (dst->info()->data_type() == DataType::QASYMM8) ? 255 : 127;
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            auto  input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
+            TOut *output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step); x += window_step)
+            {
+                const auto qv    = wrapper::vloadq(input_ptr + x); // load 128 bit vector of 8 bit datatype
+                int16x8_t  lower = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgetlow(qv)));
+                int16x8_t  upper = reinterpret_cast<int16x8_t>(wrapper::vmovl(wrapper::vgethigh(qv)));
+
+                // Signed addition.
+                lower = wrapper::vqadd(lower, offset);
+                upper = wrapper::vqadd(upper, offset);
+
+                // Output is dependent on datatype.
+                auto res = recombine_8_16<TOut>(lower, upper);
+                wrapper::vstore(&output_ptr[x], res);
+            }
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                // Add offset and clamp result to within the range of the output datatype.
+                int32_t result = uqinfo.offset + static_cast<int32_t>(input_ptr[x]);
+                result         = utility::clamp<int32_t>(result, low_bound, upper_bound);
+
+                // Cast result to output datatype.
+                output_ptr[x] = static_cast<TOut>(result);
+            }
+        },
+        input, output);
+}
+
+template <typename TIn, typename TOut>
+void run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
+    if (is_data_type_quantized_asymmetric(src->info()->data_type()))
+    {
+        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+    }
+#ifdef __aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else  //__aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
+            auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step); x += window_step)
+            {
+                wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+            }
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
+            }
+        },
+        input, output);
+}
+
+template <typename T>
+void run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
+{
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
+    UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
+    if (is_data_type_quantized_asymmetric(src->info()->data_type()))
+    {
+        uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
+    }
+#ifdef __aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else  //__aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(src, win_collapsed);
+    Iterator output(dst, win_collapsed);
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+            auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step); x += window_step)
+            {
+                uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
+                vst1q_u16(&output_ptr[x], tmp.val[0]);
+                vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
+            }
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
+            }
+        },
+        input, output);
+}
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_H
diff --git a/src/cpu/kernels/quantize/generic/neon/impl_fp16.h b/src/cpu/kernels/quantize/generic/neon/impl_fp16.h
new file mode 100644
index 0000000..47f1b90
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/impl_fp16.h
@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP16_H
+#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP16_H
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+inline float32x4x4_t load_value(const float16_t *input_ptr)
+{
+    return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+            vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
+}
+
+} // namespace cpu
+} // namespace arm_compute
+#include "impl.h"
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP16_H
diff --git a/src/cpu/kernels/quantize/generic/neon/impl_fp32.h b/src/cpu/kernels/quantize/generic/neon/impl_fp32.h
new file mode 100644
index 0000000..00ae242
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/impl_fp32.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP32_H
+#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP32_H
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+inline float32x4x4_t load_value(const float *input_ptr)
+{
+    return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8),
+            wrapper::vloadq(input_ptr + 12)};
+}
+
+} // namespace cpu
+} // namespace arm_compute
+
+#include "impl.h"
+#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_IMPL_FP32_H
diff --git a/src/cpu/kernels/quantize/generic/neon/integer.cpp b/src/cpu/kernels/quantize/generic/neon/integer.cpp
new file mode 100644
index 0000000..4e39afa
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/integer.cpp
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/quantize/generic/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+void u8_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm8<uint8_t, uint8_t>(src, dst, window);
+}
+void u8_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm8<uint8_t, int8_t>(src, dst, window);
+}
+void i8_u8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm8<int8_t, uint8_t>(src, dst, window);
+}
+void i8_i8_run_quantize_qasymm8(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm8<int8_t, int8_t>(src, dst, window);
+}
+
+void u8_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm16<uint8_t>(src, dst, window);
+}
+void i8_run_quantize_qasymm16(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_quantize_qasymm16<int8_t>(src, dst, window);
+}
+
+void u8_u8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_requantize_offset_only<uint8_t, uint8_t>(src, dst, window);
+}
+void u8_i8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_requantize_offset_only<uint8_t, int8_t>(src, dst, window);
+}
+void i8_u8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_requantize_offset_only<int8_t, uint8_t>(src, dst, window);
+}
+void i8_i8_run_requantize_offset_only(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_requantize_offset_only<int8_t, int8_t>(src, dst, window);
+}
+
+void i8_u8_run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_requantize_offset_only_convert<int8_t, uint8_t>(src, dst, window);
+}
+void u8_i8_run_requantize_offset_only_convert(const ITensor *src, ITensor *dst, const Window &window)
+{
+    run_requantize_offset_only_convert<uint8_t, int8_t>(src, dst, window);
+}
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/quantize/generic/neon/list.h b/src/cpu/kernels/quantize/generic/neon/list.h
new file mode 100644
index 0000000..c4fb104
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/list.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H
+#define ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+
+#define DECLARE_QUANTIZE_KERNEL(func_name) void func_name(const ITensor *src, ITensor *dst, const Window &window)
+
+DECLARE_QUANTIZE_KERNEL(u8_u8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(u8_i8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(i8_u8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(i8_i8_run_quantize_qasymm8);
+
+DECLARE_QUANTIZE_KERNEL(u8_u8_run_requantize_offset_only);
+DECLARE_QUANTIZE_KERNEL(u8_i8_run_requantize_offset_only);
+DECLARE_QUANTIZE_KERNEL(i8_u8_run_requantize_offset_only);
+DECLARE_QUANTIZE_KERNEL(i8_i8_run_requantize_offset_only);
+
+DECLARE_QUANTIZE_KERNEL(i8_u8_run_requantize_offset_only_convert);
+DECLARE_QUANTIZE_KERNEL(u8_i8_run_requantize_offset_only_convert);
+
+DECLARE_QUANTIZE_KERNEL(u8_run_quantize_qasymm16);
+DECLARE_QUANTIZE_KERNEL(i8_run_quantize_qasymm16);
+
+DECLARE_QUANTIZE_KERNEL(fp32_u8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(fp32_i8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(fp32_run_quantize_qasymm16);
+
+DECLARE_QUANTIZE_KERNEL(fp32_i8_run_quantize_qsymm8);
+
+DECLARE_QUANTIZE_KERNEL(fp16_u8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(fp16_i8_run_quantize_qasymm8);
+DECLARE_QUANTIZE_KERNEL(fp16_run_quantize_qasymm16);
+
+#undef DECLARE_QUANTIZE_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+#endif // ACL_SRC_CPU_KERNELS_QUANTIZE_GENERIC_NEON_LIST_H
diff --git a/src/cpu/kernels/quantize/generic/neon/vquantize.cpp b/src/cpu/kernels/quantize/generic/neon/vquantize.cpp
new file mode 100644
index 0000000..d40702b
--- /dev/null
+++ b/src/cpu/kernels/quantize/generic/neon/vquantize.cpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+template <>
+vector_type<uint8_t> vquantize_qasymm8<uint8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    return vquantize(qv, qi);
+}
+
+template <>
+vector_type<int8_t> vquantize_qasymm8<int8_t>(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    return vquantize_signed(qv, qi);
+}
+} // namespace cpu
+} // namespace arm_compute