COMPMID-2699: Add support for QASYMM16 in NEQuantizationLayer
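
QASYMM16 is 16-bit unsigned asymmetric quantization: each float x maps to
clamp(round(x / scale + offset), 0, 65535) and is stored as a uint16_t. This
patch adds a vquantize_qasymm16 helper to NEAsymm.h, a templated
run_quantize_qasymm16 path to NEQuantizationLayerKernel (selected once at
configure() time through a pointer-to-member map keyed on the output data
type), and validation plus test coverage for F32/F16 inputs.

As a worked example with the QuantizationInfo(0.5f, 10) used by the new
tests: x = 1.0f quantizes to round(1.0 / 0.5) + 10 = 12.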

Change-Id: Icb968e37551a9048040e9aaff5329e874c53a2ee
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2016
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index f2d20d3..56d4c09 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -331,6 +331,40 @@
     const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])));
     return vcombine_u8(pa, pb);
 }
+
+/** Quantize a neon vector holding 16 floating point values to QASYMM16.
+ *
+ * @param[in] qv Input values to be quantized.
+ * @param[in] qi Quantization information to be used in the computation.
+ *
+ * @return A neon vector holding the quantized values.
+ */
+inline uint16x8x2_t vquantize_qasymm16(const float32x4x4_t &qv, const UniformQuantizationInfo &qi)
+{
+    const float       scale     = qi.scale;
+    const int         offset    = qi.offset;
+    const float32x4_t voffset   = vdupq_n_f32(offset);
+    const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
+    const int32x4x4_t rf =
+    {
+        {
+#ifdef __aarch64__
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#else  //__aarch64__
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[2], vinvscale)),
+            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[3], vinvscale)),
+#endif //__aarch64__
+        }
+    };
+    const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
+    const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
+    return { pa, pb };
+}
 } // namespace arm_compute
 #include "arm_compute/core/NEON/NEAsymm.inl"
 #endif // __ARM_COMPUTE_NEASYMM_H__
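
For reference, the vquantize_qasymm16 helper above is element-wise equivalent
to the scalar sketch below. This is illustrative only, not library code:
UQInfo and quantize_one_qasymm16 are hypothetical stand-ins, and the rounding
matches the AArch64 path (the AArch32 path truncates toward zero instead).

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Hypothetical stand-in for arm_compute::UniformQuantizationInfo.
    struct UQInfo
    {
        float scale;
        int   offset;
    };

    // Scalar model of one lane: x * (1/scale) + offset, rounded to nearest
    // with ties to even (std::nearbyint under the default FP environment,
    // matching vcvtnq_s32_f32), then saturated to [0, 65535] the way
    // vqmovun_s32 saturates each 32-bit lane to unsigned 16 bits.
    inline uint16_t quantize_one_qasymm16(float x, const UQInfo &qi)
    {
        const auto q = static_cast<int32_t>(std::nearbyint(x / qi.scale + qi.offset));
        return static_cast<uint16_t>(std::min<int32_t>(std::max<int32_t>(q, 0), 65535));
    }
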
diff --git a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
index 391a72c..e1aaad5 100644
--- a/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEQuantizationLayerKernel.h
@@ -57,13 +57,15 @@
     /** Set the input, output.
      *
      * @param[in]  input  Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: F32/F16.
-     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8.
+     * @param[out] output Destination tensor with the same dimensions as input. Data types supported: QASYMM8/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this kernel
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEQuantizationLayerKernel
      *
      * @param[in] input  Input tensor info. Data types supported: F32/F16.
-     * @param[in] output Output tensor info. Data types supported: QASYMM8.
+     * @param[in] output Output tensor info. Data types supported: QASYMM8/QASYMM16.
      *
      * @return a status
      */
@@ -73,11 +75,28 @@
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
+    /** Common signature for all the specialised @ref NEQuantizationLayerKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizationFunctionExecutorPtr = void (NEQuantizationLayerKernel::*)(const Window &window);
+    /** Function to apply QASYMM8 quantization on a tensor.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
     template <typename T>
-    void quantize(const Window &window, const QuantizationInfo &qinfo);
+    void run_quantize_qasymm8(const Window &window);
+    /** Function to apply QASYMM16 quantization on a tensor.
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <typename T>
+    void run_quantize_qasymm16(const Window &window);
 
     const ITensor *_input;
     ITensor       *_output;
+
+    QuantizationFunctionExecutorPtr _func;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEQUANTIZATIONLAYERKERNEL_H__ */
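
The QuantizationFunctionExecutorPtr alias above enables a pointer-to-member
dispatch: configure() selects the right specialisation once, and run() makes
a single indirect call. A minimal standalone sketch of the pattern (Kernel
here is a hypothetical type, not the library class):

    #include <iostream>

    struct Kernel
    {
        using FuncPtr = void (Kernel::*)(int);

        void run_a(int w) { std::cout << "a(" << w << ")\n"; }
        void run_b(int w) { std::cout << "b(" << w << ")\n"; }

        // Pick the specialisation up front...
        void configure(bool use_b) { _func = use_b ? &Kernel::run_b : &Kernel::run_a; }
        // ...so each run() is one indirect call, with no per-run switch.
        void run(int w) { (this->*_func)(w); }

        FuncPtr _func = nullptr;
    };

    int main()
    {
        Kernel k;
        k.configure(true);
        k.run(42); // prints "b(42)"
    }
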
diff --git a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
index 46a62bd..2560932 100644
--- a/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQuantizationLayer.h
@@ -49,13 +49,13 @@
     /** Set the input and output tensors.
      *
      * @param[in]  input  Source tensor. The dimensions over the third will be interpreted as batches. Data types supported: F32/F16.
-     * @param[out] output Destination tensor with the same dimensions of input. Data types supported: QASYMM8/QSYMM16
+     * @param[out] output Destination tensor with the same dimensions as input. Data types supported: QASYMM8/QASYMM16
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEQuantizationLayer
      *
      * @param[in] input  Input tensor info. The dimensions over the third will be interpreted as batches. Data types supported: F32/F16.
-     * @param[in] output Output tensor info. Data types supported: QASYMM8/QSYMM16
+     * @param[in] output Output tensor info. Data types supported: QASYMM8/QASYMM16
      *
      * @return a status
      */
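
A minimal usage sketch for the updated function (shapes and quantization
parameters are illustrative, and this assumes the usual arm_compute runtime
setup):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/NEON/functions/NEQuantizationLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    int main()
    {
        // F32 source, QASYMM16 destination with scale 0.5 and offset 10
        // (the same QuantizationInfo the new tests use).
        Tensor src{}, dst{};
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::QASYMM16,
                                         QuantizationInfo(0.5f, 10)));

        NEQuantizationLayer quant{};
        quant.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src, then:
        quant.run();
        return 0;
    }
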
diff --git a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
index 0aa34cd..6a9c4ae 100644
--- a/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEQuantizationLayerKernel.cpp
@@ -34,9 +34,10 @@
 #include "arm_compute/core/CPP/Validate.h"
 
 #include <arm_neon.h>
+#include <map>
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
@@ -45,7 +46,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM16);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
 
     return Status{};
@@ -71,7 +72,7 @@
 } // namespace
 
 NEQuantizationLayerKernel::NEQuantizationLayerKernel()
-    : _input(nullptr), _output(nullptr)
+    : _input(nullptr), _output(nullptr), _func(nullptr)
 {
 }
 
@@ -83,6 +84,33 @@
     _input  = input;
     _output = output;
 
+    static std::map<DataType, QuantizationFunctionExecutorPtr> quant_map_f32 =
+    {
+        { DataType::QASYMM8, &NEQuantizationLayerKernel::run_quantize_qasymm8<float> },
+        { DataType::QASYMM16, &NEQuantizationLayerKernel::run_quantize_qasymm16<float> },
+    };
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    static std::map<DataType, QuantizationFunctionExecutorPtr> quant_map_f16 =
+    {
+        { DataType::QASYMM8, &NEQuantizationLayerKernel::run_quantize_qasymm8<float16_t> },
+        { DataType::QASYMM16, &NEQuantizationLayerKernel::run_quantize_qasymm16<float16_t> },
+    };
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC*/
+
+    switch(input->info()->data_type())
+    {
+        case DataType::F32:
+            _func = quant_map_f32[output->info()->data_type()];
+            break;
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+        case DataType::F16:
+            _func = quant_map_f16[output->info()->data_type()];
+            break;
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+        default:
+            ARM_COMPUTE_ERROR("Unsupported input data type.");
+    }
+
     // Configure kernel window
     Window win_config = calculate_max_window(*input->info(), Steps());
 
@@ -96,18 +124,17 @@
 Status NEQuantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-
     return Status{};
 }
 
 template <typename T>
-void NEQuantizationLayerKernel::quantize(const Window &window, const QuantizationInfo &qinfo)
+void NEQuantizationLayerKernel::run_quantize_qasymm8(const Window &window)
 {
     constexpr auto window_step    = 16;
     const auto     window_start_x = static_cast<int>(window.x().start());
     const auto     window_end_x   = static_cast<int>(window.x().end());
 
-    const UniformQuantizationInfo uqinfo = qinfo.uniform();
+    const UniformQuantizationInfo uqinfo = _output->info()->quantization_info().uniform();
 #ifdef __aarch64__
     constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
 #else  //__aarch64__
@@ -139,25 +166,54 @@
     input, output);
 }
 
+template <typename T>
+void NEQuantizationLayerKernel::run_quantize_qasymm16(const Window &window)
+{
+    constexpr auto window_step    = 16;
+    const auto     window_start_x = static_cast<int>(window.x().start());
+    const auto     window_end_x   = static_cast<int>(window.x().end());
+
+    const UniformQuantizationInfo uqinfo = _output->info()->quantization_info().uniform();
+#ifdef __aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_NEAREST_EVEN;
+#else  //__aarch64__
+    constexpr RoundingPolicy rounding_policy = RoundingPolicy::TO_ZERO;
+#endif //__aarch64__
+
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    Iterator input(_input, win_collapsed);
+    Iterator output(_output, win_collapsed);
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+        auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step); x += window_step)
+        {
+            uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
+            vst1q_u16(&output_ptr[x], tmp.val[0]);
+            vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
+        }
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
+        }
+    },
+    input, output);
+}
+
 void NEQuantizationLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    const QuantizationInfo &qinfo = _output->info()->quantization_info();
-
-    switch(_input->info()->data_type())
-    {
-        case DataType::F32:
-            NEQuantizationLayerKernel::quantize<float>(window, qinfo);
-            break;
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        case DataType::F16:
-            NEQuantizationLayerKernel::quantize<float16_t>(window, qinfo);
-            break;
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        default:
-            ARM_COMPUTE_ERROR("Unsupported data type.");
-    }
+    (this->*_func)(window);
 }
+} // namespace arm_compute
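
Note the per-architecture rounding split kept by both quantize paths: AArch64
converts with vcvtnq_s32_f32 (round to nearest, ties to even) while AArch32
falls back to vcvtq_s32_f32 (truncate toward zero), so the two can differ by
one for the same input; the tests below absorb this with an absolute
tolerance of 1. A standalone illustration of the difference:

    #include <cmath>
    #include <cstdio>

    int main()
    {
        // The default FP environment rounds to nearest-even, so nearbyint
        // models vcvtnq_s32_f32; trunc models the AArch32 vcvtq_s32_f32.
        const float v = 3.7f;
        std::printf("nearest-even: %d\n", static_cast<int>(std::nearbyint(v))); // 4
        std::printf("to-zero     : %d\n", static_cast<int>(std::trunc(v)));     // 3
        return 0;
    }
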
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index 65873b1..4464978 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -27,8 +27,8 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 
-using namespace arm_compute;
-
+namespace arm_compute
+{
 Status NEQuantizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -46,3 +46,4 @@
     k->configure(input, output);
     _kernel = std::move(k);
 }
+} // namespace arm_compute
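
NEQuantizationLayer itself remains a thin wrapper: configure() instantiates
and configures the kernel, and the base class's run() forwards to it. A
stripped-down sketch of that pattern (IKernel, QuantizeKernel and
SimpleFunction are hypothetical stand-ins, not the library's types):

    #include <iostream>
    #include <memory>
    #include <utility>

    struct IKernel
    {
        virtual ~IKernel() = default;
        virtual void run() = 0;
    };

    struct QuantizeKernel : IKernel
    {
        void run() override { std::cout << "quantize tensor\n"; }
    };

    struct SimpleFunction
    {
        void configure()
        {
            auto k = std::make_unique<QuantizeKernel>();
            // In the library, k->configure(input, output) happens here.
            _kernel = std::move(k);
        }
        void run() { _kernel->run(); }

        std::unique_ptr<IKernel> _kernel{};
    };

    int main()
    {
        SimpleFunction f;
        f.configure();
        f.run();
    }
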
diff --git a/tests/validation/NEON/QuantizationLayer.cpp b/tests/validation/NEON/QuantizationLayer.cpp
index 8d19c93..49118f7 100644
--- a/tests/validation/NEON/QuantizationLayer.cpp
+++ b/tests/validation/NEON/QuantizationLayer.cpp
@@ -43,7 +43,8 @@
 namespace
 {
 /** Tolerance for quantization */
-constexpr AbsoluteTolerance<uint8_t> tolerance_u8(1);
+constexpr AbsoluteTolerance<uint8_t>  tolerance_u8(1);
+constexpr AbsoluteTolerance<uint16_t> tolerance_u16(1);
 
 const auto QuantizationSmallShapes = concat(datasets::Small3DShapes(), datasets::Small4DShapes());
 const auto QuantizationLargeShapes = concat(datasets::Large3DShapes(), datasets::Large4DShapes());
@@ -98,6 +99,8 @@
 
 template <typename T>
 using NEQuantizationLayerQASYMM8Fixture = QuantizationValidationFixture<Tensor, Accessor, NEQuantizationLayer, T, uint8_t>;
+template <typename T>
+using NEQuantizationLayerQASYMM16Fixture = QuantizationValidationFixture<Tensor, Accessor, NEQuantizationLayer, T, uint16_t>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
@@ -109,6 +112,14 @@
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallQASYMM16, NEQuantizationLayerQASYMM16Fixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(QuantizationSmallShapes,
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM16 })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u16);
+}
 FIXTURE_DATA_TEST_CASE(RunLargeQASYMM8, NEQuantizationLayerQASYMM8Fixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(QuantizationLargeShapes,
                        framework::dataset::make("DataType", DataType::F32)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
@@ -117,10 +128,16 @@
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunLargeQASYMM16, NEQuantizationLayerQASYMM16Fixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(QuantizationLargeShapes,
+                       framework::dataset::make("DataType", DataType::F32)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM16 })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u16);
+}
 TEST_SUITE_END() // FP32
-TEST_SUITE_END() // Float
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-TEST_SUITE(Half)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmallQASYMM8, NEQuantizationLayerQASYMM8Fixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(QuantizationSmallShapes,
                        framework::dataset::make("DataType", DataType::F16)),
@@ -130,6 +147,14 @@
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunSmallQASYMM16, NEQuantizationLayerQASYMM16Fixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(QuantizationSmallShapes,
+                       framework::dataset::make("DataType", DataType::F16)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM16 })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u16);
+}
 FIXTURE_DATA_TEST_CASE(RunLargeQASYMM8, NEQuantizationLayerQASYMM8Fixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(QuantizationLargeShapes,
                        framework::dataset::make("DataType", DataType::F16)),
                        framework::dataset::make("DataTypeOut", { DataType::QASYMM8 })),
@@ -138,9 +163,17 @@
     // Validate output
     validate(Accessor(_target), _reference, tolerance_u8);
 }
+FIXTURE_DATA_TEST_CASE(RunLargeQASYMM16, NEQuantizationLayerQASYMM16Fixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(QuantizationLargeShapes,
+                       framework::dataset::make("DataType", DataType::F16)),
+                       framework::dataset::make("DataTypeOut", { DataType::QASYMM16 })),
+                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(0.5f, 10) })))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_u16);
+}
 TEST_SUITE_END() // FP16
-TEST_SUITE_END() // Half
 #endif           //  __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE_END() // Float
 
 TEST_SUITE_END() // QuantizationLayer
 TEST_SUITE_END() // NEON