COMPMID-2235: Extend type support for CL/NEON DequantizationLayer.

Adds support for:
- QSYMM8
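
QSYMM8 stores signed 8-bit values in a range symmetric around zero, so
it carries only a scale and no zero-point; its dequantization drops the
offset term that QASYMM8 requires. A minimal scalar sketch of the two
mappings (hypothetical helper names, not the library API; the kernels
below use the library's dequantize() helpers and their vectorized
vdequantize() counterparts):

    #include <cstdint>

    // QASYMM8: real = scale * (q - offset), q is unsigned
    float dq_qasymm8(uint8_t q, float scale, int32_t offset)
    {
        return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
    }

    // QSYMM8: real = scale * q, q is signed and the offset is implicitly 0
    float dq_qsymm8(int8_t q, float scale)
    {
        return scale * static_cast<float>(q);
    }

The CL kernel reuses a single code path for both schemes by passing
OFFSET=0 for QSYMM8, while the NEON kernel dispatches to per-type loops.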

Change-Id: Ia0b839fc844ce0f968dad1b69a001f9a660dbcd5
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1378
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index f4ceca8..2e6ceb4 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -37,11 +37,11 @@
     switch(dt)
     {
         case DataType::U8:
-            return "uchar";
-        case DataType::S8:
-            return "char";
         case DataType::QASYMM8:
             return "uchar";
+        case DataType::S8:
+        case DataType::QSYMM8:
+            return "char";
         case DataType::U16:
             return "ushort";
         case DataType::S16:
@@ -69,11 +69,11 @@
     switch(dt)
     {
         case DataType::U8:
-            return "uchar";
-        case DataType::S8:
-            return "char";
         case DataType::QASYMM8:
             return "uchar";
+        case DataType::S8:
+        case DataType::QSYMM8:
+            return "char";
         case DataType::U16:
             return "ushort";
         case DataType::F16:
@@ -100,6 +100,7 @@
     {
         case DataType::U8:
         case DataType::S8:
+        case DataType::QSYMM8:
         case DataType::QASYMM8:
             return "8";
         case DataType::U16:
@@ -241,6 +242,7 @@
         case DataType::U8:
         case DataType::S8:
         case DataType::QASYMM8:
+        case DataType::QSYMM8:
             return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR>();
         case DataType::U16:
         case DataType::S16:
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
index 7307700..ad3ed35 100644
--- a/src/core/CL/cl_kernels/dequantization_layer.cl
+++ b/src/core/CL/cl_kernels/dequantization_layer.cl
@@ -23,16 +23,17 @@
  */
 #include "helpers.h"
 
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
 
-/** This performs the dequantization of 8-bit unsigned integers to floating point.
+/** This performs the dequantization of 8-bit quantized integers to floating point.
  *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
  * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
  * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
  * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
  *
- * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8
+ * @param[in]  input_ptr                            Pointer to the source tensor. Supported data types: QASYMM8/QSYMM8
  * @param[in]  input_stride_x                       Stride of the source tensor in X dimension (in bytes)
  * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
  * @param[in]  input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
@@ -66,7 +67,7 @@
 
     // Load data
     VEC_DATA_TYPE(int, VEC_SIZE)
-    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+    val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
 
     // Create scale and offset vectors
     const VEC_DATA_TYPE(float, VEC_SIZE)
@@ -81,10 +82,10 @@
 
     // Store result
     VSTORE(VEC_SIZE)
-    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr);
+    (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
 #else  // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X)
-    *((__global DATA_TYPE *)(output.ptr)) = (DATA_TYPE)((float)((int)(*((__global uchar *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
+    *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
 #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X)
 }
 
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET)
\ No newline at end of file
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
index 0b06683..e383bc4 100644
--- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp
@@ -40,7 +40,7 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8);
 
     if(output->tensor_shape().total_size() > 0)
     {
@@ -95,15 +95,20 @@
     }
     ICLKernel::configure_internal(win);
 
-    const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
+    const UniformQuantizationInfo qinfo   = input->info()->quantization_info().uniform();
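+    // Symmetric quantization (QSYMM8) has no zero-point, so pass a zero offset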
+    const int                     qoffset = is_data_type_quantized_asymmetric(input->info()->data_type()) ? qinfo.offset : 0;
 
-    // Create kernel
+    // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
-    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset));
+    build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(output->info()->data_type()));
     build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+
+    // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dequantization_layer", build_opts.options()));
 }
 
diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
index a6dc097..bf0a2ca 100644
--- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp
@@ -42,7 +42,7 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8);
 
     if(output->tensor_shape().total_size() > 0)
     {
@@ -95,9 +95,11 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <typename T>
-void run_dequantization(const ITensor *input, ITensor *output, const Window &window)
+void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window)
 {
-    const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
+    const UniformQuantizationInfo &qinfo  = input->info()->quantization_info().uniform();
+    const float                    scale  = qinfo.scale;
+    const int32_t                  offset = qinfo.offset;
 
     const int  window_step_x  = 16;
     const auto window_start_x = static_cast<int>(window.x().start());
@@ -120,7 +122,7 @@
         for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize(vin, qinfo);
+            const auto vdeq = vdequantize(vin, scale, offset);
 
             store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
         }
@@ -129,11 +131,71 @@
         for(; x < window_end_x; ++x)
         {
             uint8_t val    = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<T>(dequantize_qasymm8(val, qinfo));
+            *(out_ptr + x) = static_cast<T>(dequantize(val, scale, offset));
         }
     },
     in, out);
 }
+
+template <typename T>
+void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window)
+{
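+    // QSYMM8 is symmetric: dequantization is scale * value, with no offset term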
+    const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform();
+    const float                    scale = qinfo.scale;
+
+    const int  window_step_x  = 16;
+    const auto window_start_x = static_cast<int>(window.x().start());
+    const auto window_end_x   = static_cast<int>(window.x().end());
+
+    // Collapse window and reset first dimension to handle tail calculations manually
+    Window win_collapsed = window.collapse_if_possible(window, Window::DimZ);
+    win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+    // Create iterators
+    Iterator in(input, win_collapsed);
+    Iterator out(output, win_collapsed);
+
+    execute_window_loop(win_collapsed, [&](const Coordinates &)
+    {
+        const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
+        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+        int x = window_start_x;
+        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        {
+            const auto vin  = wrapper::vloadq(in_ptr + x);
+            const auto vdeq = vdequantize(vin, scale);
+
+            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+        }
+
+        // Compute left-over elements
+        for(; x < window_end_x; ++x)
+        {
+            int8_t val     = *(in_ptr + x);
+            *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
+        }
+    },
+    in, out);
+}
+
+template <typename T>
+void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
+{
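+    // Dispatch on the quantized input type; T is the destination float type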
+    switch(input->info()->data_type())
+    {
+        case DataType::QASYMM8:
+            run_dequantization_qasymm8<T>(input, output, window);
+            break;
+        case DataType::QSYMM8:
+            run_dequantization_qsymm8<T>(input, output, window);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type.");
+    }
+}
 } // namespace
 
 NEDequantizationLayerKernel::NEDequantizationLayerKernel()
@@ -173,11 +235,11 @@
     switch(_output->info()->data_type())
     {
         case DataType::F32:
-            run_dequantization<float>(_input, _output, window);
+            run_dequantization_core<float>(_input, _output, window);
             break;
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            run_dequantization<float16_t>(_input, _output, window);
+            run_dequantization_core<float16_t>(_input, _output, window);
             break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 22be000..499a6c8 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -158,6 +158,8 @@
         { DataType::F32, "F32" },
         { DataType::F64, "F64" },
         { DataType::SIZET, "SIZET" },
+        { DataType::QSYMM8, "QSYMM8" },
+        { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" },
         { DataType::QASYMM8, "QASYMM8" },
         { DataType::QSYMM16, "QSYMM16" },
     };
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index 63aa1ba..f3f16cd 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -87,11 +87,12 @@
     clear_quantization_arrays(scale, offset);
 
     // Create scale array
-    const size_t num_elements = qinfo.scale.size();
-    const size_t element_size = sizeof(decltype(qinfo.scale)::value_type);
-    scale                     = CLFloatArray(num_elements + pad_size);
+    const std::vector<float> &qscale       = qinfo.scale();
+    const size_t              num_elements = qscale.size();
+    const size_t              element_size = sizeof(std::remove_reference<decltype(qscale)>::type::value_type);
+    scale                                  = CLFloatArray(num_elements + pad_size);
     scale.resize(num_elements);
-    CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale.data());
+    CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data());
 }
 } // namespace