COMPMID-2764: Add support for QASYMM8_SIGNED in NEConvolutionLayer.

Change-Id: I8fbbd2e399f48968337a60147098d04f27c2d1c0
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2402
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index e3661ee..cea8782 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -43,10 +43,6 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
 {
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
-                                                         DataType::U16, DataType::S16,
-                                                         DataType::U32, DataType::S32,
-                                                         DataType::F16, DataType::F32);
 
     // Validate configured output
     if(output->total_size() != 0)
diff --git a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
index a32f0bb..8418733 100644
--- a/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -269,6 +269,13 @@
     return out_s8;
 }
 
+template <typename T>
+struct VectorTyper // Bundles a scalar element type with the vector type derived from it.
+{
+    using stype = T; // Scalar type; used below as the pointee type when storing results.
+    using vtype = typename wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128>; // Vector type chosen by neon_bitvector_t for T at BitWidth::W128.
+};
+
 inline Window get_win_vector_sum(const Window &window)
 {
     Window win_vector_sum(window);
@@ -300,9 +307,10 @@
     return bias_it;
 }
 
-template <bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
+template <typename VT, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point>
 inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
-                                                        const int32x4_t result_offset_s32, const int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8,
+                                                        const int32x4_t result_offset_s32, const int32x4_t result_shift_s32,
+                                                        typename VT::vtype min_vec, typename VT::vtype max_vec,
                                                         int32_t a_offset, int32_t b_offset, int32_t k_offset,
                                                         int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound,
                                                         int window_step_x, int window_start_x, int window_end_x)
@@ -346,11 +354,13 @@
 
         if(is_fixed_point)
         {
-            vst1q_u8(out_it.ptr() + x, finalize_quantization<is_bounded_relu>(in_s32, multiplier, shift, result_offset_s32, min_u8, max_u8));
+            wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
+                            finalize_quantization<is_bounded_relu>(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec));
         }
         else
         {
-            vst1q_u8(out_it.ptr() + x, finalize_quantization_floating_point<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+            wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
+                            finalize_quantization_floating_point<is_bounded_relu>(in_s32, result_shift_s32, min_vec, max_vec));
         }
     }
     // Compute left-over elements
@@ -370,7 +380,9 @@
         if(is_fixed_point)
         {
             // Finalize and store the result
-            *(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, multiplier, shift, offset, static_cast<uint8_t>(min_bound), static_cast<uint8_t>(max_bound));
+            *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization<is_bounded_relu>(in_value, multiplier, shift, offset,
+                                                                                                               static_cast<typename VT::stype>(min_bound),
+                                                                                                               static_cast<typename VT::stype>(max_bound));
         }
         else
         {
@@ -380,9 +392,10 @@
             // Bound and store the result
             if(is_bounded_relu)
             {
-                in_value = static_cast<uint8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
+                in_value = static_cast<typename VT::stype>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
             }
-            *(out_it.ptr() + x) = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
+            *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = static_cast<typename VT::stype>(std::max<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()),
+                                                                                                                          std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value)));
         }
     }
 }
@@ -463,12 +476,15 @@
     }
 }
 
-template <bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
+template <typename T, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point>
 void run_offset_contribution_output_stage(const Window &window,
                                           const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
                                           int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col,
                                           GEMMLowpOutputStageInfo output_stage)
 {
+    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+    using Typer        = VectorTyper<T>;
+
     const int height_input = is_gemm3d ? mm_result->info()->dimension(1) : 0;
     const int depth_input  = is_gemm3d ? mm_result->info()->dimension(2) : 1;
 
@@ -478,10 +494,10 @@
     const int32_t min_bound  = output_stage.gemmlowp_min_bound;
     const int32_t max_bound  = output_stage.gemmlowp_max_bound;
 
-    const int32x4_t  result_offset_s32 = vdupq_n_s32(offset);
-    const int32x4_t  result_shift_s32  = vdupq_n_s32(is_fixed_point ? shift : -shift);
-    const uint8x16_t min_u8            = vdupq_n_u8(static_cast<uint8_t>(min_bound));
-    const uint8x16_t max_u8            = vdupq_n_u8(static_cast<uint8_t>(max_bound));
+    const int32x4_t result_offset_s32 = vdupq_n_s32(offset);
+    const int32x4_t result_shift_s32  = vdupq_n_s32(is_fixed_point ? shift : -shift);
+    const auto      min_vec           = wrapper::vdup_n(static_cast<T>(min_bound), ExactTagType{});
+    const auto      max_vec           = wrapper::vdup_n(static_cast<T>(max_bound), ExactTagType{});
 
     const int  window_step_x  = 16;
     const auto window_start_x = static_cast<int>(window.x().start());
@@ -517,11 +533,13 @@
                 const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
                 const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
                                                 + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
-                                                                                                               out_it,
-                                                                                                               result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
-                                                                                                               multiplier, shift, offset, min_bound, max_bound,
-                                                                                                               window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer, true, true, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
+                                                                                                                      mm_result_it,
+                                                                                                                      out_it,
+                                                                                                                      result_offset_s32, result_shift_s32,
+                                                                                                                      min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                                                                      multiplier, shift, offset, min_bound, max_bound,
+                                                                                                                      window_step_x, window_start_x, window_end_x);
             },
             vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
         }
@@ -533,10 +551,11 @@
                 const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
                 const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
                                                 + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
-                                                                                                                result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
-                                                                                                                multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer, true, true, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
+                                                                                                                       result_offset_s32, result_shift_s32,
+                                                                                                                       min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                                                                       multiplier, shift, offset, min_bound, max_bound,
+                                                                                                                       window_step_x, window_start_x, window_end_x);
             },
             vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
         }
@@ -557,10 +576,12 @@
                 const int  batch_id           = id.z() / depth_input;
                 const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
                                                 + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
-                                                                                                                result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
-                                                                                                                multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer, false, true, true, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+                                                                                                                       out_it,
+                                                                                                                       result_offset_s32, result_shift_s32,
+                                                                                                                       min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                                                                       multiplier, shift, offset, min_bound, max_bound,
+                                                                                                                       window_step_x, window_start_x, window_end_x);
             },
             vector_sum_row_it, bias_it, mm_result_it, out_it);
         }
@@ -571,10 +592,11 @@
                 const int  batch_id           = id.z() / depth_input;
                 const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
                                                 + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
-                                                                                                                 result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
-                                                                                                                 multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                 window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer, false, true, false, is_bounded_relu, is_fixed_point>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
+                                                                                                                        result_offset_s32, result_shift_s32,
+                                                                                                                        min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                                                                        multiplier, shift, offset, min_bound, max_bound,
+                                                                                                                        window_step_x, window_start_x, window_end_x);
             },
             vector_sum_row_it, mm_result_it, out_it);
         }
@@ -595,10 +617,12 @@
             {
                 const int  batch_id           = id.z() / depth_input;
                 const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
-                run_offset_contribution_output_stage_window<true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
-                                                                                                                result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
-                                                                                                                multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer, true, false, true, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+                                                                                                                       out_it,
+                                                                                                                       result_offset_s32, result_shift_s32,
+                                                                                                                       min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                                                                       multiplier, shift, offset, min_bound, max_bound,
+                                                                                                                       window_step_x, window_start_x, window_end_x);
             },
             vector_sum_col_it, bias_it, mm_result_it, out_it);
         }
@@ -608,10 +632,11 @@
             {
                 const int  batch_id           = id.z() / depth_input;
                 const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_batch_offset);
-                run_offset_contribution_output_stage_window<true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
-                                                                                                                 result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
-                                                                                                                 multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                 window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer, true, false, false, is_bounded_relu, is_fixed_point>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
+                                                                                                                        result_offset_s32, result_shift_s32,
+                                                                                                                        min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                                                                        multiplier, shift, offset, min_bound, max_bound,
+                                                                                                                        window_step_x, window_start_x, window_end_x);
             },
             vector_sum_col_it, mm_result_it, out_it);
         }
@@ -623,10 +648,11 @@
             Iterator bias_it = get_bias_it(collapsed_window, bias);
             execute_window_loop(collapsed_window, [&](const Coordinates &)
             {
-                run_offset_contribution_output_stage_window<false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
-                                                                                                                 result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
-                                                                                                                 multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                 window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer, false, false, true, is_bounded_relu, is_fixed_point>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+                                                                                                                        result_offset_s32, result_shift_s32,
+                                                                                                                        min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                                                                        multiplier, shift, offset, min_bound, max_bound,
+                                                                                                                        window_step_x, window_start_x, window_end_x);
             },
             bias_it, mm_result_it, out_it);
         }
@@ -634,10 +660,11 @@
         {
             execute_window_loop(collapsed_window, [&](const Coordinates &)
             {
-                run_offset_contribution_output_stage_window<false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it,
-                                                                                                                  result_offset_s32, result_shift_s32, min_u8, max_u8, a_offset, b_offset, k_offset,
-                                                                                                                  multiplier, shift, offset, min_bound, max_bound,
-                                                                                                                  window_step_x, window_start_x, window_end_x);
+                run_offset_contribution_output_stage_window<Typer, false, false, false, is_bounded_relu, is_fixed_point>(nullptr, nullptr, nullptr, mm_result_it, out_it,
+                                                                                                                         result_offset_s32, result_shift_s32,
+                                                                                                                         min_vec, max_vec, a_offset, b_offset, k_offset,
+                                                                                                                         multiplier, shift, offset, min_bound, max_bound,
+                                                                                                                         window_step_x, window_start_x, window_end_x);
             },
             mm_result_it, out_it);
         }
@@ -844,24 +871,36 @@
 NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction
 get_configured_function(const ITensor *mm_result, const ITensor *vector_sum_row, const ITensor *output, GEMMLowpOutputStageInfo output_stage)
 {
-    static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function =
+    static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function_qasymm =
     {
-        { 0, &run_offset_contribution_output_stage<false, false, false> },
-        { 1, &run_offset_contribution_output_stage<true, false, false> },
-        { 2, &run_offset_contribution_output_stage<false, true, false> },
-        { 3, &run_offset_contribution_output_stage<true, true, false> },
-        { 4, &run_offset_contribution_output_stage<false, false, true> },
-        { 5, &run_offset_contribution_output_stage<true, false, true> },
-        { 6, &run_offset_contribution_output_stage<false, true, true> },
-        { 7, &run_offset_contribution_output_stage<true, true, true> },
-        { 8, &run_offset_contribution_output_stage_symm<false, false, false> },
-        { 9, &run_offset_contribution_output_stage_symm<true, false, false> },
-        { 10, &run_offset_contribution_output_stage_symm<false, true, false> },
-        { 11, &run_offset_contribution_output_stage_symm<true, true, false> },
-        { 12, &run_offset_contribution_output_stage_symm<false, false, true> },
-        { 13, &run_offset_contribution_output_stage_symm<true, false, true> },
-        { 14, &run_offset_contribution_output_stage_symm<false, true, true> },
-        { 15, &run_offset_contribution_output_stage_symm<true, true, true> }
+        { 0, &run_offset_contribution_output_stage<uint8_t, false, false, false> },
+        { 1, &run_offset_contribution_output_stage<uint8_t, true, false, false> },
+        { 2, &run_offset_contribution_output_stage<uint8_t, false, true, false> },
+        { 3, &run_offset_contribution_output_stage<uint8_t, true, true, false> },
+        { 4, &run_offset_contribution_output_stage<uint8_t, false, false, true> },
+        { 5, &run_offset_contribution_output_stage<uint8_t, true, false, true> },
+        { 6, &run_offset_contribution_output_stage<uint8_t, false, true, true> },
+        { 7, &run_offset_contribution_output_stage<uint8_t, true, true, true> },
+        { 8, &run_offset_contribution_output_stage<int8_t, false, false, false> },
+        { 9, &run_offset_contribution_output_stage<int8_t, true, false, false> },
+        { 10, &run_offset_contribution_output_stage<int8_t, false, true, false> },
+        { 11, &run_offset_contribution_output_stage<int8_t, true, true, false> },
+        { 12, &run_offset_contribution_output_stage<int8_t, false, false, true> },
+        { 13, &run_offset_contribution_output_stage<int8_t, true, false, true> },
+        { 14, &run_offset_contribution_output_stage<int8_t, false, true, true> },
+        { 15, &run_offset_contribution_output_stage<int8_t, true, true, true> },
+    };
+
+    static std::map<uint8_t, NEGEMMLowpOffsetContributionOutputStageKernel::NEGEMMLowpOffsetContributionOutputStageFunction> map_function_qsymm =
+    {
+        { 0, &run_offset_contribution_output_stage_symm<false, false, false> },
+        { 1, &run_offset_contribution_output_stage_symm<true, false, false> },
+        { 2, &run_offset_contribution_output_stage_symm<false, true, false> },
+        { 3, &run_offset_contribution_output_stage_symm<true, true, false> },
+        { 4, &run_offset_contribution_output_stage_symm<false, false, true> },
+        { 5, &run_offset_contribution_output_stage_symm<true, false, true> },
+        { 6, &run_offset_contribution_output_stage_symm<false, true, true> },
+        { 7, &run_offset_contribution_output_stage_symm<true, true, true> }
     };
 
     // Check if input is a 3D reinterpretation
@@ -877,12 +916,23 @@
     const bool is_fixed_point = output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
 
-    // Check if symmetric per-channel execution
+    // Check if the output is signed asymmetric quantized (QASYMM8_SIGNED)
-    const bool is_symm = output->info()->data_type() == DataType::QASYMM8_SIGNED;
+    const bool is_signed = output->info()->data_type() == DataType::QASYMM8_SIGNED;
+
+    // Check if symmetric per-channel execution
+    const bool is_symm = output_stage.is_quantized_per_channel;
 
     // key acts as a bitset, setting the first bit on reinterpret_as_3d,
     // the second on is_bounded_relu, and the third on is_fixed_point.
-    uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2) | ((is_symm ? 1UL : 0UL) << 3);
-    return map_function.find(key)->second;
+    uint8_t key = (reinterpret_as_3d ? 1UL : 0UL) | ((is_bounded_relu ? 1UL : 0UL) << 1) | ((is_fixed_point ? 1UL : 0UL) << 2);
+    if(is_symm)
+    {
+        return map_function_qsymm.find(key)->second;
+    }
+    else
+    {
+        key |= ((is_signed ? 1UL : 0UL) << 3);
+        return map_function_qasymm.find(key)->second;
+    }
 }
 } // namespace
 
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 0641d6c..f57b94d 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -49,8 +49,8 @@
                           bool has_bias, const Size2D &dilation, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one are not supported on NEON");
 
@@ -382,6 +382,7 @@
                 _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, true> : &NEIm2ColKernel::run_im2col<float16_t, true, true>;
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+            case DataType::QASYMM8_SIGNED:
             case DataType::QASYMM8:
                 _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<qasymm8_t, false, true> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, true>;
                 break;
@@ -403,7 +404,10 @@
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::QASYMM8:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<qasymm8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<uint8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+                break;
+            case DataType::QASYMM8_SIGNED:
+                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<int8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
index 6493164..aa43ad5 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -49,7 +49,9 @@
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
 {
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1,
+                                                         DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
 
     if(biases != nullptr)
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index a730749..bb9620b 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -59,7 +59,9 @@
 Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1,
+                                                         DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
     if(biases != nullptr)
@@ -114,11 +116,12 @@
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
         // Extract and negate input and weights offset
-        const QuantizationInfo        iqinfo  = input->info()->quantization_info();
-        const QuantizationInfo        wqinfo  = weights->info()->quantization_info();
-        const QuantizationInfo        oqinfo  = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
-        const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
-        const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+        const QuantizationInfo        iqinfo    = input->info()->quantization_info();
+        const QuantizationInfo        wqinfo    = weights->info()->quantization_info();
+        const QuantizationInfo        oqinfo    = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
+        const UniformQuantizationInfo uiqinfo   = iqinfo.uniform();
+        const UniformQuantizationInfo uoqinfo   = oqinfo.uniform();
+        const DataType                data_type = input->info()->data_type();
 
         input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
         if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
@@ -128,23 +131,28 @@
         }
 
         // Merge activation with output stage
-        int min_activation = 0;
-        int max_activation = 255;
+        PixelValue type_min = 0;
+        PixelValue type_max = 0;
+        std::tie(type_min, type_max) = get_min_max(data_type);
+        int min_activation = type_min.get<int>();
+        int max_activation = type_max.get<int>();
 
         if(supported_acts.count(act_info.activation()) != 0)
         {
-            const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
-            const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
+            const bool is_quantized_signed = is_data_type_quantized_asymmetric_signed(data_type);
+            const int  a_const_int         = is_quantized_signed ? quantize_qasymm8_signed(act_info.a(), uoqinfo) : quantize_qasymm8(act_info.a(), uoqinfo);
+            const int  b_const_int         = is_quantized_signed ? quantize_qasymm8_signed(act_info.b(), uoqinfo) : quantize_qasymm8(act_info.b(), uoqinfo);
 
             min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
-            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? max_activation : a_const_int;
         }
 
         GEMMLowpOutputStageInfo output_info;
-        output_info.type               = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-        output_info.gemmlowp_offset    = uoqinfo.offset;
-        output_info.gemmlowp_min_bound = min_activation;
-        output_info.gemmlowp_max_bound = max_activation;
+        output_info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_info.gemmlowp_offset          = uoqinfo.offset;
+        output_info.gemmlowp_min_bound       = min_activation;
+        output_info.gemmlowp_max_bound       = max_activation;
+        output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);
         quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info);
 
         _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info));
@@ -163,8 +171,9 @@
 Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
                                            const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
 {
-    const bool is_quantized          = is_data_type_quantized_asymmetric(input->data_type());
-    const bool is_activation_enabled = act_info.enabled();
+    const DataType data_type             = input->data_type();
+    const bool     is_quantized          = is_data_type_quantized_asymmetric(data_type);
+    const bool     is_activation_enabled = act_info.enabled();
 
     // Create GEMMInfo structure
     const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
@@ -181,8 +190,11 @@
         const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
 
         // Merge activation with output stage
-        int min_activation = 0;
-        int max_activation = 255;
+        PixelValue type_min = 0;
+        PixelValue type_max = 0;
+        std::tie(type_min, type_max) = get_min_max(data_type);
+        int min_activation = type_min.get<int>();
+        int max_activation = type_max.get<int>();
 
         const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
                                                                                    ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
@@ -190,18 +202,20 @@
                                                                                  };
         if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
         {
-            const int a_const_int = quantize_qasymm8(act_info.a(), uoqinfo);
-            const int b_const_int = quantize_qasymm8(act_info.b(), uoqinfo);
+            const bool is_quantized_signed = is_data_type_quantized_asymmetric_signed(data_type);
+            const int  a_const_int         = is_quantized_signed ? quantize_qasymm8_signed(act_info.a(), uoqinfo) : quantize_qasymm8(act_info.a(), uoqinfo);
+            const int  b_const_int         = is_quantized_signed ? quantize_qasymm8_signed(act_info.b(), uoqinfo) : quantize_qasymm8(act_info.b(), uoqinfo);
 
             min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? uoqinfo.offset : b_const_int;
-            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? 255 : a_const_int;
+            max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? max_activation : a_const_int;
         }
 
         GEMMLowpOutputStageInfo output_info;
-        output_info.type               = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
-        output_info.gemmlowp_offset    = uoqinfo.offset;
-        output_info.gemmlowp_min_bound = min_activation;
-        output_info.gemmlowp_max_bound = max_activation;
+        output_info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+        output_info.gemmlowp_offset          = uoqinfo.offset;
+        output_info.gemmlowp_min_bound       = min_activation;
+        output_info.gemmlowp_max_bound       = max_activation;
+        output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
         ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers_less_than_one(iqinfo, wqinfo, oqinfo, output_info));
 
         // Perform validation step on GEMMLowp
@@ -387,8 +401,8 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported on NEON");
 
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 5b9d055..e36cb3d 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -280,9 +280,9 @@
 
 Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QSYMM8_PER_CHANNEL);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
                                     "The product AB is defined only if the number of columns in A is equal to the number of rows in B");