COMPMID-2309 : CLConvolutionLayer: support QUANT8_SYMM_PER_CHANNEL filters

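Add test coverage for QSYMM8_PER_CHANNEL weights: a per-channel quantized
fixture and test suite for CLGEMMConvolutionLayer, a DataType dimension on
the fused-offset-output GEMMLowp tests (CL and NEON), per-channel aware
tensor filling and weight quantization info in GEMMLowpFixture.h, vectors
of multipliers/shifts in the GEMMLowp reference so the output stage can
requantize each output column with its own scale, and adjusted
CLWeightsReshape validate cases.

For context, the fixture turns each per-channel float scale into a
fixed-point multiplier and right shift via
quantization::calculate_quantized_multiplier_less_than_one. A rough,
self-contained sketch of that conversion, for illustration only (the
function name below is made up and the rounding only approximates the
library helper):

    #include <cmath>
    #include <cstdint>
    #include <utility>

    // Express a real multiplier in (0, 1) as quantized_multiplier * 2^-31 * 2^-right_shift.
    std::pair<int32_t, int32_t> quantize_multiplier_smaller_than_one(float multiplier)
    {
        int exponent = 0;
        const double q = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
        int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
        if(q_fixed == (1ll << 31)) // q rounded up to 1.0: renormalise
        {
            q_fixed /= 2;
            ++exponent;
        }
        return { static_cast<int32_t>(q_fixed), -exponent }; // exponent <= 0 for multiplier < 1, so -exponent is a right shift
    }
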
Change-Id: I16f6758b768ede404a064db057302ded706e1e8a
Signed-off-by: Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2215
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index f1f9b59..9eb6c6d 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -271,6 +271,8 @@
 
 template <typename T>
 using CLGEMMConvolutionLayerQuantizedFixture = ConvolutionValidationQuantizedFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T>;
+template <typename T>
+using CLGEMMConvolutionLayerQuantizedPerChannelFixture = ConvolutionValidationQuantizedPerChannelFixture<CLTensor, CLAccessor, CLGEMMConvolutionLayer, T, int8_t>;
 
 const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
 {
@@ -285,7 +287,6 @@
 });
 
 TEST_SUITE(Quantized)
-TEST_SUITE(QASYMM8)
 
 const auto QuantizationData = framework::dataset::make("QuantizationInfo",
 {
@@ -293,6 +294,7 @@
     QuantizationInfo(0.3f, 3),
     QuantizationInfo(1.f, 10),
 });
+TEST_SUITE(QASYMM8)
 
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
                        combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerReducedDataset(),
@@ -317,6 +319,33 @@
     validate(CLAccessor(_target), _reference, tolerance_qasymm8);
 }
 TEST_SUITE_END() // QASYMM8
+TEST_SUITE(QSYMM8_PER_CHANNEL)
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerReducedDataset(),
+                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                               framework::dataset::make("DataType", { DataType::QASYMM8 })),
+                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                               QuantizationData),
+                                       QuantizedActivationFunctionsSmallDataset),
+                               framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerQuantizedPerChannelFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       combine(combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+                                                                       framework::dataset::make("ReshapeWeights", { true })),
+                                                               framework::dataset::make("DataType", { DataType::QASYMM8 })),
+                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                               QuantizationData),
+                                       QuantizedActivationFunctionsDataset),
+                               framework::dataset::make("WeightsDataType", { DataType::QSYMM8_PER_CHANNEL })))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, tolerance_qasymm8);
+}
+TEST_SUITE_END() // QSYMM8_PER_CHANNEL
 TEST_SUITE_END() // Quantized
 
 TEST_SUITE_END() // GEMMConvolutionLayer
diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp
index f5bd871..39543b1 100644
--- a/tests/validation/CL/GEMMLowp.cpp
+++ b/tests/validation/CL/GEMMLowp.cpp
@@ -86,13 +86,15 @@
 
 using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore>;
 TEST_SUITE(FusedOffsetOutput)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpFusedOffsetOutputDataset())
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputDataset(),
+                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpFusedOffsetOutputDataset())
+FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpFusedOffsetOutputDataset(),
+                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
@@ -305,13 +307,17 @@
                                                                          2)
                                                                          * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
 
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823, 1073741825) * framework::dataset::make("result_shift", -3,
-                                                                    -2)
-                                                                    * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true });
+const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823,
+                                                                                                        1073741825)
+                                                                               * framework::dataset::make("result_shift", -3,
+                                                                                                          -2)
+                                                                               * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true });
 
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", -3,
-                                                                         -1)
-                                                                         * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
+const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600,
+                                                                                                             254601602)
+                                                                                    * framework::dataset::make("result_shift", -3,
+                                                                                                               -1)
+                                                                                    * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
 
 using CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture =
     GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<CLTensor, CLAccessor, CLGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
diff --git a/tests/validation/CL/WeightsReshape.cpp b/tests/validation/CL/WeightsReshape.cpp
index 30c231d..47cb975 100644
--- a/tests/validation/CL/WeightsReshape.cpp
+++ b/tests/validation/CL/WeightsReshape.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,22 +47,19 @@
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::U8),      // Unsupported data type
-                                                       TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),     // Mismatching data type
+               framework::dataset::make("InputInfo", { TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),     // Mismatching data type
                                                        TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::QASYMM8), // Bias not supported with QASYMM8
                                                        TensorInfo(TensorShape(3U, 3U, 2U, 4U), 1, DataType::F32),
                                                      }),
-               framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(4U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(4U), 1, DataType::F16),
+               framework::dataset::make("BiasesInfo", { TensorInfo(TensorShape(4U), 1, DataType::F16),
                                                         TensorInfo(TensorShape(4U), 1, DataType::QASYMM8),
                                                         TensorInfo(TensorShape(4U), 1, DataType::F32),
                                                       })),
-               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(4U, 19U), 1, DataType::U8),
-                                                        TensorInfo(TensorShape(4U, 19U), 1, DataType::F16),
+               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(4U, 19U), 1, DataType::F16),
                                                         TensorInfo(TensorShape(4U, 19U), 1, DataType::QASYMM8),
                                                         TensorInfo(TensorShape(4U, 19U), 1, DataType::F32),
                                                       })),
-               framework::dataset::make("Expected", { false, false, false, true })),
+               framework::dataset::make("Expected", { false, false, true })),
                input_info, biases_info, output_info, expected)
 {
     bool status = bool(CLWeightsReshape::validate(&input_info, &biases_info, &output_info));
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index d79374e..b79523d 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -147,13 +147,15 @@
 
 using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
 TEST_SUITE(FusedOffsetOutput)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpFusedOffsetOutputDataset())
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputDataset(),
+                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpFusedOffsetOutputDataset())
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpFusedOffsetOutputDataset(),
+                       framework::dataset::make("DataType", { DataType::QASYMM8 })))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -417,13 +419,17 @@
 const auto quantize_down_int32_to_int16_scale_by_fixedpoint_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", 1,
                                                                          2)
                                                                          * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823, 1073741825) * framework::dataset::make("result_shift", -3,
-                                                                    -2)
-                                                                    * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true });
+const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_cases = framework::dataset::make("result_fixedpoint_multiplier", 1073741823,
+                                                                                                        1073741825)
+                                                                               * framework::dataset::make("result_shift", -3,
+                                                                                                          -2)
+                                                                               * framework::dataset::make("min", 0) * framework::dataset::make("max", 0) * framework::dataset::make("addBias", { false, true });
 
-const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600, 254601602) * framework::dataset::make("result_shift", -3,
-                                                                         -1)
-                                                                         * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
+const auto quantize_down_int32_to_int16_scale_by_fixedpoint_multgreat1_relu_cases = framework::dataset::make("result_fixedpoint_multiplier", 254601600,
+                                                                                                             254601602)
+                                                                                    * framework::dataset::make("result_shift", -3,
+                                                                                                               -1)
+                                                                                    * framework::dataset::make("min", -2, 0) * framework::dataset::make("max", 1, 3) * framework::dataset::make("addBias", { false, true });
 
 using NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointFixture =
     GEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPointValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToInt16ScaleByFixedPoint>;
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index 8385221..5d092ec 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "tests/AssetsLibrary.h"
 #include "tests/Globals.h"
 #include "tests/IAccessor.h"
@@ -47,23 +48,66 @@
 template <typename U>
 void fill(U &&tensor, int i)
 {
-    // Between 1 and 254 in order to avoid having -128 and 128 for the DOT product path
-    std::uniform_int_distribution<> distribution(1, 254);
-    library->fill(tensor, distribution, i);
+    switch(tensor.data_type())
+    {
+        case DataType::QSYMM8_PER_CHANNEL:
+        {
+            int min_bound = 128;
+            int max_bound = -127;
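+            // Take the widest [min, max] bounds across all channels so a single distribution can fill the whole tensor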
+            for(size_t j = 0; j < tensor.quantization_info().scale().size(); j++)
+            {
+                std::pair<int, int> bounds = get_symm_quantized_per_channel_bounds(tensor.quantization_info(), -1.0f, 1.0f, j);
+                if(bounds.first < min_bound)
+                {
+                    min_bound = bounds.first;
+                }
+                if(bounds.second > max_bound)
+                {
+                    max_bound = bounds.second;
+                }
+            }
+            std::uniform_int_distribution<int8_t> distribution(min_bound, max_bound);
+            library->fill(tensor, distribution, i);
+            break;
+        }
+        case DataType::QASYMM8:
+        {
+            // Between 1 and 254 in order to avoid having -128 and 128 for the DOT product path
+            std::uniform_int_distribution<uint8_t> distribution(1, 254);
+            library->fill(tensor, distribution, i);
+            break;
+        }
+        case DataType::F16:
+        case DataType::F32:
+        {
+            // Floating-point values in [-1, 1]
+            std::uniform_real_distribution<> distribution(-1.0f, 1.0f);
+            library->fill(tensor, distribution, i);
+            break;
+        }
+        default:
+            library->fill_tensor_uniform(tensor, i);
+    }
 }
 
 template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d, bool reinterpret_output_as_3d, typename OutputType, bool is_fused = false>
 TensorType compute_gemmlowp_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
-                                   GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo())
+                                   GEMMLowpOutputStageInfo output_stage = GEMMLowpOutputStageInfo(), DataType data_type_b = DataType::QASYMM8, QuantizationInfo b_qinfo = QuantizationInfo())
 {
     // Create tensors
     TensorType a      = create_tensor<TensorType>(shape_a, DataType::QASYMM8, 1);
-    TensorType b      = create_tensor<TensorType>(shape_b, DataType::QASYMM8, 1);
+    TensorType b      = create_tensor<TensorType>(shape_b, data_type_b, 1); // Note: the GEMM output before the output stage mismatches if data_layout_output is passed here; to be investigated
     TensorType output = create_tensor<TensorType>(shape_output, output_stage.type == GEMMLowpOutputStageType::NONE ? DataType::S32 : DataType::QASYMM8, 1);
 
     a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset));
-    b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset));
 
+    if(data_type_b == DataType::QSYMM8_PER_CHANNEL)
+    {
+        b.info()->set_quantization_info(b_qinfo);
+    }
+    else
+    {
+        b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset));
+    }
     TensorType bias;
     if(is_fused)
     {
@@ -101,14 +145,14 @@
         ARM_COMPUTE_EXPECT(!bias.info()->is_resizable(), framework::LogLevel::ERRORS);
         fill(AccessorType(bias), 2);
     }
-
     // Compute GEMM function
     gemmlowp.run();
     return output;
 }
 
-template <bool        reinterpret_input_as_3d>
-SimpleTensor<int32_t> compute_gemmlowp_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
+template <bool        reinterpret_input_as_3d, typename TW = uint8_t>
+SimpleTensor<int32_t> compute_gemmlowp_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
+                                                 DataType data_type_b = DataType::QASYMM8, QuantizationInfo b_qinfo = QuantizationInfo())
 {
     TensorShape shape_a_to_use = shape_a;
     if(reinterpret_input_as_3d)
@@ -119,13 +163,12 @@
 
     // Create reference
     SimpleTensor<uint8_t> a{ shape_a_to_use, DataType::QASYMM8, 1 };
-    SimpleTensor<uint8_t> b{ shape_b, DataType::QASYMM8, 1 };
+    SimpleTensor<TW>      b{ shape_b, data_type_b, 1, data_type_b == DataType::QSYMM8_PER_CHANNEL ? b_qinfo : QuantizationInfo(1.0f / 255, b_offset) };
 
     // Fill reference
     fill(a, 0);
     fill(b, 1);
-
-    return reference::gemmlowp_matrix_multiply_core<int32_t, uint8_t>(a, b, shape_output, a_offset, b_offset);
+    return reference::gemmlowp_matrix_multiply_core<int32_t, uint8_t, TW>(a, b, shape_output, a_offset, b_offset);
 }
 }
 
@@ -155,29 +198,50 @@
     SimpleTensor<int32_t> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false>
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TW = uint8_t>
 class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture : public framework::Fixture
 {
 public:
     template <typename...>
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage, DataType data_type_b)
     {
         ARM_COMPUTE_EXPECT(output_stage.type != GEMMLowpOutputStageType::NONE, framework::LogLevel::ERRORS);
-        _target    = compute_target(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage);
-        _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage);
+        if(data_type_b == DataType::QSYMM8_PER_CHANNEL)
+        {
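+            // Per-channel case: draw one random scale per output channel (column of b) and convert each into a fixed-point multiplier and shift for the output stage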
+            output_stage.is_quantized_per_channel         = true;
+            const size_t                     num_channels = shape_b[0];
+            std::vector<float>               scales(num_channels);
+            std::uniform_real_distribution<> distribution(0, 1);
+            library->fill(scales, distribution, 0);
+            output_stage.gemmlowp_multipliers.resize(num_channels);
+            output_stage.gemmlowp_shifts.resize(num_channels);
+            for(size_t i = 0; i < num_channels; ++i)
+            {
+                quantization::calculate_quantized_multiplier_less_than_one(scales[i], &output_stage.gemmlowp_multipliers[i], &output_stage.gemmlowp_shifts[i]);
+            }
+
+            _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, 0, output_stage, data_type_b, QuantizationInfo(scales));
+            _target    = compute_target(shape_a, shape_b, shape_output, a_offset, 0, output_stage, data_type_b, QuantizationInfo(scales));
+        }
+        else
+        {
+            _reference = compute_reference(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage, data_type_b, QuantizationInfo());
+            _target    = compute_target(shape_a, shape_b, shape_output, a_offset, b_offset, output_stage, data_type_b, QuantizationInfo());
+        }
     }
 
 protected:
-    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+    TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage,
+                              DataType data_type_b, QuantizationInfo b_qinfo)
     {
         return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, qasymm8_t, true>(shape_a, shape_b, shape_output, a_offset, b_offset,
-                output_stage);
+                output_stage, data_type_b, b_qinfo);
     }
 
     SimpleTensor<qasymm8_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
-                                              GEMMLowpOutputStageInfo output_stage)
+                                              GEMMLowpOutputStageInfo output_stage, DataType data_type_b, QuantizationInfo b_qinfo)
     {
-        SimpleTensor<int32_t> output = compute_gemmlowp_reference<reinterpret_input_as_3d>(shape_a, shape_b, shape_output, a_offset, b_offset);
+        SimpleTensor<int32_t> output = compute_gemmlowp_reference<reinterpret_input_as_3d, TW>(shape_a, shape_b, shape_output, a_offset, b_offset, data_type_b, b_qinfo);
 
         TensorShape           bias_shape(shape_b[0]);
         SimpleTensor<int32_t> bias{ bias_shape, DataType::S32, 1 };
@@ -187,11 +251,11 @@
         {
             case GEMMLowpOutputStageType::QUANTIZE_DOWN:
                 return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(output, bias,
-                                                                                       output_stage.gemmlowp_offset, output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound);
+                                                                                       output_stage.gemmlowp_offset, output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound);
                 break;
             case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
                 return reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint<int32_t>(output, bias,
-                                                                                                     output_stage.gemmlowp_multiplier, output_stage.gemmlowp_shift, output_stage.gemmlowp_offset, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound);
+                                                                                                     output_stage.gemmlowp_multipliers, output_stage.gemmlowp_shifts, output_stage.gemmlowp_offset, output_stage.gemmlowp_min_bound, output_stage.gemmlowp_max_bound);
                 break;
             default:
                 ARM_COMPUTE_ERROR("Not Supported!");
@@ -276,16 +340,19 @@
         // Fill reference
         fill(a, 0);
 
+        const std::vector<int32_t> result_mult_int_vec = { result_mult_int };
+        const std::vector<int32_t> result_shift_vec    = { result_shift };
+
         if(add_bias)
         {
             // Fill bias
             fill(b, 1);
 
-            return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(a, b, result_offset, result_mult_int, result_shift, min, max);
+            return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(a, b, result_offset, result_mult_int_vec, result_shift_vec, min, max);
         }
         else
         {
-            return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(a, result_offset, result_mult_int, result_shift, min, max);
+            return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(a, result_offset, result_mult_int_vec, result_shift_vec, min, max);
         }
     }
 
@@ -368,16 +435,19 @@
         // Fill reference
         fill(a, 0);
 
+        const std::vector<int32_t> result_fixed_point_multiplier_vec = { result_fixed_point_multiplier };
+        const std::vector<int32_t> result_shift_vec                  = { result_shift };
+
         if(add_bias)
         {
             // Fill bias
             fill(b, 1);
 
-            return reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint<int32_t>(a, b, result_fixed_point_multiplier, result_shift, result_offset_after_shift, min, max);
+            return reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint<int32_t>(a, b, result_fixed_point_multiplier_vec, result_shift_vec, result_offset_after_shift, min, max);
         }
         else
         {
-            return reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint<int32_t>(a, result_fixed_point_multiplier, result_shift, result_offset_after_shift, min, max);
+            return reference::gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint<int32_t>(a, result_fixed_point_multiplier_vec, result_shift_vec, result_offset_after_shift, min, max);
         }
     }
 
diff --git a/tests/validation/reference/GEMMLowp.cpp b/tests/validation/reference/GEMMLowp.cpp
index 4283cb5..08be4a5 100644
--- a/tests/validation/reference/GEMMLowp.cpp
+++ b/tests/validation/reference/GEMMLowp.cpp
@@ -39,10 +39,11 @@
 namespace
 {
 template <typename T>
-void quantize_down_int32_to_uint8_scale(const SimpleTensor<T> *in, const SimpleTensor<T> *bias, SimpleTensor<uint8_t> *dst, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
-                                        int32_t min, int32_t max)
+void quantize_down_int32_to_uint8_scale(const SimpleTensor<T> *in, const SimpleTensor<T> *bias, SimpleTensor<uint8_t> *dst, int32_t result_offset, std::vector<int32_t> result_mult_int,
+                                        std::vector<int32_t> result_shift, int32_t min, int32_t max)
 {
-    const int cols_in = in->shape().x();
+    const int  cols_in        = in->shape().x();
+    const bool is_per_channel = result_mult_int.size() > 1;
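+    // A single entry applies to the whole tensor; multiple entries give one multiplier/shift per output column (channel)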
 
     for(int i = 0; i < in->num_elements(); ++i)
     {
@@ -53,9 +54,9 @@
             result += (*bias)[i % cols_in];
         }
 
-        result *= result_mult_int;
+        result *= (is_per_channel) ? result_mult_int[i % cols_in] : result_mult_int[0];
 
-        result >>= result_shift;
+        result >>= (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];
 
         // Bounded ReLu
         if(min != max)
@@ -68,10 +69,11 @@
 }
 
 template <typename T>
-void quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> *in, const SimpleTensor<T> *bias, SimpleTensor<uint8_t> *dst, int32_t result_fixedpoint_multiplier, int32_t result_shift,
-                                                      int32_t result_offset_after_shift, int32_t min, int32_t max)
+void quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> *in, const SimpleTensor<T> *bias, SimpleTensor<uint8_t> *dst, std::vector<int32_t> result_fixedpoint_multiplier,
+                                                      std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
 {
-    const int cols_in = in->shape().x();
+    const int  cols_in        = in->shape().x();
+    const bool is_per_channel = result_fixedpoint_multiplier.size() > 1;
 
     for(int i = 0; i < in->num_elements(); ++i)
     {
@@ -83,7 +85,10 @@
         }
 
         // Fixed point multiplication
-        result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, result_fixedpoint_multiplier), result_shift);
+        const int32_t multiplier = (is_per_channel) ? result_fixedpoint_multiplier[i % cols_in] : result_fixedpoint_multiplier[0];
+        const int32_t shift      = (is_per_channel) ? result_shift[i % cols_in] : result_shift[0];
+
+        result = asymm_rounding_divide_by_pow2(asymm_int_mult(result, multiplier), shift);
         result += result_offset_after_shift;
 
         // Bounded ReLu
@@ -132,8 +137,8 @@
 }
 } // namespace
 
-template <typename T_out, typename T_in>
-SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, const SimpleTensor<T_in> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
+template <typename T_out, typename T_in, typename T_in_1>
+SimpleTensor<T_out> gemmlowp_matrix_multiply_core(const SimpleTensor<T_in> &a, const SimpleTensor<T_in_1> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset)
 {
     static_assert(std::is_same<typename std::decay<T_out>::type, int32_t>::value, "Only int32_t is allowed for the output");
 
@@ -186,14 +191,15 @@
 }
 
 // used to validate assembly kernels which don't know anything about offsets
-template <typename T1, typename T2>
-SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, TensorShape shape_c)
+template <typename T1, typename T2, typename T3>
+SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c)
 {
-    return gemmlowp_matrix_multiply_core<T1, T2>(a, b, shape_c, 0, 0);
+    return gemmlowp_matrix_multiply_core<T1, T2, T3>(a, b, shape_c, 0, 0);
 }
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max)
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, std::vector<int32_t> result_mult_int, std::vector<int32_t> result_shift,
+                                                                  int32_t min, int32_t max)
 {
     SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
 
@@ -203,8 +209,8 @@
 }
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
-                                                                  int32_t min, int32_t max)
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, std::vector<int32_t> result_mult_int,
+                                                                  std::vector<int32_t> result_shift, int32_t min, int32_t max)
 {
     SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
 
@@ -214,9 +220,8 @@
 }
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, int32_t result_fixedpoint_multiplier, int32_t result_shift,
-                                                                                int32_t result_offset_after_shift, int32_t min,
-                                                                                int32_t max)
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, std::vector<int32_t> result_fixedpoint_multiplier, std::vector<int32_t> result_shift,
+                                                                                int32_t result_offset_after_shift, int32_t min, int32_t max)
 {
     SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
 
@@ -226,8 +231,8 @@
 }
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_fixedpoint_multiplier, int32_t result_shift,
-                                                                                int32_t result_offset_after_shift, int32_t min, int32_t max)
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, std::vector<int32_t> result_fixedpoint_multiplier,
+                                                                                std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max)
 {
     SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
 
@@ -258,22 +263,24 @@
     return dst;
 }
 
-template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, int32_t result_fixedpoint_multiplier, int32_t result_shift,
-                                                                                         int32_t result_offset_after_shift, int32_t min, int32_t max);
-template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_fixedpoint_multiplier,
-                                                                                         int32_t result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, std::vector<int32_t> result_fixedpoint_multiplier,
+                                                                                         std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b,
+                                                                                         std::vector<int32_t> result_fixedpoint_multiplier,
+                                                                                         std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min, int32_t max);
 template SimpleTensor<int16_t> gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, int32_t result_fixedpoint_multiplier, int32_t result_shift,
                                                                                          int32_t min, int32_t max);
 template SimpleTensor<int16_t> gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_fixedpoint_multiplier,
                                                                                          int32_t result_shift, int32_t min, int32_t max);
-template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min,
-                                                                           int32_t max);
-template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, int32_t result_mult_int,
-                                                                           int32_t result_shift, int32_t min, int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, std::vector<int32_t> result_mult_int,
+                                                                           std::vector<int32_t> result_shift, int32_t min, int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, std::vector<int32_t> result_mult_int,
+                                                                           std::vector<int32_t> result_shift, int32_t min, int32_t max);
 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
-template SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
-template SimpleTensor<int32_t> gemmlowp(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c);
+template SimpleTensor<int32_t> gemmlowp<int32_t, int8_t, int8_t>(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
+template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, uint8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, TensorShape shape_c);
+template SimpleTensor<int32_t> gemmlowp<int32_t, uint8_t, int8_t>(const SimpleTensor<uint8_t> &a, const SimpleTensor<int8_t> &b, TensorShape shape_c);
 } // namespace reference
 } // namespace validation
 } // namespace test
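
For reference, the per-channel output stage above is the usual gemmlowp
requantization applied with the multiplier and shift of the element's output
column. A minimal illustrative sketch for a non-negative right shift (the
function names are not the library's; the INT32_MIN * INT32_MIN saturation
corner case and negative/left shifts are omitted):

    #include <algorithm>
    #include <cstdint>

    // Rounding doubling high multiply: round((a * b * 2) / 2^32).
    int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int64_t nudge = (ab >= 0) ? (1ll << 30) : (1 - (1ll << 30));
        return static_cast<int32_t>((ab + nudge) / (1ll << 31));
    }

    // Rounding arithmetic right shift by a non-negative exponent.
    int32_t rounding_divide_by_pow2(int32_t x, int32_t exponent)
    {
        const int32_t mask      = (1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
        return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
    }

    // Requantize one int32 accumulator using its channel's multiplier and shift.
    uint8_t quantize_down_per_channel(int32_t acc, int32_t multiplier, int32_t shift, int32_t offset)
    {
        int32_t v = rounding_divide_by_pow2(rounding_doubling_high_mul(acc, multiplier), shift);
        v += offset;
        return static_cast<uint8_t>(std::max(0, std::min(255, v)));
    }
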
diff --git a/tests/validation/reference/GEMMLowp.h b/tests/validation/reference/GEMMLowp.h
index 5581f67..815527e 100644
--- a/tests/validation/reference/GEMMLowp.h
+++ b/tests/validation/reference/GEMMLowp.h
@@ -35,31 +35,32 @@
 {
 namespace reference
 {
-template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min = 0, int32_t max = 0);
-template <typename T1, typename T2>
-SimpleTensor<T1> gemmlowp_matrix_multiply_core(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
+template <typename T1, typename T2, typename T3>
+SimpleTensor<T1> gemmlowp_matrix_multiply_core(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c, int32_t a_offset, int32_t b_offset);
+
+template <typename T1, typename T2, typename T3 = T2>
+SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T3> &b, TensorShape shape_c);
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift);
-
-template <typename T1, typename T2>
-SimpleTensor<T1> gemmlowp(const SimpleTensor<T2> &a, const SimpleTensor<T2> &b, TensorShape shape_c);
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, std::vector<int32_t> result_mult_int, std::vector<int32_t> result_shift);
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, std::vector<int32_t> result_mult_int, std::vector<int32_t> result_shift,
                                                                   int32_t min = 0, int32_t max = 0);
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, int32_t result_fixedpoint_multiplier, int32_t result_shift,
-                                                                                int32_t result_offset_after_shift,
-                                                                                int32_t min = 0, int32_t max = 0);
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, std::vector<int32_t> result_mult_int,
+                                                                  std::vector<int32_t> result_shift, int32_t min = 0, int32_t max = 0);
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_fixedpoint_multiplier, int32_t result_shift,
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, std::vector<int32_t> result_fixedpoint_multiplier, std::vector<int32_t> result_shift,
                                                                                 int32_t result_offset_after_shift, int32_t min = 0, int32_t max = 0);
 
 template <typename T>
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale_by_fixedpoint(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, std::vector<int32_t> result_fixedpoint_multiplier,
+                                                                                std::vector<int32_t> result_shift, int32_t result_offset_after_shift, int32_t min = 0, int32_t max = 0);
+
+template <typename T>
 SimpleTensor<int16_t> gemmlowp_quantize_down_int32_to_int16_scale_by_fixedpoint(const SimpleTensor<T> &in, int32_t result_fixedpoint_multiplier, int32_t result_shift,
                                                                                 int32_t min, int32_t max);
 template <typename T>