Fix GemmLowp BatchMatMul Tests to use quantized Outputs

- Fix includes int8/uint8 quantized inputs
- Bias S32 value is limited to better allow detection of mismatches in gemmlowp kernel

Resolves: [COMPMID-5659]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: Ie9cca430c6ab66253fe1d5252bd2c5396c7f38cf
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8514
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h b/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h
index b87b2fa..8c90efc 100644
--- a/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h
+++ b/tests/datasets/GEMMLowpFusedOffsetOutputDataset.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2019-2022 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -179,6 +179,38 @@
     }
 };
 
+class SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned final : public GEMMLowpFusedOffsetOutputDataset
+{
+public:
+    SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned()
+    {
+        add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 5, 1 << 25, 5, 0, 254));
+        add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 1 << 25, 3, 0, 254));
+        add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U, 15U), -3, 15, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 1 << 19, 0, 20, 210));
+        add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U, 17U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -30, 2, 1 << 25, 14, 210));
+        add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U, 4U, 3U), -5, 12, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -20, 1 << 25, 4, 0, 127));
+        add_config(TensorShape(15U, 7U, 3U), TensorShape(29U, 15U, 3U), TensorShape(29U, 7U, 3U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -10, 1 << 25, 6, 10, 210));
+        add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U, 17U, 32U), -3, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 3, 10, 210));
+        add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U, 256U, 32U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 6, 50, 225));
+    }
+};
+
+class SmallGEMMLowpFusedBatchedMatMulDatasetSigned final : public GEMMLowpFusedOffsetOutputDataset
+{
+public:
+    SmallGEMMLowpFusedBatchedMatMulDatasetSigned()
+    {
+        add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 5, 1 << 25, 5, -128, 127));
+        add_config(TensorShape(4U, 3U), TensorShape(2U, 4U), TensorShape(2U, 3U), 0, 0, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 100, 1 << 25, 3, -128, 127));
+        add_config(TensorShape(12U, 15U), TensorShape(7U, 12U), TensorShape(7U, 15U), -3, 15, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, 0, 1 << 19, 0, -108, 127));
+        add_config(TensorShape(59U, 17U), TensorShape(36U, 59U), TensorShape(36U, 17U), -2, 13, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -30, 2, 1 << 25, -98, 107));
+        add_config(TensorShape(2U, 4U, 3U), TensorShape(5U, 2U, 3U), TensorShape(5U, 4U, 3U), -5, 12, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -20, 1 << 25, 4, -127, 64));
+        add_config(TensorShape(15U, 7U, 3U), TensorShape(29U, 15U, 3U), TensorShape(29U, 7U, 3U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -10, 1 << 25, 6, -64, 127));
+        add_config(TensorShape(56U, 17U, 32U), TensorShape(5U, 56U, 32U), TensorShape(5U, 17U, 32U), 3, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 6, -127, 110));
+        add_config(TensorShape(13U, 256U, 32U), TensorShape(19U, 13U, 32U), TensorShape(19U, 256U, 32U), 5, 2, OutputStageInfo(GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, -15, 1 << 25, 6, -77, 115));
+    }
+};
+
 class SmallGEMMLowpFusedOffsetOutputOutput3DUint8Dataset final : public GEMMLowpFusedOffsetOutputDataset
 {
 public:
diff --git a/tests/validation/CL/GEMMLowp.cpp b/tests/validation/CL/GEMMLowp.cpp
index 19e8eeb..0b057b9 100644
--- a/tests/validation/CL/GEMMLowp.cpp
+++ b/tests/validation/CL/GEMMLowp.cpp
@@ -66,13 +66,33 @@
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-TEST_SUITE(BATCHED_MATMUL)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpBatchedMatMulFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpBatchedMatMulDataset())
+
+using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned =
+    GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>;
+TEST_SUITE(BatchedMatMul)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned(),
+                                       framework::dataset::make("DataType", { DataType::QASYMM8 })),
+                               framework::dataset::make("bool", { false })))
 {
-    // Validate output
-    validate(CLAccessor(_target), _reference);
+    validate(CLAccessor(_target), _reference, tolerance_quant);
 }
-TEST_SUITE_END() // BATCHED_MATMUL
+TEST_SUITE_END() // QASYMM8
+
+using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned =
+    GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>;
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetSigned(),
+                                       framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })),
+                               framework::dataset::make("bool", { false })))
+{
+    validate(CLAccessor(_target), _reference, tolerance_quant);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // BatchedMatMul
+
 TEST_SUITE(FusedOffsetOutput)
 TEST_SUITE(QASYMM8)
 using CLGEMMLowpMatrixMultiplyCoreFusedOffsetOutputUint8Fixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<CLTensor, CLAccessor, CLGEMMLowpMatrixMultiplyCore>;
@@ -264,4 +284,4 @@
 TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 2dcc740..17c6d06 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -211,13 +211,6 @@
     }
 }
 
-TEST_SUITE(BatchedMatMul)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpBatchedMatMulFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpBatchedMatMulDataset())
-{
-    validate(Accessor(_target), _reference);
-}
-TEST_SUITE_END() // BatchedMatMul
-
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
 {
     // Validate output
@@ -230,20 +223,51 @@
     validate(Accessor(_target), _reference);
 }
 
+constexpr AbsoluteTolerance<float> tolerance_batched(2);
+
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned =
+    GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>;
+
+TEST_SUITE(BatchedMatMul)
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetUnsigned(),
+                                       framework::dataset::make("DataType", { DataType::QASYMM8 })),
+                               framework::dataset::make("bool", { false })))
+{
+    validate(Accessor(_target), _reference, tolerance_batched);
+}
+TEST_SUITE_END() // QASYMM8
+
+using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned =
+    GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>;
+TEST_SUITE(QASYMM8_SIGNED)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL,
+                       combine(combine(datasets::SmallGEMMLowpFusedBatchedMatMulDatasetSigned(),
+                                       framework::dataset::make("DataType", { DataType::QASYMM8_SIGNED })),
+                               framework::dataset::make("bool", { false })))
+{
+    validate(Accessor(_target), _reference, tolerance_batched);
+}
+TEST_SUITE_END() // QASYMM8_SIGNED
+TEST_SUITE_END() // BatchedMatMul
+
 using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;
+constexpr AbsoluteTolerance<float> tolerance_quant(1);
+
 TEST_SUITE(FusedOffsetOutput)
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL, combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),
                        framework::dataset::make("DataType", { DataType::QASYMM8 })))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_quant);
 }
 
 FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),
                        framework::dataset::make("DataType", { DataType::QASYMM8 })))
 {
     // Validate output
-    validate(Accessor(_target), _reference);
+    validate(Accessor(_target), _reference, tolerance_quant);
 }
 TEST_SUITE_END() // FusedOffsetOutput
 TEST_SUITE_END() // MatrixMultiplyCore
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index f1ec81a..043fc9c 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -68,6 +68,12 @@
             library->fill(tensor, distribution, i);
             break;
         }
+        case DataType::S32:
+        {
+            std::uniform_int_distribution<int32_t> distribution(-20000, 20000);
+            library->fill(tensor, distribution, i);
+            break;
+        }
         case DataType::F16:
         {
             arm_compute::utils::uniform_real_distribution_16bit<half> distribution{ -1.0f, 1.0f };
@@ -235,7 +241,8 @@
 protected:
     TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
     {
-        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_offset, b_offset);
+        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, int32_t, false, run_twice>(shape_a, shape_b, shape_output, a_offset,
+                b_offset);
     }
 
     SimpleTensor<int32_t> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset)
@@ -247,7 +254,7 @@
     SimpleTensor<int32_t> _reference{};
 };
 
-template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TI = uint8_t, typename TW = uint8_t>
+template <typename TensorType, typename AccessorType, typename FunctionType, bool reinterpret_input_as_3d = false, bool reinterpret_output_as_3d = false, typename TI = uint8_t, typename TW = uint8_t, bool run_twice = false>
 class GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture : public framework::Fixture
 {
 public:
@@ -286,18 +293,20 @@
     TensorType compute_target(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage,
                               DataType data_type_a, DataType data_type_b, QuantizationInfo b_qinfo, bool reshape_b_only_on_first_run = false)
     {
-        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, qasymm8_t, true>(shape_a, shape_b, shape_output, a_offset, b_offset,
+        return compute_gemmlowp_target<TensorType, AccessorType, FunctionType, reinterpret_input_as_3d, reinterpret_output_as_3d, qasymm8_t, true, run_twice>(shape_a, shape_b, shape_output, a_offset,
+                b_offset,
                 output_stage, data_type_a, data_type_b, b_qinfo, reshape_b_only_on_first_run);
     }
 
     SimpleTensor<TI> compute_reference(const TensorShape &shape_a, const TensorShape &shape_b, const TensorShape &shape_output, int32_t a_offset, int32_t b_offset,
                                        GEMMLowpOutputStageInfo output_stage, DataType data_type_a, DataType data_type_b, QuantizationInfo b_qinfo)
     {
-        SimpleTensor<int32_t> output = compute_gemmlowp_reference<reinterpret_input_as_3d, TI, TW>(shape_a, shape_b, shape_output, a_offset, b_offset, data_type_a, data_type_b, b_qinfo);
+        SimpleTensor<int32_t> output = compute_gemmlowp_reference<reinterpret_input_as_3d, TI, TW, false, false, run_twice>(shape_a, shape_b, shape_output, a_offset, b_offset, data_type_a, data_type_b,
+                                                                                                                            b_qinfo);
 
         TensorShape           bias_shape(shape_b[0]);
         SimpleTensor<int32_t> bias{ bias_shape, DataType::S32, 1 };
-        fill(bias, 2);
+        (run_twice) ? fill(bias, 5) : fill(bias, 2); // Fill bias with same seed as last run of gemmlowp_target
 
         switch(output_stage.type)
         {