Disable SME2 Gemmlowp s8f32 kernel selection when results need to be accumulated

As in https://review.mlplatform.org/c/ml/ComputeLibrary/+/11500, the s8f32 kernels do not support accumulate mode. This patch updates the kernel selection accordingly and adds tests that exercise these cases more thoroughly.
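
For reference, a minimal scalar sketch (not the library implementation; shapes,
offsets and scale handling are simplified) of what accumulate mode means for the
s8 -> f32 dequantized GEMM path. The SME2 no-merge kernels can only overwrite the
destination, so they must not be selected when accumulation is requested:

#include <cstdint>

// dst is MxN float, a is MxK int8, b is KxN int8, all row-major.
// With accumulate == true the dequantized product is added to the
// existing contents of dst instead of overwriting them.
void s8f32_gemm_dequant(const int8_t *a, const int8_t *b, float *dst,
                        int M, int N, int K, float scale, bool accumulate)
{
    for (int m = 0; m < M; ++m)
    {
        for (int n = 0; n < N; ++n)
        {
            int32_t acc = 0;
            for (int k = 0; k < K; ++k)
            {
                acc += int32_t(a[m * K + k]) * int32_t(b[k * N + n]);
            }
            const float deq = float(acc) * scale;
            dst[m * N + n]  = accumulate ? dst[m * N + n] + deq : deq;
        }
    }
}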

Partially Resolves: COMPMID-6995

Change-Id: I40e19446c012eb7334e4511e254cce0d635aa234
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11503
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Radu Salavat <radu.salavat@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
index 782399d..38d9b76 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
@@ -55,7 +55,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp",
-    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2(); },
+    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2() && !args._accumulate; },
     [](const GemmArgs &args, const DequantizeFloat &) { const auto VL = sme::get_vector_length<float>();
                                                         return args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
     [](const GemmArgs &args, const DequantizeFloat &dq) { return new GemmInterleavedNoMergeDequantized<cls_sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL, int8_t, float>(args, dq); }
@@ -63,7 +63,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_s8qfp32_mopa_4Vx1VL.hpp",
-    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2(); },
+    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2() && !args._accumulate; },
     [](const GemmArgs &args, const DequantizeFloat &) { const auto VL = sme::get_vector_length<float>();
                                                         return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
     [](const GemmArgs &args, const DequantizeFloat &dq) { return new GemmInterleavedNoMergeDequantized<cls_sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL, int8_t, float>(args, dq); }
@@ -71,7 +71,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_s8qfp32_mopa_2Vx2VL.hpp",
-    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2(); },
+    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2() && !args._accumulate; },
     nullptr,
     [](const GemmArgs &args, const DequantizeFloat &dq) { return new GemmInterleavedNoMergeDequantized<cls_sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL, int8_t, float>(args, dq); }
 },
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 9b1da61..d25f43a 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -360,13 +360,21 @@
 // Dequant tests involve returning F32 from the MatrixMultiplyCore kernels and are only implemented in aarch64
 TEST_SUITE(Dequant)
 constexpr AbsoluteTolerance<float> tolerance_dequantized(0.01f);
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL,
+    combine(
+        datasets::SmallGEMMLowpDataset(),
+        make("accumulate", {true, false})
+    ))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_dequantized);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::LargeGEMMLowpDataset(),
+        make("accumulate", {false})
+    ))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_dequantized);
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index 6b7cbba..aa4eedb 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -472,15 +472,9 @@
 class GEMMLowpDequantizedMatrixMultiplyValidationFixture : public framework::Fixture
 {
 public:
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, bool accumulate)
     {
-        // Accumulation is supported for Int8/UInt8 only in aarch64
-        bool accumulate = true;
-        // Accumulation is not supported for Int8/UInt8 in aarch32
-#ifdef __arm__
-        accumulate = false;
-#endif //__arm__
-        bool dynamic_qinfo = false;
+        const bool dynamic_qinfo = false;
         const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset);
         const auto b_qinfo = QuantizationInfo(5.0f / 255, b_offset);
         TensorFillInfo finfo;