Disable SME2 Gemmlowp s8f32 kernel selection when results need to be accumulated

As in https://review.mlplatform.org/c/ml/ComputeLibrary/+/11500, the s8f32 kernels do not support accumulate mode. This patch updates the kernel selection accordingly and adds tests that exercise these cases more thoroughly.
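
For reference, a minimal scalar sketch (not the library implementation; shapes,
offsets and scale handling are simplified) of what accumulate mode means for the
s8 -> f32 dequantized GEMM path. The SME2 no-merge kernels can only overwrite the
destination, so they must not be selected when accumulation is requested:

#include <cstdint>

// dst is MxN float, a is MxK int8, b is KxN int8, all row-major.
// With accumulate == true the dequantized product is added to the
// existing contents of dst instead of overwriting them.
void s8f32_gemm_dequant(const int8_t *a, const int8_t *b, float *dst,
                        int M, int N, int K, float scale, bool accumulate)
{
    for (int m = 0; m < M; ++m)
    {
        for (int n = 0; n < N; ++n)
        {
            int32_t acc = 0;
            for (int k = 0; k < K; ++k)
            {
                acc += int32_t(a[m * K + k]) * int32_t(b[k * N + n]);
            }
            const float deq = float(acc) * scale;
            dst[m * N + n]  = accumulate ? dst[m * N + n] + deq : deq;
        }
    }
}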

Partially Resolves: COMPMID-6995

Change-Id: I40e19446c012eb7334e4511e254cce0d635aa234
Signed-off-by: Gunes Bayir <gunes.bayir@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11503
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Radu Salavat <radu.salavat@arm.com>
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
index 782399d..38d9b76 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_s8fp32.cpp
@@ -55,7 +55,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL.hpp",
-    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2(); },
+    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2() && !args._accumulate; },
     [](const GemmArgs &args, const DequantizeFloat &) { const auto VL = sme::get_vector_length<float>();
                                                         return args._Msize <= VL || (2*VL < args._Msize && args._Msize <= 3*VL); },
     [](const GemmArgs &args, const DequantizeFloat &dq) { return new GemmInterleavedNoMergeDequantized<cls_sme2_interleaved_nomerge_s8qfp32_mopa_1VLx4VL, int8_t, float>(args, dq); }
@@ -63,7 +63,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_s8qfp32_mopa_4Vx1VL.hpp",
-    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2(); },
+    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2() && !args._accumulate; },
     [](const GemmArgs &args, const DequantizeFloat &) { const auto VL = sme::get_vector_length<float>();
                                                         return args._Nsize <= VL || (2*VL < args._Nsize && args._Nsize <= 3*VL); },
     [](const GemmArgs &args, const DequantizeFloat &dq) { return new GemmInterleavedNoMergeDequantized<cls_sme2_interleaved_nomerge_s8qfp32_mopa_4VLx1VL, int8_t, float>(args, dq); }
@@ -71,7 +71,7 @@
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sme2_interleaved_nomerge_s8qfp32_mopa_2Vx2VL.hpp",
-    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2(); },
+    [](const GemmArgs &args, const DequantizeFloat &) { return args._ci->has_sme2() && !args._accumulate; },
     nullptr,
     [](const GemmArgs &args, const DequantizeFloat &dq) { return new GemmInterleavedNoMergeDequantized<cls_sme2_interleaved_nomerge_s8qfp32_mopa_2VLx2VL, int8_t, float>(args, dq); }
 },
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index 9b1da61..d25f43a 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -360,13 +360,21 @@
 // Dequant tests involve returning F32 from the MatrixMultiplyCore kernels and are only implemented in aarch64
 TEST_SUITE(Dequant)
 constexpr AbsoluteTolerance<float> tolerance_dequantized(0.01f);
-FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::ALL,
+    combine(
+        datasets::SmallGEMMLowpDataset(),
+        make("accumulate", {true, false})
+    ))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_dequantized);
 }
 
-FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpDequantizedMatrixMultiplyValidationFixture, framework::DatasetMode::NIGHTLY,
+    combine(
+        datasets::LargeGEMMLowpDataset(),
+        make("accumulate", {false})
+    ))
 {
     // Validate output
     validate(Accessor(_target), _reference, tolerance_dequantized);
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index 6b7cbba..aa4eedb 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -472,15 +472,9 @@
 class GEMMLowpDequantizedMatrixMultiplyValidationFixture : public framework::Fixture
 {
 public:
-    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset)
+    void setup(TensorShape shape_a, TensorShape shape_b, TensorShape shape_output, int32_t a_offset, int32_t b_offset, bool accumulate)
     {
-        // Accumulation is supported for Int8/UInt8 only in aarch64
-        bool accumulate = true;
-        // Accumulation is not supported for Int8/UInt8 in aarch32
-#ifdef __arm__
-        accumulate = false;
-#endif //__arm__
-        bool dynamic_qinfo = false;
+        const bool dynamic_qinfo = false;
         const auto a_qinfo = QuantizationInfo(1.0f / 255, a_offset);
         const auto b_qinfo = QuantizationInfo(5.0f / 255, b_offset);
         TensorFillInfo finfo;