COMPMID-3652 Fix CLFullyConnectedLayer failure on S10

* Fix out-of-bound mem reads in cases where M < M0 in
CLGEMMMatrixMultiplyNativeKernel and
CLGEMMMatrixMultiplyReshapedOnlyRHSKernel, as a result of the new
boundary-aware reading logic.
* Add fixture tests (alongside the padding configuration tests) in
these 2 kernels to catch all 5 possible scenarios with block dimension
configurations, which includes this particular bug as the
"...BoundaryHandlingFullInXPartialInY" test case

Change-Id: I8a10ab67594171e3ea4fb6e35c84ddc4ab964fba
Signed-off-by: SiCong Li <sicong.li@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3650
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
index c67d360..da57aa4 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.cpp
@@ -261,6 +261,10 @@
     const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
     const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
 
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
@@ -277,7 +281,7 @@
     build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
     build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
     build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
     build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
index 27520c6..e65726b 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.cpp
@@ -263,6 +263,10 @@
     const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
     const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
 
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
@@ -282,7 +286,7 @@
     build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
     build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
     build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
     build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
diff --git a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
index cdcc1a4..6ba5012 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyNative.cpp
@@ -118,6 +118,28 @@
 /** Broadcast bias from vector to matrix */
 const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", { false, true } );
 
+/** Boundary handling cases for testing partial/non-partial (full) block dimensions, resulting from different combinations
+ * of M, M0, N and N0 values.
+ * M0 and N0 are kept constant, while the different test cases need to vary M and N.
+ *
+ * E.g. M = 64 and N = 33 result in a block dimension that has no partial blocks (all full blocks) in Y dimension and
+ * partial blocks in X dimension.
+ */
+const auto boundary_handling_cases = combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                    // Large k to force potential out-of-bound reads on input0
+                                    framework::dataset::make("K", 315),
+                                    // Batch size == 1 to force potential out-of-bound reads on input0
+                                    framework::dataset::make("batch_size", 1)),
+                                    framework::dataset::make("M0", 4)),
+                                    framework::dataset::make("N0", 4)),
+                                    framework::dataset::make("K0", 4)),
+                                    // Only need to test F32 as F16 shares identical boundary handling logic
+                                    framework::dataset::make("DataType", DataType::F32)),
+                                    framework::dataset::make("alpha", -0.75f )),
+                                    framework::dataset::make("beta", -0.35f )),
+                                    broadcast_bias_values),
+                                    framework::dataset::make("Activation", ActivationLayerInfo()));
+
 /** Configuration test */
 void validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info)
 {
@@ -257,6 +279,46 @@
     ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
 }
 
+FIXTURE_DATA_TEST_CASE(RunSmallBoundaryHandlingPartialInXPartialInY, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(
+                        framework::dataset::make("M", 3),
+                        framework::dataset::make("N", 1)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBoundaryHandlingPartialInXFullInY, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(
+                        framework::dataset::make("M", 64),
+                        framework::dataset::make("N", 51)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBoundaryHandlingFullInXFullInY, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(
+                        framework::dataset::make("M", 64),
+                        framework::dataset::make("N", 32)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBoundaryHandlingFullInXPartialInY, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
+                combine(combine(
+                        framework::dataset::make("M", 37),
+                        framework::dataset::make("N", 32)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMMatrixMultiplyNativeFixture<float>, framework::DatasetMode::ALL,
                 combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_values,
diff --git a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
index 6a1d495..bd0cd03 100644
--- a/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
+++ b/tests/validation/CL/GEMMMatrixMultiplyReshapedOnlyRHS.cpp
@@ -131,6 +131,32 @@
 /** Broadcast bias from vector to matrix */
 const auto broadcast_bias_values = framework::dataset::make("broadcast_bias", { false, true } );
 
+/** Boundary handling cases for testing partial/non-partial (full) block dimensions, resulting from different combinations
+ * of M, M0, N and N0 values.
+ * M0 and N0 are kept constant, while the different test cases need to vary M and N.
+ *
+ * E.g. M = 64 and N = 33 result in a block dimension that has no partial blocks (all full blocks) in Y dimension and
+ * partial blocks in X dimension.
+ */
+const auto boundary_handling_cases = combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
+                                    // Large k to force potential out-of-bound reads on input0
+                                    framework::dataset::make("K", 315),
+                                    // Batch size == 1 to force potential out-of-bound reads on input0
+                                    framework::dataset::make("batch_size", 1)),
+                                    framework::dataset::make("M0", 4)),
+                                    framework::dataset::make("N0", 4)),
+                                    framework::dataset::make("K0", 4)),
+                                    framework::dataset::make("H0", 3)),
+                                    i_values_rhs),
+                                    t_values_rhs),
+                                    framework::dataset::make("export_to_cl_image_rhs", {true, false})),
+                                    // Only need to test F32 as F16 shares identical boundary handling logic
+                                    framework::dataset::make("DataType", DataType::F32)),
+                                    framework::dataset::make("alpha", -0.75f )),
+                                    framework::dataset::make("beta", -0.35f )),
+                                    broadcast_bias_values),
+                                    framework::dataset::make("Activation", ActivationLayerInfo()));
+
 /** Configuration test */
 bool validate_configuration(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value,
                             unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, unsigned int h0_value,
@@ -330,6 +356,46 @@
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
 
+FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingPartialInXPartialInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(
+                        framework::dataset::make("M", 3),
+                        framework::dataset::make("N", 1)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingPartialInXFullInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(
+                        framework::dataset::make("M", 64),
+                        framework::dataset::make("N", 43)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingFullInXFullInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(
+                        framework::dataset::make("M", 64),
+                        framework::dataset::make("N", 32)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
+FIXTURE_DATA_TEST_CASE(RunPrecommitBoundaryHandlingFullInXPartialInY, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
+                combine(combine(
+                        framework::dataset::make("M", 37),
+                        framework::dataset::make("N", 32)),
+                        boundary_handling_cases))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, rel_tolerance_f32, 0.f, abs_tolerance_f32);
+}
+
 FIXTURE_DATA_TEST_CASE(RunPrecommit, CLGEMMMatrixMultiplyReshapedOnlyRHSFixture<float>, framework::DatasetMode::PRECOMMIT,
                 combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(combine(
                                                                    m_values,