COMPMID-3928: Fix output conversion in gemmlowp_mm_native

This patch fixes the following issues exposed by the nightly tests:

- The accumulated result of gemmlowp_mm_native can be either uint or int,
while the destination tensor is S32, so the accumulator must be converted
to int before being stored to memory (see the sketch below).

- The RHS matrix still needs padding on the X dimension. Hence, revert a
few changes so that the necessary padding elements are added back.

- Replace the zero-padding validation tests with an assertion in the
kernel's configure method.

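The conversion is done through the new REPEAT_VAR_INIT_CONVERT macro. As a
rough illustration (assuming M0 = 2 and N0 = 4, and with the res0/res1 names
following the existing REPEAT_* ID-suffix convention), the call added to
gemmlowp_mm_native expands to something like:

    REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res);
    // is roughly equivalent to:
    int4 res0 = CONVERT(c0, int4); // c0 holds the first row of accumulators (uint4 or int4)
    int4 res1 = CONVERT(c1, int4); // c1 holds the second row of accumulators
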
Change-Id: Ib6614a91bd0e98f2b850f52eef14d4fbf55517c8
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4259
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 059c2e1..bde7dd0 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1100,8 +1100,9 @@
     const bool cond_y = y == 0;
     const bool cond_x = ((x + 1) * N0 >= N);
 
-    // Store output block
-    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, c, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
+    // Convert and store output block
+    REPEAT_VAR_INIT_CONVERT(M0, VEC_DATA_TYPE(int, N0), c, res); // resN = CONVERT(cN, VEC_DATA_TYPE(int, N0));
+    STORE_BLOCK_BOUNDARY_AWARE(M0, N0, int, res, dst_addr, dst_stride_y, zout, PARTIAL_STORE_M0, PARTIAL_STORE_N0, cond_y, cond_x);
 }
 #endif // defined(M0) && defined(N0) && defined(K0) && defined(K) && defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
 
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
index 59bf5b9..bed94a7 100644
--- a/src/core/CL/cl_kernels/repeat.h
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -134,6 +134,10 @@
 #define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
 
 // Macro for initializing N variables by converting the data type. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
+#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
+#define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
+
+// Macro for initializing N variables by converting the data type with saturation. Generates N statements that defines VAR##N = RHS_ACCESSOR_DEF(...)
 #define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
 #define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
 
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
index cc98845..af7755b 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.cpp
@@ -37,10 +37,6 @@
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
-#include <cstddef>
-#include <cstdint>
-#include <tuple>
-
 namespace arm_compute
 {
 using namespace misc::shape_calculator;
@@ -110,6 +106,7 @@
     bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
 
     Window win{};
+    bool   window_changed = false;
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_output_as_3d to be false.
@@ -137,7 +134,13 @@
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
     win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    // RHS matrix still needs padding on the X
+    AccessWindowStatic input1_access(input1, 0, 0,
+                                     ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x),
+                                     input1->dimension(1));
+
+    window_changed = update_window_and_padding(win, input1_access); // window used by the execute_window_loop
 
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
@@ -145,7 +148,8 @@
     const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(output->num_dimensions()), 2u);
     collapsed                                = win.collapse(win, dimension_to_collapse);
 
-    return std::make_pair(Status(), collapsed);
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, collapsed);
 }
 } // namespace
 
@@ -175,6 +179,9 @@
     _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0);
     _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
 
+    // We still need padding on the X dimension for the RHS matrix
+    auto padding_info = get_padding_info({ input0, output });
+
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
     if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
@@ -197,11 +204,15 @@
     // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
     // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
     // This means that the actual m used by the kernel is given by output->info()->dimension(1) and not by gemm_info.m
-    const unsigned int internal_m = input0->info()->dimension(1);
+    const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : output->info()->dimension(1);
     // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
     const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
     const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0;
 
+    // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+    // NOTE: This might have implications on heuristics and performance
+    const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
@@ -213,7 +224,7 @@
     build_opts.add_option("-DM=" + support::cpp11::to_string(input0->info()->dimension(1)));
     build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n()));
     build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k()));
-    build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+    build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()));
@@ -245,6 +256,8 @@
     _config_id += support::cpp11::to_string(rhs_info.n0);
     _config_id += "_";
     _config_id += support::cpp11::to_string(lhs_info.k0);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
 Status CLGEMMLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, const GEMMLHSMatrixInfo &lhs_info,
diff --git a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
index 9e717df..ce000bd 100644
--- a/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
+++ b/tests/validation/CL/GEMMLowpMatrixMultiplyNative.cpp
@@ -88,81 +88,10 @@
 
 /** K0 values to test - Nightly */
 const auto k0_values_nightly = framework::dataset::make("K0", { 1, 2, 3, 4, 8, 16 });
-
-/** Zero padding test */
-bool validate_zero_padding(unsigned int m_value, unsigned int n_value, unsigned int k_value, unsigned int b_value, unsigned int m0_value, unsigned int n0_value, unsigned int k0_value, bool broadcast_bias, DataType data_type, const ActivationLayerInfo &act_info)
-{
-    const unsigned int M = m_value;
-    const unsigned int N = n_value;
-    const unsigned int K = k_value;
-
-    GEMMLHSMatrixInfo lhs_info;
-    lhs_info.m0         = m0_value;
-    lhs_info.k0         = k0_value;
-
-    GEMMRHSMatrixInfo rhs_info;
-    rhs_info.n0         = n0_value;
-    rhs_info.k0         = k0_value;
-
-    GEMMKernelInfo kernel_info;
-    kernel_info.m               = M;
-    kernel_info.n               = N;
-    kernel_info.k               = K;
-    kernel_info.broadcast_bias  = broadcast_bias;
-    kernel_info.activation_info = act_info;
-
-    const TensorShape lhs_shape(K, M, b_value);
-    const TensorShape rhs_shape(N, K, b_value);
-    const TensorShape dst_shape = compute_mm_shape(TensorInfo(lhs_shape, 1, data_type),
-                                                   TensorInfo(rhs_shape, 1, data_type),
-                                                   kernel_info);
-
-    // Create tensors
-    CLTensor lhs  = create_tensor<CLTensor>(lhs_shape, data_type);
-    CLTensor rhs  = create_tensor<CLTensor>(rhs_shape, data_type);
-    CLTensor dst  = create_tensor<CLTensor>(dst_shape, DataType::S32);
-
-    ARM_COMPUTE_EXPECT(lhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(rhs.info()->is_resizable(), framework::LogLevel::ERRORS);
-    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
-
-    // Create and configure function
-    CLGEMMLowpMatrixMultiplyNative gemm;
-    gemm.configure(&lhs, &rhs,  &dst, lhs_info, rhs_info, GEMMReshapeInfo(m_value, n_value, k_value));
-
-    // Padding can be added along rhs and bias's X dimension
-    return dst.info()->padding().empty() && lhs.info()->padding().empty() && rhs.info()->padding().empty();
-}
 } // namespace
 
 TEST_SUITE(CL)
 TEST_SUITE(GEMMLowpMatrixMultiplyNative)
-
-/** Validate zero padding tests
- *
- * A series of validation tests to check that no padding is added as part of configuration for 4 different scenarios.
- *
- * Checks performed in order:
- *     - No partial blocks in both x and y dimensions
- *     - Partial blocks in x dimension
- *     - Partial blocks in y dimension
- *     - Partial blocks in both x and y dimensions
- *     - No blocks in both x and y dimensions, scalar store (N0==1)
- *     - Special case: partial_n0 == 5 (vstore1 should be invoked instead of vstore_partial_1)
- */
-DATA_TEST_CASE(ValidateZeroPadding, framework::DatasetMode::ALL, zip(zip(zip(
-framework::dataset::make("M",                   { 24, 63,   1, 51, 255, }),
-framework::dataset::make("N",                   { 47, 29, 122, 20,  21, })),
-framework::dataset::make("M0",                  { 4,   8,   2,  1,   8, })),
-framework::dataset::make("N0",                  { 4,   4,   3,  1,   8, })),
-m_value, n_value, m0_value, n0_value)
-{
-    bool status = validate_zero_padding(m_value, n_value, 23, 1, m0_value, n0_value, 4, false, DataType::QASYMM8, ActivationLayerInfo());
-    ARM_COMPUTE_EXPECT(status, framework::LogLevel::ERRORS);
-}
-
-
-
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMLowpMatrixMultiplyNativeFixture, framework::DatasetMode::ALL,
                 combine(combine(combine(combine(combine(combine(m_values,
                                                                 n_values),