Fix nightly issue caused by gemm_reshaped_only_rhs_mmul kernel The issue appears when this kernel is used by convolution operators because the stride calculations consider only simple matrix multiplication. In conv2d triggered runs, Rhs does not have the same dimension as Lhs and Dst. Also, cases where Lhs and Dst are interpreted as 3d, where their X and Y dimensions (in convolution sense) are collapsed into one. Resolves: COMPMID-6764 Change-Id: If443e6eb8f7a5cca1acc58b37c598122a013e69b Signed-off-by: Gunes Bayir <gunes.bayir@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10913 Benchmark: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Jakub Sujak <jakub.sujak@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>

commit: c310c11a4baa1eca4e4007b3250f4a771989f36f [log] [tgz]
author: Gunes Bayir <gunes.bayir@arm.com> Fri Dec 22 15:43:29 2023 +0000
committer: Gunes Bayir <gunes.bayir@arm.com> Fri Dec 22 17:35:16 2023 +0000
tree: 4992c6b0978fce5aeabeea23678be6e1a13c6c6b
parent: 85cafff0dd99b6f94a77a7d7933682fa7c6a4a70 [diff]
diff --git a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
index 8919023..09b8956 100644
--- a/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl
+++ b/src/core/CL/cl_kernels/common/gemm_reshaped_only_rhs_mmul.cl

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022 Arm Limited.
+ * Copyright (c) 2022-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -117,9 +117,23 @@
     uint rhs_y = block_id;
 
     // Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
     lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef BATCHED_RHS
     rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y + z * rhs_stride_z;
+#else // BATCHED_RHS
+    rhs_offset_first_element_in_bytes += rhs_x * sizeof(DATA_TYPE) + rhs_y * rhs_stride_y;
+#endif // BATCHED_RHS
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
     dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
 
     // Note: If RHS derives from the weights of convolution 2d layer, RHS will always be 2D and rhs_stride_z will always be equal to 0 for
     // not sliding the tensor
@@ -367,11 +381,25 @@
 
     // Starting RHS coordinates
     uint rhs_x = block_y * N0 * MMUL_N0 + block_x * N0;
+
+#ifdef BATCHED_RHS
     uint rhs_y = block_id + z * rhs_h;
+#else // BATCHED_RHS
+    uint rhs_y = block_id;
+#endif // BATCHED_RHS
 
     // Compute LHS/RHS/DST matrix address
+#ifdef REINTERPRET_INPUT_AS_3D
+    lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + (lhs_y + z * M) * lhs_stride_y;
+#else // REINTERPRET_INPUT_AS_3D
     lhs_offset_first_element_in_bytes += lhs_x * sizeof(DATA_TYPE) + lhs_y * lhs_stride_y + z * lhs_stride_z;
+#endif // REINTERPRET_INPUT_AS_3D
+
+#ifdef REINTERPRET_OUTPUT_AS_3D
+    dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + (dst_y + z * M) * dst_stride_y;
+#else // REINTERPRET_OUTPUT_AS_3D
     dst_offset_first_element_in_bytes += dst_x * sizeof(DATA_TYPE) + dst_y * dst_stride_y + z * dst_stride_z;
+#endif // REINTERPRET_OUTPUT_AS_3D
 
     // Initialize the accumulators
     // MMUL extension accumulate the result in F32 for both F32 and F16
@@ -525,4 +553,4 @@
 #undef RHS_OFFSET_X
 #undef RHS_STEP_X
 }
-#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE)
\ No newline at end of file
+#endif // defined(GEMM_MM_RESHAPED_ONLY_RHS_MMUL_TEXTURE)

diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
index 4ca5580..4ca4b83 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp

@@ -41,7 +41,6 @@
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 #include "support/Cast.h"
 #include "support/StringSupport.h"
-
 namespace arm_compute
 {
 namespace opencl
@@ -101,7 +100,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
 
     // Validate the reinterpreted-as-3D-case
-    if (gemm_info.reinterpret_input_as_3d != 0)
+    if (gemm_info.reinterpret_input_as_3d)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
     }
@@ -284,8 +283,12 @@
     build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
     build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
 
+    build_opts.add_option_if(gemm_info.reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+    build_opts.add_option_if(gemm_info.depth_output_gemm3d != 0, "-DREINTERPRET_OUTPUT_AS_3D");
+    build_opts.add_option_if(src1->num_dimensions() > 2, "-DBATCHED_RHS");
+
     std::string kernel_name("gemm_mm_reshaped_only_rhs_nt_mmul");
-    kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
+    kernel_name += _export_to_cl_image ? "_texture" : "";
 
     // A macro guard to compile ONLY the kernel of interest
     build_opts.add_option("-D" + upper_string(kernel_name));
commit	c310c11a4baa1eca4e4007b3250f4a771989f36f	[log] [tgz]
author	Gunes Bayir <gunes.bayir@arm.com>	Fri Dec 22 15:43:29 2023 +0000
committer	Gunes Bayir <gunes.bayir@arm.com>	Fri Dec 22 17:35:16 2023 +0000
tree	4992c6b0978fce5aeabeea23678be6e1a13c6c6b
parent	85cafff0dd99b6f94a77a7d7933682fa7c6a4a70 [diff]