Add test for validating batched-GEMM on CPU

- Add a new dataset for batched-GEMM
- Add a test for running batched-GEMM without bias, as bias is not yet
  supported in batched-GEMM
- Fix the reference implementation to slide the RHS tensor correctly
  (see the sketch below)
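
A minimal C++ sketch of the corrected RHS sliding, under the shape convention
used by the reference implementation (a = [K, M, D, W], b = [N, K], [N, K, D]
or [N, K, D, W], c/dst = [N, M, D, W]); the helper name and parameters below
are illustrative only and are not part of the library:

    #include <cstddef>

    // Element offset of matrix B for multi index d and batch index w
    // (sketch only, not the library's code)
    static std::size_t b_offset(int N, int K, int D, int d, int w,
                                int a_dims, int b_dims, int c_dims)
    {
        // Do not slide B along dimensions it does not have
        int b_stride_z = (b_dims > 2) ? N * K : 0;
        int b_stride_w = (b_dims > 3) ? N * K * D : 0;

        // Batched-GEMM: a and c are 4D with a 3rd dimension of 1 (D == 1)
        // while B is 3D, so B's 3rd dimension holds the batches and must
        // advance with the batch index w
        if(b_dims == 3 && a_dims == 4 && c_dims == 4 && D == 1)
        {
            b_stride_w = b_stride_z;
        }

        return static_cast<std::size_t>(d) * b_stride_z + static_cast<std::size_t>(w) * b_stride_w;
    }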

Resolves COMPMID-4588

Change-Id: I20fcb5d9160f44292b7cc34570add911b1d732f6
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6040
Reviewed-by: SiCong Li <sicong.li@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/validation/reference/GEMM.cpp b/tests/validation/reference/GEMM.cpp
index 6b3aa39..f7e97e4 100644
--- a/tests/validation/reference/GEMM.cpp
+++ b/tests/validation/reference/GEMM.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,12 +51,22 @@
     const int a_stride_w = K * M * D;
 
     const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3rd dimension in case matrix B has less than 3 dimensions
-    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+    int       b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+
+    // Note: There are 3 GEMM types: batched-GEMM, multi-GEMM, and a batch of multi-GEMMs. The third dimension of tensor b is overloaded when tensor b has exactly 3 dimensions:
+    // it can be either the number of batches or the number of multis. Batched-GEMM is detected only when the third dimension of the "a" and "c" tensors is 1 and their number of dimensions is 4
+    const bool is_batched_gemm = b.shape().num_dimensions() == 3 && a.shape().num_dimensions() == 4 && c.shape().num_dimensions() == 4 && a.shape()[2] == 1 && c.shape()[2] == 1;
+
+    // Batched-GEMM
+    if(is_batched_gemm)
+    {
+        b_stride_w = b_stride_z;
+    }
 
     const int c_stride_z = N * M;
     const int c_stride_w = N * M * D;
 
-#if defined(_OPENMP) && !( defined(__arm__) && defined(__ANDROID__))
+#if defined(_OPENMP) && !(defined(__arm__) && defined(__ANDROID__))
     #pragma omp parallel for collapse(2)
 #endif /* _OPENMP */
     for(int w = 0; w < W; ++w)
@@ -106,12 +116,22 @@
     const int a_stride_w = K * M * D;
 
     const int b_stride_z = b.shape().num_dimensions() > 2 ? N * K : 0;     // Do not slide the matrix B along the 3rd dimension in case matrix B has less than 3 dimensions
-    const int b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+    int       b_stride_w = b.shape().num_dimensions() > 3 ? K * N * D : 0; // Do not slide the matrix B along the 4th dimension in case matrix B has less than 4 dimensions
+
+    // Note: There are 3 GEMM types: batched-GEMM, multi-GEMM, and a batch of multi-GEMMs. The third dimension of tensor b is overloaded when tensor b has exactly 3 dimensions:
+    // it can be either the number of batches or the number of multis. Batched-GEMM is detected only when the third dimension of the "a" and "c" tensors is 1 and their number of dimensions is 4
+    const bool is_batched_gemm = b.shape().num_dimensions() == 3 && a.shape().num_dimensions() == 4 && c.shape().num_dimensions() == 4 && a.shape()[2] == 1 && c.shape()[2] == 1;
+
+    // Batched-GEMM
+    if(is_batched_gemm)
+    {
+        b_stride_w = b_stride_z;
+    }
 
     const int c_stride_z = N * M;
     const int c_stride_w = N * M * D;
 
-#if defined(_OPENMP) && !( defined(__arm__) && defined(__ANDROID__))
+#if defined(_OPENMP) && !(defined(__arm__) && defined(__ANDROID__))
     #pragma omp parallel for collapse(2)
 #endif /* _OPENMP */
     for(int w = 0; w < W; ++w)
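
For reference, a self-contained sketch (illustrative names only, not part of
the test suite) of how the detection added above classifies the shape
combinations:

    #include <cstdio>

    // True only for the batched-GEMM layout: b is 3D while a and c are 4D with
    // a 3rd dimension of 1, so b's 3rd dimension carries the batches
    static bool is_batched_gemm(int a_dims, int b_dims, int c_dims, int a_z, int c_z)
    {
        return b_dims == 3 && a_dims == 4 && c_dims == 4 && a_z == 1 && c_z == 1;
    }

    int main()
    {
        // Batched-GEMM, e.g. a = [K, M, 1, W], b = [N, K, W], c = [N, M, 1, W]
        std::printf("%d\n", is_batched_gemm(4, 3, 4, 1, 1)); // prints 1
        // Multi-GEMM, e.g. a = [K, M, D], b = [N, K, D], c = [N, M, D]
        std::printf("%d\n", is_batched_gemm(3, 3, 3, 3, 3)); // prints 0
        return 0;
    }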