COMPMID-3683: Fix performance regression on Mali-G76 (Fully connected) COMPMID-3682: Fix performance regression on Mali-G76 (Convolution) Updated the heuristic for GEMMReshapedOnlyRHS for Mali-G76 in order to take into account small workload cases Change-Id: I99fccbd0e94e4e21c0d1b88e23f02af06ef16ee9 Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3689 Reviewed-by: SiCong Li <sicong.li@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Tested-by: Arm Jenkins <bsgcomp@arm.com>

commit: 4aed4aafa2ddb0b6f4b76aef5008c8bb45599ea4 [log] [tgz]
author: Gian Marco Iodice <gianmarco.iodice@arm.com> Fri Aug 07 15:36:30 2020 +0100
committer: Gian Marco Iodice <gianmarco.iodice@arm.com> Fri Aug 07 18:19:52 2020 +0000
tree: 6b4cc56b305ec0966aac40998494075d8569e429
parent: 9c7fed85d339df64937e8edac3b591b8571ccce8 [diff] [blame]
diff --git a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
index 581c2d2..f9b65dc 100644
--- a/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp
+++ b/src/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfigurationBifrost.cpp

@@ -149,34 +149,51 @@
     GEMMLHSMatrixInfo lhs_info_img;
     GEMMRHSMatrixInfo rhs_info_img;
 
+    const bool is_workload_big = ((m * n * b) / 16) >= 2048;
     // Get lhs_info/rhs_info in case of OpenCL buffer
     if(m == 1)
     {
-        const unsigned int h0 = std::max(n / 2, 1U);
-        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
-    }
-    else
-    {
-        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true);
-    }
-
-    // Get lhs_info/rhs_info in case of OpenCL image
-    if(m == 1)
-    {
-        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 4, false, true, false, false, true);
+        if((n / 4) >= 2048)
+        {
+            const unsigned int h0 = std::max(n / 4, 1U);
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            const unsigned int h0 = std::max(n / 2, 1U);
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true);
+        }
     }
     else
     {
         const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+        if(is_workload_big)
+        {
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
+        }
+        else
+        {
+            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
+        }
+    }
+
+    // Get lhs_info/rhs_info in case of OpenCL image
+    const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
+    if(is_workload_big)
+    {
         std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
     }
+    else
+    {
+        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
+    }
 
     const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
     const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img);
     const TensorInfo  tensor_reshaped_info(shape, 1, DataType::F32);
 
-    // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d
-    const bool use_cl_image2d = (m == 1 && n <= 4096) ? false : true;
+    // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d
+    const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? false : true;
 
     if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
     {
commit	4aed4aafa2ddb0b6f4b76aef5008c8bb45599ea4	[log] [tgz]
author	Gian Marco Iodice <gianmarco.iodice@arm.com>	Fri Aug 07 15:36:30 2020 +0100
committer	Gian Marco Iodice <gianmarco.iodice@arm.com>	Fri Aug 07 18:19:52 2020 +0000
tree	6b4cc56b305ec0966aac40998494075d8569e429
parent	9c7fed85d339df64937e8edac3b591b8571ccce8 [diff] [blame]