COMPMID-2096: Refactor the CLGEMMLowp function selection (heuristic)
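
When the GEMM is not reshaped, the matrix multiply kernel is now selected per GPU target:
Midgard keeps the existing CLGEMMLowpMatrixMultiplyKernel, while every other target uses
CLGEMMLowpMatrixMultiplyNativeKernel with the LHS/RHS block configuration returned by
CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size,
DataType::QASYMM8). The validate() path mirrors the same selection. The sketch below is
illustrative only; GEMMLowpKernelType and select_gemmlowp_kernel are hypothetical names
used to summarise the dispatch, not symbols added by this patch.

    // Illustrative sketch only: hypothetical helper summarising which GEMMLowp
    // matrix multiply kernel configure()/run() dispatch to after this change.
    enum class GEMMLowpKernelType
    {
        RESHAPED_ONLY_RHS, // CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel
        MIDGARD,           // CLGEMMLowpMatrixMultiplyKernel (legacy path)
        NATIVE             // CLGEMMLowpMatrixMultiplyNativeKernel
    };

    inline GEMMLowpKernelType select_gemmlowp_kernel(bool is_gemm_reshaped, bool is_midgard)
    {
        if(is_gemm_reshaped)
        {
            // Reshape only the RHS matrix and run the reshaped-only-RHS kernel
            return GEMMLowpKernelType::RESHAPED_ONLY_RHS;
        }
        // Midgard keeps the legacy kernel; every other target queries the native
        // kernel configuration heuristic for the LHS/RHS block sizes.
        return is_midgard ? GEMMLowpKernelType::MIDGARD : GEMMLowpKernelType::NATIVE;
    }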

Change-Id: I15a8b39e0354d3b6686ed4cc8c361782c0512037
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1410
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: VidhyaSudhan Loganathan <vidhyasudhan.loganathan@arm.com>
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index 875e3a2..0286cb3 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/gemm/native/CLGEMMNativeKernelConfiguration.h"
 #include "arm_compute/core/CL/gemm/reshaped_only_rhs/CLGEMMReshapedOnlyRHSKernelConfiguration.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -48,7 +49,8 @@
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(std::move(memory_manager)),
-      _mm_kernel(),
+      _mm_midgard_kernel(),
+      _mm_native_kernel(),
       _mm_reshaped_only_rhs_kernel(),
       _mtx_b_reshape_kernel(),
       _mtx_a_reduction_kernel(),
@@ -63,6 +65,7 @@
       _a_offset(0),
       _b_offset(0),
       _is_gemm_reshaped(true),
+      _is_midgard(false),
       _reshape_b_only_on_first_run(false),
       _is_prepared(false),
       _fuse_output_stage(false)
@@ -84,7 +87,9 @@
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     // Set the target for the kernels
-    _mm_kernel.set_target(gpu_target);
+    _mm_midgard_kernel.set_target(gpu_target);
+    _mm_native_kernel.set_target(gpu_target);
+    _mm_reshaped_only_rhs_kernel.set_target(gpu_target);
 
     const ICLTensor *matrix_a = a;
     const ICLTensor *matrix_b = b;
@@ -103,6 +108,7 @@
 
     // Check if we need to reshape the matrix A and matrix B
     _is_gemm_reshaped = is_gemm_reshaped(_reshape_b_only_on_first_run, gpu_target);
+    _is_midgard       = gpu_target == GPUTarget::MIDGARD;
 
     if(_is_gemm_reshaped)
     {
@@ -159,8 +165,19 @@
         }
         else
         {
-            // Configure matrix multiply kernel
-            _mm_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            if(_is_midgard)
+            {
+                // Configure matrix multiply kernel
+                _mm_midgard_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            }
+            else
+            {
+                // Pick up the GEMM configuration
+                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+                // Configure matrix multiply kernel
+                _mm_native_kernel.configure(matrix_a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            }
         }
 
         // Configure offset contribution kernel
@@ -178,8 +195,19 @@
         }
         else
         {
-            // Configure matrix multiply kernel
-            _mm_kernel.configure(matrix_a, matrix_b, output, false, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            if(_is_midgard)
+            {
+                // Configure matrix multiply kernel
+                _mm_midgard_kernel.configure(matrix_a, matrix_b, output, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            }
+            else
+            {
+                // Pick up the GEMM configuration
+                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+                // Configure matrix multiply kernel
+                _mm_native_kernel.configure(matrix_a, matrix_b, output, lhs_info, rhs_info, GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d));
+            }
         }
 
         // Configure offset contribution kernel
@@ -232,6 +260,7 @@
     const unsigned int k                       = a->dimension(0);
     const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
     const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const bool         is_midgard              = gpu_target == GPUTarget::MIDGARD;
 
     bool reshape_matrix_b = is_gemm_reshaped(gemm_info.reshape_b_only_on_first_run(), CLScheduler::get().target());
 
@@ -287,9 +316,21 @@
             // Output tensor auto inizialitation if not yet initialized
             auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
 
-            // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, false, reshape_info));
+            if(is_midgard)
+            {
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, reshape_info));
+            }
+            else
+            {
+                // Pick up the GEMM configuration
+                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+            }
         }
+
         // Validate offset contribution kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
                                                                                             a_offset == 0 ? nullptr : &info_vector_sum_col,
@@ -308,9 +349,21 @@
         }
         else
         {
-            // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, false, reshape_info));
+            if(is_midgard)
+            {
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output, reshape_info));
+            }
+            else
+            {
+                // Pick up the GEMM configuration
+                std::tie(lhs_info, rhs_info) = CLGEMMNativeKernelConfigurationFactory::create(gpu_target)->configure(m, n, k, batch_size, DataType::QASYMM8);
+
+                // Validate matrix multiply
+                ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+            }
         }
+
         if(output->total_size() != 0)
         {
             // Validate offset contribution kernel
@@ -353,7 +406,14 @@
     }
     else
     {
-        CLScheduler::get().enqueue(_mm_kernel, false);
+        if(_is_midgard)
+        {
+            CLScheduler::get().enqueue(_mm_midgard_kernel, false);
+        }
+        else
+        {
+            CLScheduler::get().enqueue(_mm_native_kernel, false);
+        }
     }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0