Improve start-up timer for GeMM (floating-point):

 - Pass M,N,K at runtime as kernel parameters
 - Add a guard macro to compile only kernel of interest
 - Move reshaping kernels to gemm_utils.cl
 - Remove the fallback reshaping kernel with Y-Padding support

Resolves: COMPMID-4888
Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com>
Change-Id: Ida3851326f0b77e410633271de9ecca106e37931
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/6662
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index c47cf8e..f87b226 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -290,10 +290,10 @@
     { "gemm_mm_reshaped_only_rhs_t_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
     { "gemm_mm_reshaped_only_rhs_t_texture_post_act_eltwise_op_act", "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_reshaped_only_rhs.cl" },
     { "gemm_lc_vm_f32", "common/gemm.cl" },
-    { "gemm_reshape_lhs_matrix_nt", "common/gemm.cl" },
-    { "gemm_reshape_lhs_matrix_t", "common/gemm.cl" },
-    { "gemm_reshape_rhs_matrix_nt", "common/gemm.cl" },
-    { "gemm_reshape_rhs_matrix_t", "common/gemm.cl" },
+    { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" },
+    { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" },
+    { "gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl" },
+    { "gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl" },
     { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" },
     { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" },
     { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" },
@@ -590,6 +590,10 @@
 #include "./cl_kernels/common/gemm.clembed"
     },
     {
+        "common/gemm_utils.cl",
+#include "./cl_kernels/common/gemm_utils.clembed"
+    },
+    {
         "common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.cl",
 #include "./cl_kernels/common/experimental/gemm_fused_post_ops/act_eltwise_op_act/gemm_mm_native.clembed"
     },
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
index af79435..0598899 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -275,6 +275,9 @@
     // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
     // NOTE: This might have implications on heuristics and performance
     const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+    _m                             = internal_m;
+    _n                             = gemm_info.n;
+    _k                             = gemm_info.k;
 
     // Create build options
     CLBuildOptions build_opts;
@@ -289,9 +292,6 @@
     build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
-    build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
@@ -312,6 +312,9 @@
     std::string kernel_name("gemm_mm_native");
     post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
 
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
@@ -392,11 +395,11 @@
         unsigned int idx0;
         if(_add_bias)
         {
-            idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (4 + _num_post_op_args);
+            idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + (7 + _num_post_op_args);
         }
         else
         {
-            idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (3 + _num_post_op_args);
+            idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + (6 + _num_post_op_args);
         }
         const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -408,11 +411,11 @@
         unsigned int idx0;
         if(_add_bias)
         {
-            idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
+            idx0 = (4 + _num_post_op_args) * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
         }
         else
         {
-            idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
+            idx0 = (3 + _num_post_op_args) * num_arguments_per_2D_tensor() + 6 + (_reinterpret_input_as_3d ? 1 : 0) + _num_post_op_args;
         }
         const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
@@ -455,6 +458,12 @@
             const auto post_op_arg = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(experimental::get_post_op_arg_type(i)));
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(post_op_arg->info()->strides_in_bytes()[2]));
         }
+
+        // Pass m, n and k at runtime
+        _kernel.setArg<cl_int>(idx++, _m);
+        _kernel.setArg<cl_int>(idx++, _n);
+        _kernel.setArg<cl_int>(idx++, _k);
+
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
     }
     while(window.slide_window_slice_3D(slice));
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
index 415eb7b..e478df7 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -81,6 +81,9 @@
     bool         _reinterpret_output_as_3d{ false };
     bool         _use_dummy_work_items{ false };
     bool         _add_bias{ false };
+    signed int   _m{ 1 };
+    signed int   _n{ 1 };
+    signed int   _k{ 1 };
     unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
 };
 } // namespace kernels
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index 64e9933..6a450b6 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -201,7 +201,6 @@
     _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
     _add_bias                 = src2 != nullptr;
     _export_to_cl_image       = rhs_info.export_to_cl_image;
-    _k                        = gemm_info.k;
     _num_post_op_args         = gemm_info.post_ops.total_num_arguments();
 
     // Check if we need to slide the matrix B
@@ -230,6 +229,9 @@
 
     const unsigned int partial_store_m0 = internal_m % lhs_info.m0;
     const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+    _m                                  = gemm_info.m;
+    _n                                  = gemm_info.n;
+    _k                                  = gemm_info.k;
 
     // Create build options
     CLBuildOptions build_opts;
@@ -250,9 +252,6 @@
     build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));
-    build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -278,6 +277,9 @@
     kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
     post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
 
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
@@ -399,9 +401,6 @@
             add_2D_tensor_argument(idx, post_op_arg, slice);
         }
 
-        // K dimension (not used if _export_to_cl_image == true)
-        _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k));
-
         // LHS stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
 
@@ -429,6 +428,13 @@
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
         }
 
+        // Pass m, n and k at runtime
+        _kernel.setArg<cl_int>(idx++, _m);
+        _kernel.setArg<cl_int>(idx++, _n);
+
+        // K dimension (not used if _export_to_cl_image == true)
+        _kernel.setArg<cl_int>(idx++, _k);
+
         // Dispatch kernel
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
     }
@@ -436,4 +442,4 @@
 }
 } // namespace kernels
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
index 09160ec..2d668b9 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -105,7 +105,9 @@
     bool         _use_dummy_work_items{ false };
     bool         _add_bias{ false };
     bool         _export_to_cl_image{ false };
-    unsigned int _k{ 1 };
+    signed int   _m{ 1 };
+    signed int   _n{ 1 };
+    signed int   _k{ 1 };
     unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
 };
 } // namespace kernels
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
index aa80697..29f9180 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -240,7 +240,9 @@
     // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
     const unsigned int partial_store_m0 = internal_m % internal_m0;
     const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
-
+    _m                                  = internal_m;
+    _n                                  = gemm_info.n;
+    _k                                  = gemm_info.k;
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
@@ -253,9 +255,6 @@
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
     build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
     build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
-    build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
-    build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
-    build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
@@ -286,6 +285,9 @@
     kernel_name += rhs_info.export_to_cl_image ? "_texture" : "";
     post_op_utils.set_post_ops_cl_kernel_name(kernel_name, gemm_info.post_ops);
 
+    // A macro guard to compile ONLY the kernel of interest
+    build_opts.add_option("-D" + upper_string(kernel_name));
+
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
@@ -447,6 +449,11 @@
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
         }
 
+        // Pass m, n and k at runtime as signed ints, to ensure results of any subtractions they could be operands in would still be signed.
+        _kernel.setArg<cl_int>(idx++, _m);
+        _kernel.setArg<cl_int>(idx++, _n);
+        _kernel.setArg<cl_int>(idx++, _k);
+
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
     }
     while(window.slide_window_slice_3D(slice));
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
index a8f0c4c..ec5878d 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -97,6 +97,9 @@
     bool         _add_bias{ false };
     bool         _export_to_cl_image{ false };
     bool         _has_pad_y{ false };
+    signed int   _m{ 1 };
+    signed int   _n{ 1 };
+    signed int   _k{ 1 };
     unsigned int _num_post_op_args{ 0 }; // (EXPERIMENTAL_POST_OPS) total number of post op arguments
 };
 } // namespace kernels
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
index 50ecb21..5557385 100644
--- a/src/gpu/cl/operators/ClGemm.cpp
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -191,7 +191,6 @@
       _mm_native_kernel(std::make_unique<ClGemmMatrixMultiplyNativeKernel>()),
       _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()),
       _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
-      _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()),
       _tmp_a(),
       _tmp_b(),
       _reshape_b_only_on_first_run(false),
@@ -303,7 +302,6 @@
 
     // Set the target for the kernels
     _mm_reshaped_only_rhs_kernel->set_target(gpu_target);
-    _mm_reshaped_only_rhs_fallback_kernel->set_target(gpu_target);
 
     GEMMLHSMatrixInfo lhs_info{};
     GEMMRHSMatrixInfo rhs_info{};
@@ -322,10 +320,6 @@
     kernel_info.has_pad_y = false;
     _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
 
-    // Configure matrix multiply kernel with y padding support
-    kernel_info.has_pad_y = true;
-    _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
-
     // Request memory for RHS reshape matrix
     _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
 }
@@ -625,7 +619,7 @@
 
             if(has_pad_y)
             {
-                CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_fallback_kernel, gemm_reshaped_onlyrhs_pack, true);
+                ARM_COMPUTE_ERROR_ON(has_pad_y);
             }
             else
             {
diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h
index e084e53..3c0cad3 100644
--- a/src/gpu/cl/operators/ClGemm.h
+++ b/src/gpu/cl/operators/ClGemm.h
@@ -121,7 +121,6 @@
     std::unique_ptr<kernels::ClGemmMatrixMultiplyNativeKernel>          _mm_native_kernel;
     std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel>        _mm_reshaped_kernel;
     std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel;
-    std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_fallback_kernel;
     TensorInfo                                                          _tmp_a;
     TensorInfo                                                          _tmp_b;
     bool                                                                _reshape_b_only_on_first_run;