Fix LWS search space used by CLTuner

* Ensure CLTuner uses the real GWS used by run(), instead of the
  static GWS (which is usually changed at run time), by caching GWS in
  each kernel

  Note this is a somewhat inelegant workaround. The real issue stems
  from the fact that execution window and scheduler are very much
  coupled with our operator run() / run_op() method.
  (Please see COMPMID-5934)

* Restrict LWS values to explore within GWS bound for exhaustive mode

* Refactor gws_from_window() to include all the information required
  to calculate GWS

* Log lws search space used for tuning

* Fix ClDirectConv2dKernel config id

Resolves COMPMID-5892

Signed-off-by: SiCong Li <sicong.li@arm.com>
Change-Id: I420490d8b94d13ada2e44eb0a12078f883379334
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9193
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 109a076..dc3a86a 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -43,7 +43,7 @@
         ARM_COMPUTE_ERROR_ON((i >= 3) && ((window[i].end() - window[i].start()) != 1));
     }
 
-    cl::NDRange gws = ICLKernel::gws_from_window(window);
+    cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items);
 
     // Check for empty NDRange
     if(gws.dimensions() == 0)
@@ -51,12 +51,7 @@
         return;
     }
 
-    // Use dummy work-items
-    if(use_dummy_work_items)
-    {
-        gws.get()[0] = get_next_power_two(gws[0]);
-        gws.get()[1] = get_next_power_two(gws[1]);
-    }
+    kernel.cache_gws(gws);
 
     cl::NDRange valid_lws;
     if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
@@ -190,7 +185,7 @@
     return _max_workgroup_size;
 }
 
-cl::NDRange ICLKernel::gws_from_window(const Window &window)
+cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items)
 {
     if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
     {
@@ -201,6 +196,22 @@
                     (window.y().end() - window.y().start()) / window.y().step(),
                     (window.z().end() - window.z().start()) / window.z().step());
 
+    if(use_dummy_work_items)
+    {
+        gws.get()[0] = get_next_power_two(gws[0]);
+        gws.get()[1] = get_next_power_two(gws[1]);
+    }
+
     return gws;
 }
-} // namespace arm_compute
\ No newline at end of file
+
+cl::NDRange ICLKernel::get_cached_gws() const
+{
+    return _cached_gws;
+}
+
+void ICLKernel::cache_gws(const cl::NDRange &gws)
+{
+    _cached_gws = gws;
+}
+} // namespace arm_compute
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index 5d5b636..c82809c 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2022 Arm Limited.
+ * Copyright (c) 2016-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,9 +86,16 @@
         return 2 + 2 * dimension_size;
     }
 
-    cl::NDRange default_lws_tune(const Window &window)
+    /** Get default lws for the kernel
+     *
+     * @param[in] window               Execution window used by the kernel
+     * @param[in] use_dummy_work_items If the kernel uses dummy workloads
+     *
+     * @return    cl::NDRange
+     */
+    cl::NDRange default_lws_tune(const Window &window, bool use_dummy_work_items)
     {
-        return get_default_lws_for_type(_type, gws_from_window(window));
+        return get_default_lws_for_type(_type, gws_from_window(window, use_dummy_work_items));
     }
 
     using IKernel::configure; //Prevent children from calling IKernel::configure() directly
@@ -115,7 +122,9 @@
 
         if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
         {
-            _tuning_params_hint.set_lws(default_lws_tune(window));
+            // Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which
+            // will be recalculated with use_dummy_work_items flag at run time again anyway.
+            _tuning_params_hint.set_lws(default_lws_tune(window, false /* use_dummy_work_items */));
         }
 
         IKernel::configure(window);
@@ -124,7 +133,7 @@
 public:
     /** Constructor */
     ICLKernel()
-        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint()
+        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint(), _cached_gws(cl::NullRange)
     {
     }
     /** Returns a reference to the OpenCL kernel of this object.
@@ -431,11 +440,24 @@
     size_t get_max_workgroup_size();
     /** Get the global work size given an execution window
      *
-     * @param[in] window Execution window
+     * @param[in] window               Execution window
+     * @param[in] use_dummy_work_items If the kernel uses dummy work items
      *
      * @return Global work size of the given execution window
      */
-    static cl::NDRange gws_from_window(const Window &window);
+    static cl::NDRange gws_from_window(const Window &window, bool use_dummy_work_items);
+
+    /** Get the cached gws used to enqueue this kernel
+     *
+     * @return Latest global work size of the kernel
+     */
+    cl::NDRange get_cached_gws() const;
+
+    /** Cache the latest gws used to enqueue this kernel
+     *
+     * @param[in] gws Latest global work size of the kernel
+     */
+    void cache_gws(const cl::NDRange &gws);
 
 private:
     /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
@@ -465,6 +487,7 @@
     CLKernelType _type;               /**< The CL kernel type */
 private:
     CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */
+    cl::NDRange    _cached_gws;         /**< Latest GWS used to enqueue this kernel */
 };
 
 /** Add the kernel to the command queue with the given window.