Add WBSM tuning to CLTuner

Add WBSM (workgroup batch size modifier) as a possible parameter to be tuned
Add helper functions to check WBSM support and to set the value in the kernel
Update tuning parameter lists to use WBSM
Update CLTuner to use WBSM
The WBSM tuning is exposed as a parameter to be set at compile time by setting the CLTuningInfo
CLTuningInfo contains information about the tuning mode and whether WBSM tuning is enabled

Resolves: COMPMID-3936

Change-Id: Id53697c9c6d2cef41c049f368002f6197351b3ed
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4914
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/CL/CLCompileContext.h b/arm_compute/core/CL/CLCompileContext.h
index 6f6dc18..46a8c9b 100644
--- a/arm_compute/core/CL/CLCompileContext.h
+++ b/arm_compute/core/CL/CLCompileContext.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -296,6 +296,12 @@
      */
     bool int64_base_atomics_supported() const;
 
+    /** Returns true if the workgroup batch size modifier parameter is supported on the cl device
+     *
+     * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+     */
+    bool is_wbsm_supported() const;
+
 private:
     /** Load program and its dependencies.
      *
@@ -327,6 +333,7 @@
     CLDevice    _device;                                              /**< Underlying CL device. */
     mutable std::map<std::string, const Program> _programs_map;       /**< Map with all already loaded program data. */
     mutable std::map<std::string, cl::Program>   _built_programs_map; /**< Map with all already built program data. */
+    bool _is_wbsm_supported;                                          /**< True if the workgroup batch size modifier is supported on the cl device */
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLCOMPILECONTEXT_H */
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index cf18e16..0e9aa5d 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,6 +30,11 @@
 #include <set>
 #include <string>
 
+/* CL Device capabilities */
+#define ARM_COMPUTE_LIBRARY_OPENCL_DEVICE_CAPABILITIES_ARM 0x41E4
+/* Workgroup Batch Size Modifier */
+#define ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM 0x41E6
+
 namespace arm_compute
 {
 class CLCoreRuntimeContext;
@@ -226,5 +231,20 @@
  */
 cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size);
 
+/** Helper function to check if the workgroup batch size modifier parameter is supported on the cl device
+ *
+ * @param[in] device cl device to check for support
+ *
+ * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+ */
+bool get_wbsm_support_info(const cl::Device &device);
+
+/** Helper function to set the workgroup batch size modifier parameter in the kernel
+ *
+ * @param[in] kernel    cl kernel to set the workgroup batch size modifier parameter
+ * @param[in] wbsm_hint workgroup batch size modifier to use
+ */
+void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint);
+
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLHELPERS_H */
diff --git a/arm_compute/core/CL/CLKernelLibrary.h b/arm_compute/core/CL/CLKernelLibrary.h
index 1933893..0d8e4a6 100644
--- a/arm_compute/core/CL/CLKernelLibrary.h
+++ b/arm_compute/core/CL/CLKernelLibrary.h
@@ -148,6 +148,12 @@
      */
     std::string get_program_name(const std::string &kernel_name) const;
 
+    /** Returns true if the workgroup batch size modifier parameter is supported on the cl device
+     *
+     * @return true if the workgroup batch size modifier parameter is supported, false otherwise
+     */
+    bool is_wbsm_supported();
+
     /** Sets the CL context used to create programs.
      *
      * @note Setting the context also resets the device to the
diff --git a/arm_compute/core/CL/OpenCL.h b/arm_compute/core/CL/OpenCL.h
index f9796d7..155c3e4 100644
--- a/arm_compute/core/CL/OpenCL.h
+++ b/arm_compute/core/CL/OpenCL.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -135,6 +135,7 @@
     DECLARE_FUNCTION_PTR(clEnqueueMarker);
     DECLARE_FUNCTION_PTR(clWaitForEvents);
     DECLARE_FUNCTION_PTR(clCreateImage);
+    DECLARE_FUNCTION_PTR(clSetKernelExecInfo);
 
     // Third-party extensions
     DECLARE_FUNCTION_PTR(clImportMemoryARM);
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index 9814867..e1c98bf 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -182,7 +182,6 @@
     cl::Event    _kernel_event;
     bool         _tune_new_kernels;
     CLTuningInfo _tuning_info;
-    CLTunerMode  _tuner_mode;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLTUNER_H */
diff --git a/arm_compute/runtime/CL/CLTunerTypes.h b/arm_compute/runtime/CL/CLTunerTypes.h
index 49e2d61..e93ef5b 100644
--- a/arm_compute/runtime/CL/CLTunerTypes.h
+++ b/arm_compute/runtime/CL/CLTunerTypes.h
@@ -42,7 +42,10 @@
 /**< OpenCL tuner tuning information */
 struct CLTuningInfo
 {
-    bool tune_lws = true;
+    CLTunerMode tuner_mode = CLTunerMode::NORMAL; /**< Parameter to select the level (granularity) of the tuning */
+    bool        tune_wbsm  = false;               /**< Flag to tune the batches of work groups distributed to compute units.
+                                                       Internally, the library will check if this feature is available on
+                                                       the target platform */
 };
 
 /** Converts a string to a strong types enumeration @ref CLTunerMode
diff --git a/arm_compute/runtime/CL/CLTuningParams.h b/arm_compute/runtime/CL/CLTuningParams.h
index 99a3866..b504813 100644
--- a/arm_compute/runtime/CL/CLTuningParams.h
+++ b/arm_compute/runtime/CL/CLTuningParams.h
@@ -25,6 +25,10 @@
 #define ARM_COMPUTE_CLTUNING_PARAMS_H
 
 #include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/runtime/CL/CLTunerTypes.h"
+#include "support/StringSupport.h"
+
+#include <ostream>
 
 namespace arm_compute
 {
@@ -34,26 +38,95 @@
 public:
     CLTuningParams(const CLTuningParams &) = default;
 
-    CLTuningParams(unsigned int lws_x = 0, unsigned int lws_y = 0, unsigned int lws_z = 0)
-        : _lws(lws_x, lws_y, lws_z)
+    CLTuningParams(unsigned int lws_x = 0, unsigned int lws_y = 0, unsigned int lws_z = 0, int wbsm = 0)
+        : _lws(lws_x, lws_y, lws_z), _wbsm(wbsm)
     {
     }
-    CLTuningParams(cl::NDRange lws)
-        : _lws(lws)
+    CLTuningParams(cl::NDRange lws, cl_int wbsm = 0)
+        : _lws(lws), _wbsm(wbsm)
     {
     }
-    void set_lws(cl::NDRange &lws)
+
+    CLTuningParams(cl_int wbsm)
+        : CLTuningParams(cl::NullRange, wbsm)
+    {
+    }
+
+    void set_lws(cl::NDRange lws)
     {
         _lws = lws;
     }
 
-    cl::NDRange get_lws()
+    cl::NDRange get_lws() const
     {
         return _lws;
     }
 
+    void set_wbsm(cl_int wbsm)
+    {
+        _wbsm = wbsm;
+    }
+
+    cl_int get_wbsm() const
+    {
+        return _wbsm;
+    }
+
+    std::string to_string(CLTuningInfo tuning_info)
+    {
+        std::string tuning_params_string = "";
+        tuning_params_string += ";" + support::cpp11::to_string(_lws[0]) + ";" + support::cpp11::to_string(_lws[1]) + ";" + support::cpp11::to_string(_lws[2]);
+        if(tuning_info.tune_wbsm)
+        {
+            tuning_params_string += ";" + support::cpp11::to_string(_wbsm);
+        }
+        return tuning_params_string;
+    }
+
+    bool from_string(CLTuningInfo tuning_info, std::string tuning_params_string)
+    {
+        std::replace(tuning_params_string.begin(), tuning_params_string.end(), ';', ' ');
+        std::vector<std::string> array;
+        std::stringstream        ss(tuning_params_string);
+        std::string              temp;
+        while(ss >> temp)
+        {
+            array.push_back(temp);
+        }
+        // Read 3 values for lws
+        if(array.size() < 3)
+        {
+            return false;
+        }
+        const unsigned int lws_0 = support::cpp11::stoi(array[0]);
+        const unsigned int lws_1 = support::cpp11::stoi(array[1]);
+        const unsigned int lws_2 = support::cpp11::stoi(array[2]);
+        if(lws_0 == 0 && lws_1 == 0 && lws_2 == 0)
+        {
+            // If lws values are 0, cl::NullRange has to be used
+            // otherwise the lws object will be badly created
+            _lws = cl::NullRange;
+        }
+        else
+        {
+            _lws = cl::NDRange(lws_0, lws_1, lws_2);
+        }
+        array.erase(array.begin(), array.begin() + 3);
+        if(tuning_info.tune_wbsm)
+        {
+            if(array.size() < 1)
+            {
+                return false;
+            }
+            _wbsm = support::cpp11::stoi(array[0]);
+            array.erase(array.begin());
+        }
+        return true;
+    }
+
 private:
     cl::NDRange _lws;
+    cl_int      _wbsm;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLTUNING_PARAMS_H */
diff --git a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
index c51b990..69572c9 100644
--- a/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
+++ b/arm_compute/runtime/CL/tuners/CLTuningParametersList.h
@@ -77,9 +77,12 @@
 
 /** Construct an ICLTuningParametersList object for the given tuner mode and gws configuration.
  *
+ * @param[in] tuning_info Tuning info containing which parameters to tune and the tuner mode
+ * @param[in] gws         Global worksize values
+ *
  * @return unique_ptr to the requested ICLTuningParametersList implementation.
  */
-std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTunerMode mode, const cl::NDRange &gws);
+std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws);
 
 } // namespace cl_tuner
 } // namespace arm_compute