Add WBSM tuning to CLTuner

Add WBSM as possible parameter to be tuned
Add helper functions to check WBSM support and setting the value in the kernel
Update tuning parameter lists to use WBSM
Update CLTuner to use WBSM
The WBSM tuning is exposed as a parameter to be set at compile time by setting the CLTuningInfo
CLTuningInfo contains information about the tuning mode and if wbsm tuning enabled

Resolves: COMPMID-3936

Change-Id: Id53697c9c6d2cef41c049f368002f6197351b3ed
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4914
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index 0afb7e5..3db0fe5 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -137,15 +137,16 @@
 {
 }
 CLCompileContext::CLCompileContext()
-    : _context(), _device(), _programs_map(), _built_programs_map()
+    : _context(), _device(), _programs_map(), _built_programs_map(), _is_wbsm_supported()
 {
 }
 
 CLCompileContext::CLCompileContext(cl::Context context, const cl::Device &device)
-    : _context(), _device(), _programs_map(), _built_programs_map()
+    : _context(), _device(), _programs_map(), _built_programs_map(), _is_wbsm_supported()
 {
-    _context = std::move(context);
-    _device  = CLDevice(device);
+    _context           = std::move(context);
+    _device            = CLDevice(device);
+    _is_wbsm_supported = get_wbsm_support_info(device);
 }
 
 Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source,
@@ -318,7 +319,8 @@
 
 void CLCompileContext::set_device(cl::Device device)
 {
-    _device = std::move(device);
+    _device            = std::move(device);
+    _is_wbsm_supported = get_wbsm_support_info(device);
 }
 
 cl::NDRange CLCompileContext::default_ndrange() const
@@ -346,6 +348,11 @@
     return _device.supported("cl_khr_int64_base_atomics");
 }
 
+bool CLCompileContext::is_wbsm_supported() const
+{
+    return _is_wbsm_supported;
+}
+
 size_t CLCompileContext::max_local_workgroup_size(const cl::Kernel &kernel) const
 {
     size_t result;
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 895bb72..aff8977 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -415,4 +415,26 @@
     const unsigned int num_of_threads = ((input_dimension + border_width) / 16);
     return cl::NDRange(std::min(8U, num_of_threads));
 }
+
+bool get_wbsm_support_info(const cl::Device &device)
+{
+    cl_bitfield capabilities = 0;
+    cl_int      err          = clGetDeviceInfo(device.get(), ARM_COMPUTE_LIBRARY_OPENCL_DEVICE_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr);
+    if((err == CL_SUCCESS) && (capabilities & ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM))
+    {
+        return true;
+    }
+    return false;
+}
+
+void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint)
+{
+    cl_int err = clSetKernelExecInfo(kernel.get(),
+                                     ARM_COMPUTE_LIBRARY_OPENCL_EXEC_WBSM_ARM,
+                                     sizeof(cl_int),
+                                     &wbsm_hint);
+    ARM_COMPUTE_UNUSED(err);
+    ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS);
+}
+
 } // namespace arm_compute
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index cf1c52e..75f76ea 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -1206,6 +1206,11 @@
     return _compile_context.int64_base_atomics_supported();
 }
 
+bool CLKernelLibrary::is_wbsm_supported()
+{
+    return _compile_context.is_wbsm_supported();
+}
+
 std::pair<std::string, bool> CLKernelLibrary::get_program(const std::string &program_name) const
 {
 #ifdef EMBEDDED_KERNELS
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 2b259bf..1c6963f 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,8 +29,6 @@
 
 #include <cstddef>
 
-using namespace arm_compute;
-
 void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items)
 {
     if(kernel.kernel()() == nullptr)
@@ -77,9 +75,15 @@
         lws = valid_lws;
     }
 
+    if(CLKernelLibrary::get().is_wbsm_supported())
+    {
+        set_wbsm(kernel.kernel(), kernel.wbsm_hint());
+    }
     queue.enqueueNDRangeKernel(kernel.kernel(), cl::NullRange, gws, lws);
 }
 
+namespace arm_compute
+{
 template <unsigned int dimension_size>
 void ICLKernel::add_tensor_argument(unsigned &idx, const ICLTensor *tensor, const Window &window)
 {
@@ -146,3 +150,4 @@
 
     return gws;
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index a24cd8c..6737109 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -31,6 +31,7 @@
 #include "arm_compute/core/IKernel.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/CL/CLTuningParams.h"
 
 #include <string>
 
@@ -67,19 +68,30 @@
 protected:
     /** Configure the kernel's window and local workgroup size hint.
      *
-     * @param[in] window   The maximum window which will be returned by window()
-     * @param[in] lws_hint (Optional) Local-Workgroup-Size to use.
+     * @param[in] window    The maximum window which will be returned by window()
+     * @param[in] lws_hint  Local-Workgroup-Size to use.
+     * @param[in] wbsm_hint (Optional) Workgroup-Batch-Size-Modifier to use.
      */
-    void configure_internal(const Window &window, cl::NDRange lws_hint = CLKernelLibrary::get().default_ndrange())
+    void configure_internal(const Window &window, cl::NDRange lws_hint, cl_int wbsm_hint = 0)
     {
-        _lws_hint = lws_hint;
+        configure_internal(window, CLTuningParams(lws_hint, wbsm_hint));
+    }
+
+    /** Configure the kernel's window and tuning parameters hints.
+     *
+     * @param[in] window             The maximum window which will be returned by window()
+     * @param[in] tuning_params_hint (Optional) Tuning parameters to use.
+     */
+    void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0))
+    {
+        _tuning_params_hint = tuning_params_hint;
         IKernel::configure(window);
     }
 
 public:
     /** Constructor */
     ICLKernel()
-        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _lws_hint()
+        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _tuning_params_hint()
     {
     }
     /** Returns a reference to the OpenCL kernel of this object.
@@ -254,7 +266,7 @@
     void set_lws_hint(const cl::NDRange &lws_hint)
     {
         ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // lws_hint will be overwritten by configure()
-        _lws_hint = lws_hint;
+        _tuning_params_hint.set_lws(lws_hint);
     }
 
     /** Return the Local-Workgroup-Size hint
@@ -263,7 +275,28 @@
      */
     cl::NDRange lws_hint() const
     {
-        return _lws_hint;
+        return _tuning_params_hint.get_lws();
+    }
+
+    /** Set the workgroup batch size modifier hint
+     *
+     * @note This method should be called after the configuration of the kernel
+     *
+     * @param[in] wbsm_hint workgroup batch size modifier value
+     */
+    void set_wbsm_hint(const cl_int &wbsm_hint)
+    {
+        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); // wbsm_hint will be overwritten by configure()
+        _tuning_params_hint.set_wbsm(wbsm_hint);
+    }
+
+    /** Return the workgroup batch size modifier hint
+     *
+     * @return Current wbsm hint
+     */
+    cl_int wbsm_hint() const
+    {
+        return _tuning_params_hint.get_wbsm();
     }
 
     /** Get the configuration ID
@@ -344,7 +377,7 @@
     std::string _config_id;          /**< Configuration ID */
     size_t      _max_workgroup_size; /**< The maximum workgroup size for this kernel */
 private:
-    cl::NDRange _lws_hint; /**< Local workgroup size hint for the OpenCL kernel */
+    CLTuningParams _tuning_params_hint; /**< Tuning parameters hint for the OpenCL kernel */
 };
 
 /** Add the kernel to the command queue with the given window.
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index 6c70861..aff6285 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -134,6 +134,7 @@
     LOAD_FUNCTION_PTR(clEnqueueMarker, handle);
     LOAD_FUNCTION_PTR(clWaitForEvents, handle);
     LOAD_FUNCTION_PTR(clCreateImage, handle);
+    LOAD_FUNCTION_PTR(clSetKernelExecInfo, handle);
 
     // Third-party extensions
     LOAD_FUNCTION_PTR(clImportMemoryARM, handle);
@@ -962,6 +963,23 @@
     }
 }
 
+cl_int clSetKernelExecInfo(cl_kernel           kernel,
+                           cl_kernel_exec_info param_name,
+                           size_t              param_value_size,
+                           const void         *param_value)
+{
+    arm_compute::CLSymbols::get().load_default();
+    auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr;
+    if(func != nullptr)
+    {
+        return func(kernel, param_name, param_value_size, param_value);
+    }
+    else
+    {
+        return CL_OUT_OF_RESOURCES;
+    }
+}
+
 cl_mem
 clImportMemoryARM(cl_context                      context,
                   cl_mem_flags                    flags,
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 9060217..e16d680 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -32,13 +32,11 @@
 #include <cerrno>
 #include <fstream>
 #include <limits>
-#include <memory>
-#include <string>
 
 namespace arm_compute
 {
 CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info)
-    : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info), _tuner_mode(CLTunerMode::NORMAL)
+    : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info)
 {
 }
 
@@ -62,12 +60,12 @@
 
 void CLTuner::set_tuner_mode(CLTunerMode mode)
 {
-    _tuner_mode = mode;
+    _tuning_info.tuner_mode = mode;
 }
 
 CLTunerMode CLTuner::get_tuner_mode() const
 {
-    return _tuner_mode;
+    return _tuning_info.tuner_mode;
 }
 
 void CLTuner::tune_kernel_static(ICLKernel &kernel)
@@ -103,12 +101,20 @@
 
                 // Set Local-Workgroup-Size
                 kernel.set_lws_hint(opt_tuning_params.get_lws());
+                if(_tuning_info.tune_wbsm)
+                {
+                    kernel.set_wbsm_hint(opt_tuning_params.get_wbsm());
+                }
             }
         }
         else
         {
             // Set Local-Workgroup-Size
             kernel.set_lws_hint(p->second.get_lws());
+            if(_tuning_info.tune_wbsm)
+            {
+                kernel.set_wbsm_hint(p->second.get_wbsm());
+            }
         }
     }
 }
@@ -188,13 +194,15 @@
     cl_ulong       min_exec_time = end - start;
     _kernel_event                = nullptr;
 
-    cl::NDRange opt_lws = cl::NullRange;
+    CLTuningParams opt_tuning_params(cl::NullRange, 0);
 
     // Construct the list of tuning parameters values to be tested based on the tuner mode.
-    auto lws_list = cl_tuner::get_tuning_parameters_list(_tuner_mode, gws);
-    for(size_t i = 0; i < lws_list->size(); ++i)
+    auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws);
+    for(size_t i = 0; i < tuning_list->size(); ++i)
     {
-        cl::NDRange lws_test    = (*lws_list)[i].get_lws();
+        CLTuningParams tuning_test = (*tuning_list)[i];
+        // Setting the lws
+        cl::NDRange lws_test    = tuning_test.get_lws();
         auto        x           = lws_test[0];
         auto        y           = lws_test[1];
         auto        z           = lws_test[2];
@@ -205,8 +213,12 @@
             continue;
         }
 
-        //Set the Local-Workgroup-Size
         kernel.set_lws_hint(lws_test);
+        if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
+        {
+            cl_int wbsm_test = tuning_test.get_wbsm();
+            kernel.set_wbsm_hint(wbsm_test);
+        }
 
         // Run the kernel
         inject_memory ? kernel.run_op(tensors, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
@@ -222,13 +234,17 @@
         if(diff < min_exec_time)
         {
             min_exec_time = diff;
-            opt_lws       = cl::NDRange(x, y, z);
+            opt_tuning_params.set_lws(tuning_test.get_lws());
+            if(_tuning_info.tune_wbsm)
+            {
+                opt_tuning_params.set_wbsm(tuning_test.get_wbsm());
+            }
         }
     }
 
     // Restore real function
     CLSymbols::get().clEnqueueNDRangeKernel_ptr = real_clEnqueueNDRangeKernel;
-    return CLTuningParams(opt_lws);
+    return opt_tuning_params;
 }
 
 void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table)
@@ -271,34 +287,46 @@
         ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno);
     }
     std::string line;
+    bool        header_line = true;
     while(!std::getline(fs, line).fail())
     {
-        std::istringstream ss(line);
-        std::string        token;
-        if(std::getline(ss, token, ';').fail())
+        if(header_line)
         {
-            ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str());
-        }
-        std::string kernel_id = token;
-        cl::NDRange lws(1, 1, 1);
-        for(int i = 0; i < 3; i++)
-        {
-            if(std::getline(ss, token, ';').fail())
+            header_line            = false;
+            size_t pos_lws         = line.find("lws");
+            size_t pos_wbsm        = line.find("wbsm");
+            _tuning_info.tune_wbsm = false;
+            if(pos_lws != std::string::npos || pos_wbsm != std::string::npos)
             {
-                ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s (Should be of the form 'kernel_id;lws[0];lws[1];lws[2]')", ss.str().c_str(), filename.c_str());
+                // The file has in the first line the parameters it has been tuned on
+                if(pos_wbsm != std::string::npos)
+                {
+                    _tuning_info.tune_wbsm = true;
+                }
+                // Once the line with the tuning parameter is read we can
+                // read the next one to start collecting the values
+                if(std::getline(fs, line).fail())
+                {
+                    break;
+                }
             }
-            lws.get()[i] = support::cpp11::stoi(token);
         }
 
-        // If all dimensions are 0: reset to NullRange (i.e nullptr)
-        if(lws[0] == 0 && lws[1] == 0 && lws[2] == 0)
+        CLTuningParams tuning_params;
+        size_t         pos = line.find(";");
+        if(pos == std::string::npos)
         {
-            lws = cl::NullRange;
+            ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
         }
-        add_tuning_params(kernel_id, lws);
+        std::string kernel_id = line.substr(0, pos);
+        line.erase(0, pos + 1);
+        if(!tuning_params.from_string(_tuning_info, line))
+        {
+            ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
+        }
+        add_tuning_params(kernel_id, tuning_params);
     }
     fs.close();
-    _tuning_info.tune_lws = true;
 }
 
 bool CLTuner::save_to_file(const std::string &filename) const
@@ -307,14 +335,24 @@
     {
         return false;
     }
-
     std::ofstream fs;
     fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
     fs.open(filename, std::ios::out);
+    std::string header_string = "";
+    header_string += "lws";
+    if(_tuning_info.tune_wbsm)
+    {
+        if(!header_string.empty())
+        {
+            header_string += " ";
+        }
+        header_string += "wbsm";
+    }
+    fs << header_string << std::endl;
     for(auto const &kernel_data : _tuning_params_table)
     {
-        const cl::NDRange lws = CLTuningParams(kernel_data.second).get_lws();
-        fs << kernel_data.first << ";" << lws[0] << ";" << lws[1] << ";" << lws[2] << std::endl;
+        CLTuningParams tun_pams(kernel_data.second);
+        fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl;
     }
     fs.close();
     return true;
diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp
index 7f63078..6cb2212 100644
--- a/src/runtime/CL/tuners/CLTuningParametersList.cpp
+++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp
@@ -35,8 +35,14 @@
 class CLTuningParametersList : public ICLTuningParametersList
 {
 protected:
-    /* Shape of 3-D search space */
-    TensorShape search_space_shape{ 0, 0, 0 };
+    /* Shape of 4-D search space */
+    TensorShape               search_space_shape{ 0, 0, 0, 0 };
+    std::vector<unsigned int> _lws_x{ 0 };
+    std::vector<unsigned int> _lws_y{ 0 };
+    std::vector<unsigned int> _lws_z{ 0 };
+    std::vector<int>          _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units.
+                                             The value is in the range [-31,+31].
+                                             When 0, the runtime-selected wbs used is unmodified. */
 
     /** Constructor */
     CLTuningParametersList() = default;
@@ -62,7 +68,7 @@
     /** Prevent default constructor calls */
     CLTuningParametersListExhaustive() = delete;
     /** Constructor */
-    CLTuningParametersListExhaustive(const cl::NDRange &gws);
+    CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info);
     /** Copy Constructor */
     CLTuningParametersListExhaustive(const CLTuningParametersListExhaustive &) = default;
     /** Move Constructor */
@@ -83,7 +89,7 @@
 {
 public:
     /** Constructor */
-    CLTuningParametersListNormal(const cl::NDRange &gws);
+    CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info);
     /** Copy Constructor */
     CLTuningParametersListNormal(const CLTuningParametersListNormal &) = default;
     /** Move Constructor */
@@ -98,11 +104,6 @@
     // Inherited methods overridden:
     CLTuningParams operator[](size_t) override;
 
-protected:
-    std::vector<unsigned int> _lws_x{};
-    std::vector<unsigned int> _lws_y{};
-    std::vector<unsigned int> _lws_z{};
-
     /** Prevent default constructor calls */
     CLTuningParametersListNormal() = default;
 
@@ -125,7 +126,7 @@
     /** Prevent default constructor calls */
     CLTuningParametersListRapid() = delete;
     /** Constructor */
-    CLTuningParametersListRapid(const cl::NDRange &gws);
+    CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info);
     /** Copy Constructor */
     CLTuningParametersListRapid(const CLTuningParametersListRapid &) = default;
     /** Move Constructor */
@@ -156,36 +157,53 @@
 {
     ARM_COMPUTE_ERROR_ON(index >= size());
     auto coords = index2coords(search_space_shape, index);
-    return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U);
+    return CLTuningParams(coords[0] + 1U, coords[1] + 1U, coords[2] + 1U, static_cast<int>(coords[3]));
 }
 
-CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws)
+CLTuningParametersListExhaustive::CLTuningParametersListExhaustive(const cl::NDRange &gws, CLTuningInfo tuning_info)
 {
     ARM_COMPUTE_UNUSED(gws);
-    search_space_shape = TensorShape(max_lws_supported_x,
-                                     max_lws_supported_y,
-                                     max_lws_supported_z);
+    search_space_shape[0] = max_lws_supported_x;
+    search_space_shape[1] = max_lws_supported_y;
+    search_space_shape[2] = max_lws_supported_z;
+    search_space_shape[3] = 1;
+    if(tuning_info.tune_wbsm)
+    {
+        _wbsm                 = { -3, -2, -1, 0, 1, 2, 3 };
+        search_space_shape[3] = _wbsm.size();
+    }
 }
 
 CLTuningParams CLTuningParametersListNormal::operator[](size_t index)
 {
     ARM_COMPUTE_ERROR_ON(index >= size());
     auto coords = index2coords(search_space_shape, index);
-    return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]]);
+    return CLTuningParams(_lws_x[coords[0]], _lws_y[coords[1]], _lws_z[coords[2]], _wbsm[coords[3]]);
 }
 
-CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws)
+CLTuningParametersListNormal::CLTuningParametersListNormal(const cl::NDRange &gws, CLTuningInfo tuning_info)
 {
     auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), max_lws_supported_x);
     auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), max_lws_supported_y);
     auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), max_lws_supported_z);
 
-    // Initialize the LWS values to test
+    // Initialize the tuning parameters values to test
+    _lws_x = {};
+    _lws_y = {};
+    _lws_z = {};
     initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
     initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
     initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
 
-    search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+    search_space_shape[0] = _lws_x.size();
+    search_space_shape[1] = _lws_y.size();
+    search_space_shape[2] = _lws_z.size();
+    search_space_shape[3] = 1;
+    if(tuning_info.tune_wbsm)
+    {
+        _wbsm                 = { -2, -1, 0, 1, 2 };
+        search_space_shape[3] = _wbsm.size();
+    }
 }
 
 void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
@@ -207,18 +225,29 @@
     }
 }
 
-CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws)
+CLTuningParametersListRapid::CLTuningParametersListRapid(const cl::NDRange &gws, CLTuningInfo tuning_info)
 {
     auto lws_x_max = std::min(static_cast<unsigned int>(gws[0]), 8u); // Limit exploration to 1 - 8
     auto lws_y_max = std::min(static_cast<unsigned int>(gws[1]), 4u); // Limit exploration to 1 - 4
     auto lws_z_max = std::min(static_cast<unsigned int>(gws[2]), 4u); // Limit exploration to 1 - 4
 
     // Initialize the LWS values to test
+    _lws_x = {};
+    _lws_y = {};
+    _lws_z = {};
     initialize_lws_values(_lws_x, lws_x_max);
     initialize_lws_values(_lws_y, lws_y_max);
     initialize_lws_values(_lws_z, lws_z_max);
 
-    search_space_shape = TensorShape(_lws_x.size(), _lws_y.size(), _lws_z.size());
+    search_space_shape[0] = _lws_x.size();
+    search_space_shape[1] = _lws_y.size();
+    search_space_shape[2] = _lws_z.size();
+    search_space_shape[3] = 1;
+    if(tuning_info.tune_wbsm)
+    {
+        _wbsm                 = { -1, 0, 1 };
+        search_space_shape[3] = _wbsm.size();
+    }
 }
 
 void CLTuningParametersListRapid::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int lws_max)
@@ -231,16 +260,16 @@
     }
 }
 
-std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTunerMode mode, const cl::NDRange &gws)
+std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws)
 {
-    switch(mode)
+    switch(tuning_info.tuner_mode)
     {
         case CLTunerMode::EXHAUSTIVE:
-            return std::make_unique<CLTuningParametersListExhaustive>(gws);
+            return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info);
         case CLTunerMode::NORMAL:
-            return std::make_unique<CLTuningParametersListNormal>(gws);
+            return std::make_unique<CLTuningParametersListNormal>(gws, tuning_info);
         case CLTunerMode::RAPID:
-            return std::make_unique<CLTuningParametersListRapid>(gws);
+            return std::make_unique<CLTuningParametersListRapid>(gws, tuning_info);
         default:
             return nullptr;
     }