COMPMID-524 - Implemented CLTuner object

Change-Id: Idbdbecca1fc299ed042936119d90e2bed8db0938
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/87101
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index 7a95374..b0ac40a 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -60,7 +60,7 @@
 }
 
 ICLKernel::ICLKernel()
-    : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD)
+    : _kernel(nullptr), _lws_hint(CLKernelLibrary::get().default_ndrange()), _target(GPUTarget::MIDGARD), _config_id("")
 {
 }
 
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 75e6d5e..4224d9b 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -230,6 +230,24 @@
 
         ICLKernel::configure(win);
     }
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "direct_convolution_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(kernel_size);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_conv_pad_x);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_conv_pad_y);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_conv_stride_x);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(_conv_stride_y);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
 void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index 5b6e0ec..268260b 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -81,6 +81,14 @@
     output_access.set_valid_region(win, input->info()->valid_region());
 
     ICLKernel::configure(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "interleave4x4_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
 void CLGEMMInterleave4x4Kernel::run(const Window &window, cl::CommandQueue &queue)
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 684e323..b184c50 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -157,6 +157,17 @@
         output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape()));
 
         ICLKernel::configure(win);
+
+        // Set config_id for enabling LWS tuning
+        _config_id = "gemm_";
+        _config_id += (is_interleaved_transposed ? "reshaped_" : "");
+        _config_id += lower_string(string_from_data_type(input0->info()->data_type()));
+        _config_id += "_";
+        _config_id += support::cpp11::to_string(output->info()->dimension(1));
+        _config_id += "_";
+        _config_id += support::cpp11::to_string(output->info()->dimension(0));
+        _config_id += "_";
+        _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1)));
     }
 }
 
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 3d21a9e..98a799f 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -123,6 +123,15 @@
     }
 
     ICLKernel::configure(win);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "im2col_";
+    _config_id += (run_img2col_reduced ? "reduced_" : "");
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
 void CLIm2ColKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -160,9 +169,6 @@
     slice_out.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 1));
     slice_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
 
-    // Set the local-workgroup size
-    _lws_hint = cl::NDRange(4, 4, 4);
-
     do
     {
         unsigned int idx = 0;
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index f413f62..71a749f 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -24,11 +24,12 @@
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 #include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLTuner.h"
 
 using namespace arm_compute;
 
 CLScheduler::CLScheduler()
-    : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false)
+    : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner()
 {
 }
 
@@ -44,10 +45,18 @@
                              "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
                              or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
 
+    // Tune the kernel if the CLTuner has been provided
+    if(_cl_tuner != nullptr)
+    {
+        // Tune the OpenCL kernel
+        _cl_tuner->tune_kernel(kernel);
+    }
+
+    // Run kernel
     kernel.run(kernel.window(), _queue);
 
     if(flush)
     {
         _queue.flush();
     }
-}
+}
\ No newline at end of file
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
new file mode 100644
index 0000000..f3300d3
--- /dev/null
+++ b/src/runtime/CL/CLTuner.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/CLTuner.h"
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include <chrono>
+#include <limits>
+#include <string>
+
+using namespace arm_compute;
+
+CLTuner::CLTuner()
+    : _lws_table() // The table starts empty: no Local-Workgroup-Size is known until one is found or imported
+{
+}
+
+void CLTuner::tune_kernel(ICLKernel &kernel)
+{
+    // The configuration ID is the key under which the tuned LWS is stored
+    const std::string &id = kernel.config_id();
+
+    // An empty configuration ID means the kernel does not have to be tuned
+    if(id.empty())
+    {
+        return;
+    }
+
+    auto entry = _lws_table.find(id);
+
+    if(entry == _lws_table.end())
+    {
+        // Unknown configuration: find the optimal LWS for the kernel. The
+        // search runs the kernel itself once per candidate size
+        cl::NDRange found_lws = find_optimal_lws(kernel);
+
+        // Insert the result in the table so that later calls with the same
+        // configuration ID reuse it instead of searching again
+        entry = _lws_table.emplace(id, found_lws).first;
+    }
+
+    // Set the Local-Workgroup-Size, either freshly found or taken from the
+    // table
+    kernel.set_lws_hint(entry->second);
+}
+
+cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
+{
+    cl::CommandQueue q = CLScheduler::get().queue();
+
+    // Wait for previously enqueued work, otherwise it would be timed as part of the first candidate
+    CLScheduler::get().sync();
+
+    double      min_exec_time = std::numeric_limits<double>::max();
+    cl::NDRange opt_lws       = cl::NDRange(1, 1);
+
+    // Try every 2D Local-Workgroup-Size from 1x1 to 16x16 and keep the fastest one
+    for(int y = 1; y <= 16; ++y)
+    {
+        for(int x = 1; x <= 16; ++x)
+        {
+            cl::NDRange lws_test = cl::NDRange(x, y);
+            kernel.set_lws_hint(lws_test);
+
+            // steady_clock is monotonic, so a system clock adjustment cannot distort the interval
+            const auto t_start = std::chrono::steady_clock::now();
+
+            // Run the kernel; sync() is expected to block until the queue has drained
+            kernel.run(kernel.window(), q);
+            CLScheduler::get().sync();
+
+            const auto                                     t_stop  = std::chrono::steady_clock::now();
+            const std::chrono::duration<double, std::nano> fp_nano = t_stop - t_start;
+
+            // Strict comparison: on equal timings the first candidate tested is kept
+            if(fp_nano.count() < min_exec_time)
+            {
+                min_exec_time = fp_nano.count();
+                opt_lws       = lws_test;
+            }
+        }
+    }
+
+    return opt_lws;
+}
+
+void CLTuner::import_lws_table(const std::unordered_map<std::string, cl::NDRange> &lws_table)
+{
+    // Assignment alone replaces the old entries; clearing first would wipe lws_table too when it aliases _lws_table
+    _lws_table = lws_table;
+}
+
+const std::unordered_map<std::string, cl::NDRange> &CLTuner::export_lws_table()
+{
+    return _lws_table; // Reference to the tuner's own table (not a copy): it reflects later tuning and imports
+}
\ No newline at end of file