COMPMID-3604: Graph failures during tuning

Update the ICLTuner interface to account for the new memory injection
interface.
Redirect to the appropriate kernel execution interface depending on
whether the kernel supports memory injection.

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I8ce29f5c22f1865c9e688d12b65e68ee4486f99c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3588
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/runtime/CL/CLTuner.h b/arm_compute/runtime/CL/CLTuner.h
index 745d57a..aa31181 100644
--- a/arm_compute/runtime/CL/CLTuner.h
+++ b/arm_compute/runtime/CL/CLTuner.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -116,6 +116,7 @@
     // Inherited methods overridden:
     void tune_kernel_static(ICLKernel &kernel) override;
     void tune_kernel_dynamic(ICLKernel &kernel) override;
+    void tune_kernel_dynamic(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs) override;
 
     /** Is the kernel_event set ?
      *
@@ -130,7 +131,7 @@
      *
      * @return The optimal LWS to use
      */
-    cl::NDRange find_optimal_lws(ICLKernel &kernel);
+    cl::NDRange find_optimal_lws(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs);
 
     std::unordered_map<std::string, cl::NDRange> _lws_table;
     cl::Event   _kernel_event;
diff --git a/arm_compute/runtime/CL/ICLTuner.h b/arm_compute/runtime/CL/ICLTuner.h
index 0b23818..4bc8ddf 100644
--- a/arm_compute/runtime/CL/ICLTuner.h
+++ b/arm_compute/runtime/CL/ICLTuner.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,8 @@
 #ifndef ARM_COMPUTE_ICLTUNER_H
 #define ARM_COMPUTE_ICLTUNER_H
 
+#include "arm_compute/core/experimental/Types.h"
+
 namespace arm_compute
 {
 class ICLKernel;
@@ -49,6 +51,13 @@
      * @param[in] kernel Kernel to tune
      */
     virtual void tune_kernel_dynamic(ICLKernel &kernel) = 0;
+    /** Tune OpenCL kernel dynamically with injected input/output tensors
+     *
+     * @param[in]      kernel  Kernel to tune
+     * @param[in]      inputs  Inputs for the kernel to use
+     * @param[in, out] outputs Outputs for the kernel to use
+     */
+    virtual void tune_kernel_dynamic(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs) = 0;
 };
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_ICLTUNER_H */
diff --git a/arm_compute/runtime/CL/tuners/BifrostTuner.h b/arm_compute/runtime/CL/tuners/BifrostTuner.h
index b7ce6e9..830f7d9 100644
--- a/arm_compute/runtime/CL/tuners/BifrostTuner.h
+++ b/arm_compute/runtime/CL/tuners/BifrostTuner.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,7 @@
     // Inherited overriden methods
     void tune_kernel_static(ICLKernel &kernel) override;
     void tune_kernel_dynamic(ICLKernel &kernel) override;
+    void tune_kernel_dynamic(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs) override;
 };
 } // namespace tuners
 } // namespace arm_compute
diff --git a/arm_compute/runtime/CL/tuners/MidgardTuner.h b/arm_compute/runtime/CL/tuners/MidgardTuner.h
index 418b807..c702e7a 100644
--- a/arm_compute/runtime/CL/tuners/MidgardTuner.h
+++ b/arm_compute/runtime/CL/tuners/MidgardTuner.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -37,6 +37,7 @@
     // Inherited overriden methods
     void tune_kernel_static(ICLKernel &kernel) override;
     void tune_kernel_dynamic(ICLKernel &kernel) override;
+    void tune_kernel_dynamic(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs) override;
 };
 } // namespace tuners
 } // namespace arm_compute
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index 56f5f21..5ef66f4 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -157,22 +157,16 @@
                              "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
                              or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
 
+    const bool inject_memory = !inputs.empty();
+
     // Tune the kernel if the CLTuner has been provided
     if(_cl_tuner != nullptr)
     {
-        // Tune the OpenCL kernel
-        _cl_tuner->tune_kernel_dynamic(kernel);
+        inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, inputs, outputs) : _cl_tuner->tune_kernel_dynamic(kernel);
     }
 
     // Run kernel
-    if(inputs.empty())
-    {
-        kernel.run(kernel.window(), _queue);
-    }
-    else
-    {
-        kernel.run_op(inputs, outputs, kernel.window(), _queue);
-    }
+    inject_memory ? kernel.run_op(inputs, outputs, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
 
     if(flush)
     {
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index e3119c1..b2e3476 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -77,6 +77,11 @@
 
 void CLTuner::tune_kernel_dynamic(ICLKernel &kernel)
 {
+    tune_kernel_dynamic(kernel, {}, {});
+}
+
+void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs)
+{
     // Get the configuration ID from the kernel and append GPU target name and number of available compute units
     const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
 
@@ -90,7 +95,7 @@
             if(_tune_new_kernels)
             {
                 // Find the optimal LWS for the kernel
-                cl::NDRange opt_lws = find_optimal_lws(kernel);
+                cl::NDRange opt_lws = find_optimal_lws(kernel, inputs, outputs);
 
                 // Insert the optimal LWS in the table
                 add_lws_to_table(config_id, opt_lws);
@@ -112,7 +117,7 @@
     _lws_table.emplace(kernel_id, optimal_lws);
 }
 
-cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel)
+cl::NDRange CLTuner::find_optimal_lws(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs)
 {
     // Profiling queue
     cl::CommandQueue queue_profiler;
@@ -167,7 +172,8 @@
     cl::NDRange gws = ICLKernel::gws_from_window(kernel.window());
 
     // Run the kernel with default lws to be used as baseline
-    kernel.run(kernel.window(), queue_profiler);
+    const bool inject_memory = !inputs.empty();
+    inject_memory ? kernel.run_op(inputs, outputs, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
 
     queue_profiler.finish();
 
@@ -178,7 +184,7 @@
 
     cl::NDRange opt_lws = cl::NullRange;
 
-    //Construct the list of LWS values to be tested based on the tuner mode.
+    // Construct the list of LWS values to be tested based on the tuner mode.
     auto lws_list = cl_tuner::CLLWSListFactory::get_lws_list(_tuner_mode, gws);
     for(size_t i = 0; i < lws_list->size(); ++i)
     {
@@ -197,7 +203,7 @@
         kernel.set_lws_hint(lws_test);
 
         // Run the kernel
-        kernel.run(kernel.window(), queue_profiler);
+        inject_memory ? kernel.run_op(inputs, outputs, kernel.window(), queue_profiler) : kernel.run(kernel.window(), queue_profiler);
 
         queue_profiler.finish();
 
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
index 3fecd04..1797c2c 100644
--- a/src/runtime/CL/tuners/BifrostTuner.cpp
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -315,5 +315,10 @@
 {
     ARM_COMPUTE_UNUSED(kernel);
 }
+
+void BifrostTuner::tune_kernel_dynamic(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs)
+{
+    ARM_COMPUTE_UNUSED(kernel, inputs, outputs);
+}
 } // namespace tuners
 } // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/tuners/MidgardTuner.cpp b/src/runtime/CL/tuners/MidgardTuner.cpp
index a95ca19..68c98ce 100644
--- a/src/runtime/CL/tuners/MidgardTuner.cpp
+++ b/src/runtime/CL/tuners/MidgardTuner.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -73,5 +73,10 @@
 {
     ARM_COMPUTE_UNUSED(kernel);
 }
+
+void MidgardTuner::tune_kernel_dynamic(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs)
+{
+    ARM_COMPUTE_UNUSED(kernel, inputs, outputs);
+}
 } // namespace tuners
 } // namespace arm_compute