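COMPMID-481: Add thread info parameter

Pass a ThreadInfo (thread_id, num_threads, cpu) to ICPPKernel::run()
alongside the execution window, instead of storing the thread id and
thread count on the Window.

Kernels that runtime functions previously ran directly via
kernel.run(kernel.window()) are now dispatched through the scheduler,
which builds the ThreadInfo. The set_target()/target() definitions are
removed from CPPScheduler.cpp.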

Change-Id: Iebb50a88d017445b6b37a86563ebd4abd86c5cf5
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/86788
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/runtime/CL/functions/CLHOGMultiDetection.cpp b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
index 6def2de..9eed355 100644
--- a/src/runtime/CL/functions/CLHOGMultiDetection.cpp
+++ b/src/runtime/CL/functions/CLHOGMultiDetection.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/runtime/CL/CLArray.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/Scheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
@@ -246,7 +247,7 @@
     {
         // Map detection windows array before computing non maxima suppression
         _detection_windows->map(CLScheduler::get().queue(), true);
-        _non_maxima_kernel->run(_non_maxima_kernel->window());
+        Scheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
         _detection_windows->unmap(CLScheduler::get().queue());
     }
 }
diff --git a/src/runtime/CL/functions/CLHarrisCorners.cpp b/src/runtime/CL/functions/CLHarrisCorners.cpp
index 8f9fcdc..2140240 100644
--- a/src/runtime/CL/functions/CLHarrisCorners.cpp
+++ b/src/runtime/CL/functions/CLHarrisCorners.cpp
@@ -165,6 +165,6 @@
     _nonmax.unmap();
 
     _corners->map(CLScheduler::get().queue(), true);
-    _sort_euclidean.run(_sort_euclidean.window());
+    Scheduler::get().schedule(&_sort_euclidean, Window::DimY);
     _corners->unmap(CLScheduler::get().queue());
 }
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 2a321a1..9cc3f03 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -52,7 +52,7 @@
      * This function will return as soon as the kernel has been sent to the worker thread.
      * wait() needs to be called to ensure the execution is complete.
      */
-    void start(ICPPKernel *kernel, const Window &window);
+    void start(ICPPKernel *kernel, const Window &window, const ThreadInfo &info);
     /** Wait for the current kernel execution to complete
      */
     void wait();
@@ -64,13 +64,14 @@
     std::thread        _thread;
     ICPPKernel        *_kernel{ nullptr };
     Window             _window;
+    ThreadInfo         _info;
     sem_t              _wait_for_work;
     sem_t              _job_complete;
     std::exception_ptr _current_exception;
 };
 
 Thread::Thread()
-    : _thread(), _window(), _wait_for_work(), _job_complete(), _current_exception(nullptr)
+    : _thread(), _window(), _info(), _wait_for_work(), _job_complete(), _current_exception(nullptr)
 {
     int ret = sem_init(&_wait_for_work, 0, 0);
     ARM_COMPUTE_ERROR_ON(ret < 0);
@@ -87,7 +88,7 @@
 {
     ARM_COMPUTE_ERROR_ON(!_thread.joinable());
 
-    start(nullptr, Window());
+    start(nullptr, Window(), ThreadInfo());
     _thread.join();
 
     int ret = sem_destroy(&_wait_for_work);
@@ -99,10 +100,11 @@
     ARM_COMPUTE_UNUSED(ret);
 }
 
-void Thread::start(ICPPKernel *kernel, const Window &window)
+void Thread::start(ICPPKernel *kernel, const Window &window, const ThreadInfo &info)
 {
     _kernel = kernel;
     _window = window;
+    _info   = info;
     int ret = sem_post(&_wait_for_work);
     ARM_COMPUTE_UNUSED(ret);
     ARM_COMPUTE_ERROR_ON(ret < 0);
@@ -133,7 +135,7 @@
         try
         {
             _window.validate();
-            _kernel->run(_window);
+            _kernel->run(_window, _info);
         }
         catch(...)
         {
@@ -163,8 +165,7 @@
 
 CPPScheduler::CPPScheduler()
     : _num_threads(std::thread::hardware_concurrency()),
-      _threads(std::unique_ptr<Thread[], void(*)(Thread *)>(new Thread[std::thread::hardware_concurrency() - 1], delete_threads)),
-      _target(CPUTarget::INTRINSICS)
+      _threads(std::unique_ptr<Thread[], void(*)(Thread *)>(new Thread[std::thread::hardware_concurrency() - 1], delete_threads))
 {
 }
 
@@ -179,50 +180,42 @@
     return _num_threads;
 }
 
-void CPPScheduler::set_target(CPUTarget target)
-{
-    _target = target;
-}
-
-CPUTarget CPPScheduler::target() const
-{
-    return _target;
-}
-
 void CPPScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
     /** [Scheduler example] */
+    ThreadInfo info;
+    info.cpu = _target;
+
     const Window      &max_window     = kernel->window();
     const unsigned int num_iterations = max_window.num_iterations(split_dimension);
-    const unsigned int num_threads    = std::min(num_iterations, _num_threads);
+    info.num_threads                  = std::min(num_iterations, _num_threads);
 
-    if(!kernel->is_parallelisable() || 1 == num_threads)
+    if(!kernel->is_parallelisable() || info.num_threads == 1)
     {
-        kernel->run(max_window);
+        kernel->run(max_window, info);
     }
     else
     {
-        for(unsigned int t = 0; t < num_threads; ++t)
+        for(int t = 0; t < info.num_threads; ++t)
         {
-            Window win = max_window.split_window(split_dimension, t, num_threads);
-            win.set_thread_id(t);
-            win.set_num_threads(num_threads);
+            Window win     = max_window.split_window(split_dimension, t, info.num_threads);
+            info.thread_id = t;
 
-            if(t != num_threads - 1)
+            if(t != info.num_threads - 1)
             {
-                _threads[t].start(kernel, win);
+                _threads[t].start(kernel, win, info);
             }
             else
             {
-                kernel->run(win);
+                kernel->run(win, info);
             }
         }
 
         try
         {
-            for(unsigned int t = 1; t < num_threads; ++t)
+            for(int t = 1; t < info.num_threads; ++t)
             {
                 _threads[t - 1].wait();
             }
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index f086813..4e46a59 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -38,12 +38,15 @@
 void SingleThreadScheduler::set_num_threads(unsigned int num_threads)
 {
     ARM_COMPUTE_UNUSED(num_threads);
+    ARM_COMPUTE_ERROR_ON(num_threads != 1);
 }
 
 void SingleThreadScheduler::schedule(ICPPKernel *kernel, unsigned int split_dimension)
 {
     ARM_COMPUTE_UNUSED(split_dimension);
-    kernel->run(kernel->window());
+    ThreadInfo info;
+    info.cpu = _target;
+    kernel->run(kernel->window(), info);
 }
 
 unsigned int SingleThreadScheduler::num_threads() const
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index a748a1e..23d9872 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -35,6 +35,6 @@
 
 void INESimpleFunction::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
     NEScheduler::get().schedule(_kernel.get(), Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp
index ca8877e..318cea2 100644
--- a/src/runtime/NEON/functions/NECannyEdge.cpp
+++ b/src/runtime/NEON/functions/NECannyEdge.cpp
@@ -161,7 +161,7 @@
     _sobel->run();
 
     // Fill border before non-maxima suppression. Nop for border mode undefined.
-    _border_mag_gradient.run(_border_mag_gradient.window());
+    NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ);
 
     // Run gradient
     NEScheduler::get().schedule(_gradient.get(), Window::DimY);
@@ -173,8 +173,8 @@
     memset(_output->buffer(), 0, _output->info()->total_size());
 
     // Fill border before edge trace
-    _border_edge_trace.run(_border_edge_trace.window());
+    NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ);
 
     // Run edge tracing
-    _edge_trace.run(_edge_trace.window());
+    NEScheduler::get().schedule(&_edge_trace, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEConvolution.cpp b/src/runtime/NEON/functions/NEConvolution.cpp
index 4ad6450..249274b 100644
--- a/src/runtime/NEON/functions/NEConvolution.cpp
+++ b/src/runtime/NEON/functions/NEConvolution.cpp
@@ -94,7 +94,7 @@
 template <unsigned int matrix_size>
 void                   NEConvolutionSquare<matrix_size>::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
     if(_is_separable)
     {
diff --git a/src/runtime/NEON/functions/NEDerivative.cpp b/src/runtime/NEON/functions/NEDerivative.cpp
index c50db14..8118030 100644
--- a/src/runtime/NEON/functions/NEDerivative.cpp
+++ b/src/runtime/NEON/functions/NEDerivative.cpp
@@ -47,6 +47,6 @@
 
 void NEDerivative::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
     NEScheduler::get().schedule(&_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index 2e3a683..810efe5 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -85,7 +85,7 @@
 
 void NEDirectConvolutionLayer::run()
 {
-    _input_border_handler.run(_input_border_handler.window());
+    NEScheduler::get().schedule(&_input_border_handler, Window::DimZ);
 
     NEScheduler::get().schedule(&_conv_kernel, Window::DimZ);
     NEScheduler::get().schedule(&_accumulate_bias_kernel, Window::DimY);
diff --git a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
index f6ec677..70b93ca 100644
--- a/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
+++ b/src/runtime/NEON/functions/NEEqualizeHistogram.cpp
@@ -55,7 +55,7 @@
     NEScheduler::get().schedule(&_histogram_kernel, Window::DimY);
 
     // Calculate cumulative distribution of histogram and create LUT.
-    _cd_histogram_kernel.run(_cd_histogram_kernel.window());
+    NEScheduler::get().schedule(&_cd_histogram_kernel, Window::DimY);
 
     // Map input to output using created LUT.
     NEScheduler::get().schedule(&_map_histogram_kernel, Window::DimY);
diff --git a/src/runtime/NEON/functions/NEFastCorners.cpp b/src/runtime/NEON/functions/NEFastCorners.cpp
index 33a58f1..265041f 100644
--- a/src/runtime/NEON/functions/NEFastCorners.cpp
+++ b/src/runtime/NEON/functions/NEFastCorners.cpp
@@ -88,7 +88,7 @@
 
 void NEFastCorners::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
 
     NEScheduler::get().schedule(&_fast_corners_kernel, Window::DimY);
 
diff --git a/src/runtime/NEON/functions/NEGaussian5x5.cpp b/src/runtime/NEON/functions/NEGaussian5x5.cpp
index 69639d0..a1ce985 100644
--- a/src/runtime/NEON/functions/NEGaussian5x5.cpp
+++ b/src/runtime/NEON/functions/NEGaussian5x5.cpp
@@ -54,7 +54,7 @@
 
 void NEGaussian5x5::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
     NEScheduler::get().schedule(&_kernel_hor, Window::DimY);
     NEScheduler::get().schedule(&_kernel_vert, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEGaussianPyramid.cpp b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
index e857aab..90bd584 100644
--- a/src/runtime/NEON/functions/NEGaussianPyramid.cpp
+++ b/src/runtime/NEON/functions/NEGaussianPyramid.cpp
@@ -109,7 +109,7 @@
 
     for(unsigned int i = 0; i < num_levels - 1; ++i)
     {
-        _border_handler[i].run(_border_handler[i].window());
+        NEScheduler::get().schedule(_border_handler.get() + i, Window::DimZ);
         NEScheduler::get().schedule(_horizontal_reduction.get() + i, Window::DimY);
         NEScheduler::get().schedule(_vertical_reduction.get() + i, Window::DimY);
     }
diff --git a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
index 8b3d014..1a038a2 100644
--- a/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
+++ b/src/runtime/NEON/functions/NEHOGMultiDetection.cpp
@@ -239,6 +239,6 @@
     // Run non-maxima suppression kernel if enabled
     if(_non_maxima_suppression)
     {
-        _non_maxima_kernel->run(_non_maxima_kernel->window());
+        NEScheduler::get().schedule(_non_maxima_kernel.get(), Window::DimY);
     }
 }
diff --git a/src/runtime/NEON/functions/NEHarrisCorners.cpp b/src/runtime/NEON/functions/NEHarrisCorners.cpp
index 24b2bcb..7ec681d 100644
--- a/src/runtime/NEON/functions/NEHarrisCorners.cpp
+++ b/src/runtime/NEON/functions/NEHarrisCorners.cpp
@@ -207,8 +207,8 @@
     _sobel->run();
 
     // Fill border before harris score kernel
-    _border_gx.run(_border_gx.window());
-    _border_gy.run(_border_gy.window());
+    NEScheduler::get().schedule(&_border_gx, Window::DimZ);
+    NEScheduler::get().schedule(&_border_gy, Window::DimZ);
 
     // Run harris score kernel
     NEScheduler::get().schedule(_harris_score.get(), Window::DimY);
@@ -220,5 +220,5 @@
     NEScheduler::get().schedule(&_candidates, Window::DimY);
 
     // Run sort & euclidean distance
-    _sort_euclidean.run(_sort_euclidean.window());
+    NEScheduler::get().schedule(&_sort_euclidean, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NEMeanStdDev.cpp b/src/runtime/NEON/functions/NEMeanStdDev.cpp
index ab8e72b..2304bc8 100644
--- a/src/runtime/NEON/functions/NEMeanStdDev.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDev.cpp
@@ -43,6 +43,6 @@
     _global_sum         = 0;
     _global_sum_squared = 0;
 
-    _fill_border_kernel.run(_fill_border_kernel.window());
+    NEScheduler::get().schedule(&_fill_border_kernel, Window::DimZ);
     NEScheduler::get().schedule(&_mean_stddev_kernel, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NESobel5x5.cpp b/src/runtime/NEON/functions/NESobel5x5.cpp
index 8967a22..305d211 100644
--- a/src/runtime/NEON/functions/NESobel5x5.cpp
+++ b/src/runtime/NEON/functions/NESobel5x5.cpp
@@ -75,7 +75,7 @@
 
 void NESobel5x5::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
     NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
     NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
 }
diff --git a/src/runtime/NEON/functions/NESobel7x7.cpp b/src/runtime/NEON/functions/NESobel7x7.cpp
index f628da9..57fe028 100644
--- a/src/runtime/NEON/functions/NESobel7x7.cpp
+++ b/src/runtime/NEON/functions/NESobel7x7.cpp
@@ -75,7 +75,7 @@
 
 void NESobel7x7::run()
 {
-    _border_handler.run(_border_handler.window());
+    NEScheduler::get().schedule(&_border_handler, Window::DimZ);
     NEScheduler::get().schedule(&_sobel_hor, Window::DimY);
     NEScheduler::get().schedule(&_sobel_vert, Window::DimY);
 }