IVGCVSW-1824 Fix slow profiling of Neon (~50% reduced end-to-end time)

Change-Id: I58295c298934317a2b365887bd9f9f6705cd0a21
diff --git a/src/armnn/NeonInterceptorScheduler.cpp b/src/armnn/NeonInterceptorScheduler.cpp
index 8363def..a5ca315 100644
--- a/src/armnn/NeonInterceptorScheduler.cpp
+++ b/src/armnn/NeonInterceptorScheduler.cpp
@@ -9,9 +9,8 @@
 
 namespace armnn{
 
-NeonInterceptorScheduler::NeonInterceptorScheduler(NeonTimer::KernelMeasurements& kernels,
-                                                   arm_compute::IScheduler &realScheduler)
-        : m_Kernels(kernels), m_RealScheduler(realScheduler)
+NeonInterceptorScheduler::NeonInterceptorScheduler(arm_compute::IScheduler &realScheduler)
+        : m_Kernels(nullptr), m_RealScheduler(realScheduler) // No kernel list until SetKernels() is called.
 {
 }
 
@@ -27,32 +26,22 @@
 
 void NeonInterceptorScheduler::schedule(arm_compute::ICPPKernel* kernel, const Hints& hints)
 {
-    m_Timer.Start();
+    WallClockTimer::clock::time_point startTime = WallClockTimer::clock::now(); // Taken just before dispatch.
     m_RealScheduler.schedule(kernel, hints.split_dimension());
-    m_Timer.Stop();
+    WallClockTimer::clock::time_point stopTime = WallClockTimer::clock::now(); // Taken once the real scheduler returns.
 
-    std::vector<Measurement> measurements = m_Timer.GetMeasurements();
-    BOOST_ASSERT(!measurements.empty());
-
-    Measurement measurement(measurements.front()); // NOTE: 1st measurement is delta
-    measurement.m_Name = kernel->name();
-    m_Kernels.push_back(std::move(measurement));
+    const auto delta       = std::chrono::duration<double, std::micro>(stopTime - startTime); // Elapsed microseconds.
+    m_Kernels->emplace_back(kernel->name(), delta.count(), Measurement::Unit::TIME_US); // Requires m_Kernels != nullptr.
 }
 
 void NeonInterceptorScheduler::run_workloads(std::vector <Workload>& workloads)
 {
-    m_Timer.Start();
-    // NOTE: we should think about utilising the tag to make profiling more understandable
+    WallClockTimer::clock::time_point startTime = WallClockTimer::clock::now(); // Taken just before dispatch.
     m_RealScheduler.run_tagged_workloads(workloads, nullptr);
-    m_Timer.Stop();
+    WallClockTimer::clock::time_point stopTime = WallClockTimer::clock::now(); // Taken once the real scheduler returns.
 
-    std::vector<Measurement> measurements = m_Timer.GetMeasurements();
-    BOOST_ASSERT_MSG(measurements.size() == 3, "WallClockTimer does not have correct amount of measurements.");
-
-    // WallClockTimer has 3 measurements, duration always being the first.
-    Measurement measurement(measurements.front());
-    measurement.m_Name = "Workload";
-    m_Kernels.push_back(std::move(measurement));
+    const auto delta       = std::chrono::duration<double, std::micro>(stopTime - startTime); // Elapsed microseconds.
+    m_Kernels->emplace_back(std::string("Workload"), delta.count(), Measurement::Unit::TIME_US); // One entry per call.
 }
 
 } // namespace armnn
\ No newline at end of file
diff --git a/src/armnn/NeonInterceptorScheduler.hpp b/src/armnn/NeonInterceptorScheduler.hpp
index 37966b8..f33b79a 100644
--- a/src/armnn/NeonInterceptorScheduler.hpp
+++ b/src/armnn/NeonInterceptorScheduler.hpp
@@ -17,7 +17,7 @@
 class NeonInterceptorScheduler : public arm_compute::IScheduler
 {
 public:
-    NeonInterceptorScheduler(NeonTimer::KernelMeasurements &kernels, arm_compute::IScheduler &realScheduler);
+    NeonInterceptorScheduler(arm_compute::IScheduler &realScheduler);
     ~NeonInterceptorScheduler() = default;
 
     void set_num_threads(unsigned int numThreads) override;
@@ -28,10 +28,11 @@
 
     void run_workloads(std::vector<Workload> &workloads) override;
 
+    void SetKernels(NeonTimer::KernelMeasurements* kernels) { m_Kernels = kernels; }
+    NeonTimer::KernelMeasurements* GetKernels() { return m_Kernels; }
 private:
-    NeonTimer::KernelMeasurements& m_Kernels;
+    NeonTimer::KernelMeasurements* m_Kernels;
     arm_compute::IScheduler& m_RealScheduler;
-    WallClockTimer m_Timer;
 };
 
 } // namespace armnn
diff --git a/src/armnn/NeonTimer.cpp b/src/armnn/NeonTimer.cpp
index 1ee0c64..219edc9 100644
--- a/src/armnn/NeonTimer.cpp
+++ b/src/armnn/NeonTimer.cpp
@@ -13,24 +13,31 @@
 
 namespace armnn
 {
+namespace
+{
+static thread_local auto g_Interceptor = std::make_shared<NeonInterceptorScheduler>(arm_compute::Scheduler::get());
+}
 
 void NeonTimer::Start()
 {
     m_Kernels.clear();
+    BOOST_ASSERT(g_Interceptor->GetKernels() == nullptr); // Expects no kernel list to be attached yet.
+    g_Interceptor->SetKernels(&m_Kernels); // The interceptor appends its measurements to this timer's list.
+
     m_RealSchedulerType = arm_compute::Scheduler::get_type();
     //Note: We can't currently replace a custom scheduler
     if(m_RealSchedulerType != arm_compute::Scheduler::Type::CUSTOM)
     {
         // Keep the real schedule and add NeonInterceptorScheduler as an interceptor
         m_RealScheduler  = &arm_compute::Scheduler::get();
-        auto interceptor = std::make_shared<NeonInterceptorScheduler>(m_Kernels, *m_RealScheduler);
-        arm_compute::Scheduler::set(std::static_pointer_cast<arm_compute::IScheduler>(interceptor));
+        arm_compute::Scheduler::set(std::static_pointer_cast<arm_compute::IScheduler>(g_Interceptor));
     }
 }
 
 void NeonTimer::Stop()
 {
     // Restore real scheduler
+    g_Interceptor->SetKernels(nullptr); // Detach this timer's list so the next Start() passes its assertion.
     arm_compute::Scheduler::set(m_RealSchedulerType);
     m_RealScheduler = nullptr;
 }