Minor improvements to inference profiling

* Start the "EnqueueWorkload" profiling event at the actual beginning of the call (in Runtime::EnqueueWorkload) rather than inside LoadedNetwork
* Add profiling events ("PrepareInputs"/"PrepareOutputs") around enqueueing the network inputs and outputs
* Add a profiling event for working memory allocation
* Refactor the Execute body to remove the duplication across the input, workload and output queues
* Forward arguments to constructors rather than copying them

Change-Id: Iacab85f0a02e88e2423885f86f97e4dba4037319
Signed-off-by: Derek Lamberti <derek.lamberti@arm.com>
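
Editor's note: the profiling changes below rely on the RAII behaviour of
ARMNN_SCOPED_PROFILING_EVENT: the event begins when the guard object is
constructed and ends when it leaves scope, so wrapping statements in an
extra pair of braces times exactly that block. A minimal sketch of the
idiom, using a hypothetical ScopedTimer in place of the real Arm NN
machinery:

    #include <chrono>
    #include <iostream>
    #include <string>

    class ScopedTimer
    {
    public:
        explicit ScopedTimer(std::string name)
            : m_Name(std::move(name))
            , m_Start(std::chrono::steady_clock::now())
        {}

        // The destructor fires when the enclosing scope ends, closing the event.
        ~ScopedTimer()
        {
            const auto elapsed = std::chrono::steady_clock::now() - m_Start;
            const auto us = std::chrono::duration_cast<std::chrono::microseconds>(elapsed);
            std::cout << m_Name << ": " << us.count() << " us\n";
        }

    private:
        std::string m_Name;
        std::chrono::steady_clock::time_point m_Start;
    };

    void EnqueueWorkload()
    {
        ScopedTimer whole("EnqueueWorkload"); // Spans the entire function
        {
            ScopedTimer prepare("PrepareInputs"); // Spans only this block
            // ... fill the input queue ...
        }
        {
            ScopedTimer prepare2("PrepareOutputs");
            // ... fill the output queue ...
        }
    }

    int main()
    {
        EnqueueWorkload();
    }

This is why the PrepareInputs/PrepareOutputs hunks below introduce new
brace scopes around the previously unscoped loops.
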
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index fbf8cfb..b35dfd1 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -451,8 +451,6 @@
 Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
                                       const OutputTensors& outputTensors)
 {
-    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "EnqueueWorkload");
-
     const Graph& graph = m_OptimizedNetwork->GetGraph();
 
     // Walk graph to determine the order of execution.
@@ -471,21 +469,27 @@
     }
 
     // For each input to the network, call EnqueueInput with the data passed by the user.
-    m_InputQueue.clear();
-    m_InputQueue.reserve(graph.GetNumInputs());
-    for (const BindableLayer* inputLayer : graph.GetInputLayers())
     {
-        const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
-        EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
+        m_InputQueue.clear();
+        m_InputQueue.reserve(graph.GetNumInputs());
+        for (const BindableLayer* inputLayer : graph.GetInputLayers())
+        {
+            const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
+            EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+        }
     }
 
     // For each output to the network, call EnqueueOutput with the data passed by the user.
-    m_OutputQueue.clear();
-    m_OutputQueue.reserve(graph.GetNumOutputs());
-    for (const BindableLayer* outputLayer : graph.GetOutputLayers())
     {
-        const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
-        EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
+        m_OutputQueue.clear();
+        m_OutputQueue.reserve(graph.GetNumOutputs());
+        for (const BindableLayer* outputLayer : graph.GetOutputLayers())
+        {
+            const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
+            EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
+        }
     }
 
     std::unique_ptr<TimelineUtilityMethods> timelineUtils =
@@ -684,8 +688,13 @@
     }
 }
 
-void LoadedNetwork::AllocateWorkingMemory()
+void LoadedNetwork::AllocateWorkingMemory(std::lock_guard<std::mutex>& lock)
 {
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Working Memory Allocation");
+
+    // This unused parameter ensures we can only call this function while holding a valid lock.
+    IgnoreUnused(lock);
+
     if (m_IsWorkingMemAllocated)
     {
         return;
@@ -736,49 +745,29 @@
     try
     {
         std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
-        AllocateWorkingMemory();
+        AllocateWorkingMemory(lockGuard);
 
         ProfilingDynamicGuid workloadInferenceID(0);
-        for (auto& input : m_InputQueue)
+        auto ExecuteQueue = [&timelineUtils, &workloadInferenceID, &inferenceGuid](WorkloadQueue& queue)
         {
-            if(timelineUtils)
+            for (auto& workload : queue)
             {
-                workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(input->GetGuid(),
-                                                                                                inferenceGuid);
+                if(timelineUtils)
+                {
+                    workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
+                                                                                                    inferenceGuid);
+                }
+                workload->Execute();
+                if(timelineUtils)
+                {
+                    timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
+                }
             }
-            input->Execute();
-            if(timelineUtils)
-            {
-                timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
-            }
-        }
+        };
 
-        for (auto& workload : m_WorkloadQueue)
-        {
-            if(timelineUtils)
-            {
-                workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
-                                                                                                inferenceGuid);
-            }
-            workload->Execute();
-            if(timelineUtils)
-            {
-                timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
-            }
-        }
-        for (auto& output: m_OutputQueue)
-        {
-            if(timelineUtils)
-            {
-                workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(output->GetGuid(),
-                                                                                                inferenceGuid);
-            }
-            output->Execute();
-            if(timelineUtils)
-            {
-                timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
-            }
-        }
+        ExecuteQueue(m_InputQueue);
+        ExecuteQueue(m_WorkloadQueue);
+        ExecuteQueue(m_OutputQueue);
     }
     catch (const RuntimeException& error)
     {
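
Editor's note: the new AllocateWorkingMemory signature uses the
lock_guard-as-token idiom. A std::lock_guard can only be constructed by
locking a mutex, so requiring one by reference means callers cannot reach
the function without already holding a lock. A minimal sketch with
hypothetical names (Cache, RebuildLocked):

    #include <mutex>

    class Cache
    {
    public:
        void Update()
        {
            std::lock_guard<std::mutex> lock(m_Mutex);
            RebuildLocked(lock); // Compiles: we can prove we hold a lock
        }

    private:
        // Requiring the guard by reference means only code that already
        // owns a lock_guard (i.e. holds the mutex) can call this.
        void RebuildLocked(std::lock_guard<std::mutex>& lock)
        {
            (void)lock; // A compile-time token, never actually used
            // ... mutate state guarded by m_Mutex ...
        }

        std::mutex m_Mutex;
    };

Note the idiom proves that *a* lock is held, not that it guards the right
mutex; that part remains a convention between caller and callee.
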
diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp
index 918375a..8c21030 100644
--- a/src/armnn/LoadedNetwork.hpp
+++ b/src/armnn/LoadedNetwork.hpp
@@ -59,7 +59,7 @@
     void SendNetworkStructure();
 
 private:
-    void AllocateWorkingMemory();
+    void AllocateWorkingMemory(std::lock_guard<std::mutex>& lock);
 
     LoadedNetwork(std::unique_ptr<OptimizedNetwork> net,
                   const INetworkProperties& networkProperties,
diff --git a/src/armnn/Profiling.hpp b/src/armnn/Profiling.hpp
index 08d7f7b..08e55a1 100644
--- a/src/armnn/Profiling.hpp
+++ b/src/armnn/Profiling.hpp
@@ -115,7 +115,7 @@
     using InstrumentPtr = std::unique_ptr<Instrument>;
 
     template<typename... Args>
-    ScopedProfilingEvent(const BackendId& backendId, const std::string& name, Args... args)
+    ScopedProfilingEvent(const BackendId& backendId, const std::string& name, Args&&... args)
         : m_Event(nullptr)
         , m_Profiler(ProfilerManager::GetInstance().GetProfiler())
     {
@@ -123,7 +123,7 @@
         {
             std::vector<InstrumentPtr> instruments(0);
             instruments.reserve(sizeof...(args)); //One allocation
-            ConstructNextInVector(instruments, args...);
+            ConstructNextInVector(instruments, std::forward<Args>(args)...);
             m_Event = m_Profiler->BeginEvent(backendId, name, std::move(instruments));
         }
     }
@@ -144,10 +144,10 @@
     }
 
     template<typename Arg, typename... Args>
-    void ConstructNextInVector(std::vector<InstrumentPtr>& instruments, Arg arg, Args... args)
+    void ConstructNextInVector(std::vector<InstrumentPtr>& instruments, Arg&& arg, Args&&... args)
     {
-        instruments.emplace_back(std::make_unique<Arg>(arg));
-        ConstructNextInVector(instruments, args...);
+        instruments.emplace_back(std::make_unique<Arg>(std::forward<Arg>(arg)));
+        ConstructNextInVector(instruments, std::forward<Args>(args)...);
     }
 
     Event* m_Event;       ///< Event to track
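
Editor's note: with the old by-value parameter packs, every instrument was
copied into the vector's unique_ptrs; forwarding references (Args&&... plus
std::forward) move rvalue arguments instead. A standalone sketch, assuming
a copy/move-logging Instrument type that is not the Arm NN one:

    #include <iostream>
    #include <memory>
    #include <type_traits>
    #include <utility>
    #include <vector>

    struct Instrument
    {
        Instrument() = default;
        Instrument(const Instrument&) { std::cout << "copy\n"; }
        Instrument(Instrument&&) noexcept { std::cout << "move\n"; }
    };

    // Old style: by-value parameter, then another copy into the allocation.
    template <typename Arg>
    void AddByValue(std::vector<std::unique_ptr<Arg>>& v, Arg arg)
    {
        v.emplace_back(std::make_unique<Arg>(arg));
    }

    // New style: forwarding reference; rvalue arguments are moved instead.
    template <typename Arg>
    void AddForwarded(std::vector<std::unique_ptr<std::decay_t<Arg>>>& v, Arg&& arg)
    {
        v.emplace_back(std::make_unique<std::decay_t<Arg>>(std::forward<Arg>(arg)));
    }

    int main()
    {
        std::vector<std::unique_ptr<Instrument>> v;
        AddByValue(v, Instrument{});   // "copy": the heap object is copy-constructed
        AddForwarded(v, Instrument{}); // "move": the temporary is moved into place
    }

The observable behaviour of ScopedProfilingEvent is unchanged; the benefit
is avoiding copies when instruments carry non-trivial state.
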
diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp
index 5692494..28e2df2 100644
--- a/src/armnn/Runtime.cpp
+++ b/src/armnn/Runtime.cpp
@@ -308,6 +308,7 @@
                                 const InputTensors& inputTensors,
                                 const OutputTensors& outputTensors)
 {
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "EnqueueWorkload");
     LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId);
 
     static thread_local NetworkId lastId = networkId;