IVGCVSW-5775 'Add Async Support to ExecuteNetwork'

* Enabled async mode with '-n, concurrent' and 'simultaneous-iterations'
  in ExecuteNetwork
* The number of comma-separated input files provided must equal the number
  of network inputs multiplied by the number of simultaneous iterations
  (see the example below)
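
  For illustration only (model, format and compute options elided, file
  names are placeholders), a network with two inputs run with two
  simultaneous iterations could be invoked as:

    ExecuteNetwork <model options> -n --simultaneous-iterations 2 \
        -d in0_iter0.raw,in1_iter0.raw,in0_iter1.raw,in1_iter1.raw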

!armnn:5443

Signed-off-by: Sadik Armagan <sadik.armagan@arm.com>
Change-Id: Ibeb318010430bf4ae61a02b18b1bf88f3657774c
diff --git a/include/armnn/IWorkingMemHandle.hpp b/include/armnn/IWorkingMemHandle.hpp
index 171fa3d..6fb2f9f 100644
--- a/include/armnn/IWorkingMemHandle.hpp
+++ b/include/armnn/IWorkingMemHandle.hpp
@@ -25,6 +25,9 @@
     /// Returns the NetworkId of the Network that this IWorkingMemHandle works with.
     virtual NetworkId GetNetworkId() = 0;
 
+    /// Returns the InferenceId of the Inference that this IWorkingMemHandle works with.
+    virtual profiling::ProfilingGuid GetInferenceId() = 0;
+
     /// Allocate the backing memory required for execution. If this is not called, then allocation will be
     /// deferred to execution time. The mutex must be locked.
     virtual void Allocate() = 0;
diff --git a/src/armnn/WorkingMemHandle.cpp b/src/armnn/WorkingMemHandle.cpp
index 0cbef82..b54c5ba 100644
--- a/src/armnn/WorkingMemHandle.cpp
+++ b/src/armnn/WorkingMemHandle.cpp
@@ -26,7 +26,8 @@
     m_MemoryManagers(memoryManagers),
     m_OwnedTensorHandles(std::move(ownedTensorHandles)),
     m_IsAllocated(false),
-    m_Mutex()
+    m_Mutex(),
+    m_InferenceId(profiling::ProfilingService::GetNextGuid())
 {
 }
 
diff --git a/src/armnn/WorkingMemHandle.hpp b/src/armnn/WorkingMemHandle.hpp
index 92b0aca..5ccb2b2 100644
--- a/src/armnn/WorkingMemHandle.hpp
+++ b/src/armnn/WorkingMemHandle.hpp
@@ -38,6 +38,11 @@
         return m_NetworkId;
     }
 
+    profiling::ProfilingGuid GetInferenceId() override
+    {
+        return m_InferenceId;
+    }
+
     /// Allocate the backing memory required for execution. If this is not called, then allocation will be
     /// deferred to execution time. The mutex must be locked.
     void Allocate() override;
@@ -87,6 +92,7 @@
 
     bool m_IsAllocated;
     std::mutex m_Mutex;
+    profiling::ProfilingGuid m_InferenceId;
 };
 
 } // end experimental namespace
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
index 25c326a..6784e21 100644
--- a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
+++ b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
@@ -46,10 +46,10 @@
     auto scores       = MakeDecoder<float>(scoresInfo, inputs[1]->Map());
     auto anchors      = MakeDecoder<float>(anchorsInfo, m_Anchors->Map(false));
 
-    float* detectionBoxes   = GetOutputTensorData<float>(0, m_Data);
-    float* detectionClasses = GetOutputTensorData<float>(1, m_Data);
-    float* detectionScores  = GetOutputTensorData<float>(2, m_Data);
-    float* numDetections    = GetOutputTensorData<float>(3, m_Data);
+    float* detectionBoxes   = reinterpret_cast<float*>(outputs[0]->Map());
+    float* detectionClasses = reinterpret_cast<float*>(outputs[1]->Map());
+    float* detectionScores  = reinterpret_cast<float*>(outputs[2]->Map());
+    float* numDetections    = reinterpret_cast<float*>(outputs[3]->Map());
 
     DetectionPostProcess(boxEncodingsInfo, scoresInfo, anchorsInfo,
                          detectionBoxesInfo, detectionClassesInfo,
diff --git a/tests/ExecuteNetwork/ExecuteNetwork.cpp b/tests/ExecuteNetwork/ExecuteNetwork.cpp
index 60e4ec3..2bbb517 100644
--- a/tests/ExecuteNetwork/ExecuteNetwork.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetwork.cpp
@@ -279,7 +279,8 @@
     using TContainer =
            mapbox::util::variant<std::vector<float>, std::vector<int>, std::vector<unsigned char>, std::vector<int8_t>>;
 
-    std::vector<TContainer> inputDataContainers;
+    std::vector<std::vector<TContainer>> inputs;
+    std::vector<std::vector<TContainer>> outputs;
 
     try
     {
@@ -298,6 +299,7 @@
         inferenceModelParams.m_CachedNetworkFilePath          = params.m_CachedNetworkFilePath;
         inferenceModelParams.m_NumberOfThreads                = params.m_NumberOfThreads;
         inferenceModelParams.m_MLGOTuningFilePath             = params.m_MLGOTuningFilePath;
+        inferenceModelParams.m_AsyncEnabled                   = params.m_Concurrent;
 
         for(const std::string& inputName: params.m_InputNames)
         {
@@ -324,106 +326,201 @@
                                                  runtime);
 
         const size_t numInputs = inferenceModelParams.m_InputBindings.size();
-        for(unsigned int i = 0; i < numInputs; ++i)
+
+        armnn::Optional<QuantizationParams> qParams = params.m_QuantizeInput ?
+                                                      armnn::MakeOptional<QuantizationParams>(
+                                                          model.GetInputQuantizationParams()) :
+                                                      armnn::EmptyOptional();
+
+        for(unsigned int j = 0; j < params.m_SimultaneousIterations ; ++j)
         {
-            armnn::Optional<QuantizationParams> qParams = params.m_QuantizeInput ?
-                                                          armnn::MakeOptional<QuantizationParams>(
-                                                                  model.GetInputQuantizationParams()) :
-                                                          armnn::EmptyOptional();
-
-            armnn::Optional<std::string> dataFile = params.m_GenerateTensorData ?
-                                                    armnn::EmptyOptional() :
-                                                    armnn::MakeOptional<std::string>(
-                                                            params.m_InputTensorDataFilePaths[i]);
-
-            unsigned int numElements = model.GetInputSize(i);
-            if (params.m_InputTensorShapes.size() > i && params.m_InputTensorShapes[i])
+            std::vector<TContainer> inputDataContainers;
+            for(unsigned int i = 0; i < numInputs; ++i)
             {
-                // If the user has provided a tensor shape for the current input,
-                // override numElements
-                numElements = params.m_InputTensorShapes[i]->GetNumElements();
+                armnn::Optional<std::string> dataFile = params.m_GenerateTensorData ?
+                                                        armnn::EmptyOptional() :
+                                                        armnn::MakeOptional<std::string>(
+                                                            params.m_InputTensorDataFilePaths[(j * numInputs) + i]);
+
+                unsigned int numElements = model.GetInputSize(i);
+                if (params.m_InputTensorShapes.size() > i && params.m_InputTensorShapes[i])
+                {
+                    // If the user has provided a tensor shape for the current input,
+                    // override numElements
+                    numElements = params.m_InputTensorShapes[i]->GetNumElements();
+                }
+
+                TContainer tensorData;
+                PopulateTensorWithData(tensorData,
+                                       numElements,
+                                       params.m_InputTypes[i],
+                                       qParams,
+                                       dataFile);
+
+                inputDataContainers.push_back(tensorData);
             }
-
-            TContainer tensorData;
-            PopulateTensorWithData(tensorData,
-                                   numElements,
-                                   params.m_InputTypes[i],
-                                   qParams,
-                                   dataFile);
-
-            inputDataContainers.push_back(tensorData);
+            inputs.push_back(inputDataContainers);
         }
 
         const size_t numOutputs = inferenceModelParams.m_OutputBindings.size();
-        std::vector<TContainer> outputDataContainers;
 
-        for (unsigned int i = 0; i < numOutputs; ++i)
+        for (unsigned int j = 0; j < params.m_SimultaneousIterations; ++j)
         {
-            if (params.m_OutputTypes[i].compare("float") == 0)
+            std::vector<TContainer> outputDataContainers;
+            for (unsigned int i = 0; i < numOutputs; ++i)
             {
-                outputDataContainers.push_back(std::vector<float>(model.GetOutputSize(i)));
-            }
-            else if (params.m_OutputTypes[i].compare("int") == 0)
-            {
-                outputDataContainers.push_back(std::vector<int>(model.GetOutputSize(i)));
-            }
-            else if (params.m_OutputTypes[i].compare("qasymm8") == 0)
-            {
-                outputDataContainers.push_back(std::vector<uint8_t>(model.GetOutputSize(i)));
-            }
-            else if (params.m_OutputTypes[i].compare("qsymms8") == 0)
-            {
-                outputDataContainers.push_back(std::vector<int8_t>(model.GetOutputSize(i)));
-            }
-            else
-            {
-                ARMNN_LOG(fatal) << "Unsupported tensor data type \"" << params.m_OutputTypes[i] << "\". ";
-                return EXIT_FAILURE;
-            }
-        }
-
-        for (size_t x = 0; x < params.m_Iterations; x++)
-        {
-            // model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds)
-            auto inference_duration = model.Run(inputDataContainers, outputDataContainers);
-
-            if (params.m_GenerateTensorData)
-            {
-                ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful";
-            }
-
-            // Print output tensors
-            const auto& infosOut = model.GetOutputBindingInfos();
-            for (size_t i = 0; i < numOutputs; i++)
-            {
-                const armnn::TensorInfo& infoOut = infosOut[i].second;
-                auto outputTensorFile = params.m_OutputTensorFiles.empty() ? "" : params.m_OutputTensorFiles[i];
-
-                TensorPrinter printer(inferenceModelParams.m_OutputBindings[i],
-                                      infoOut,
-                                      outputTensorFile,
-                                      params.m_DequantizeOutput);
-                mapbox::util::apply_visitor(printer, outputDataContainers[i]);
-            }
-
-            ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
-                            << std::fixed << inference_duration.count() << " ms\n";
-
-            // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
-            if (params.m_ThresholdTime != 0.0)
-            {
-                ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
-                                << std::fixed << params.m_ThresholdTime << " ms";
-                auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count();
-                ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
-                                << std::fixed << thresholdMinusInference << " ms" << "\n";
-
-                if (thresholdMinusInference < 0)
+                if (params.m_OutputTypes[i].compare("float") == 0)
                 {
-                    std::string errorMessage = "Elapsed inference time is greater than provided threshold time.";
-                    ARMNN_LOG(fatal) << errorMessage;
+                    outputDataContainers.push_back(std::vector<float>(model.GetOutputSize(i)));
+                } else if (params.m_OutputTypes[i].compare("int") == 0)
+                {
+                    outputDataContainers.push_back(std::vector<int>(model.GetOutputSize(i)));
+                } else if (params.m_OutputTypes[i].compare("qasymm8") == 0)
+                {
+                    outputDataContainers.push_back(std::vector<uint8_t>(model.GetOutputSize(i)));
+                } else if (params.m_OutputTypes[i].compare("qsymms8") == 0)
+                {
+                    outputDataContainers.push_back(std::vector<int8_t>(model.GetOutputSize(i)));
+                } else
+                {
+                    ARMNN_LOG(fatal) << "Unsupported tensor data type \"" << params.m_OutputTypes[i] << "\". ";
+                    return EXIT_FAILURE;
                 }
             }
+            outputs.push_back(outputDataContainers);
+        }
+
+        if (!params.m_Concurrent)
+        {
+            // Synchronous Execution
+            for (size_t x = 0; x < params.m_Iterations; x++)
+            {
+                // model.Run returns the inference time elapsed in EnqueueWorkload (in milliseconds)
+                auto inference_duration = model.Run(inputs[0], outputs[0]);
+
+                if (params.m_GenerateTensorData)
+                {
+                    ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful";
+                }
+
+                // Print output tensors
+                const auto& infosOut = model.GetOutputBindingInfos();
+                for (size_t i = 0; i < numOutputs; i++)
+                {
+                    const armnn::TensorInfo& infoOut = infosOut[i].second;
+                    auto outputTensorFile = params.m_OutputTensorFiles.empty() ? "" : params.m_OutputTensorFiles[i];
+
+                    TensorPrinter printer(inferenceModelParams.m_OutputBindings[i],
+                                          infoOut,
+                                          outputTensorFile,
+                                          params.m_DequantizeOutput);
+                    mapbox::util::apply_visitor(printer, outputs[0][i]);
+                }
+
+                ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
+                                << std::fixed << inference_duration.count() << " ms\n";
+
+                // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
+                if (params.m_ThresholdTime != 0.0)
+                {
+                    ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
+                                    << std::fixed << params.m_ThresholdTime << " ms";
+                    auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count();
+                    ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
+                                    << std::fixed << thresholdMinusInference << " ms" << "\n";
+
+                    if (thresholdMinusInference < 0)
+                    {
+                        std::string errorMessage = "Elapsed inference time is greater than provided threshold time.";
+                        ARMNN_LOG(fatal) << errorMessage;
+                    }
+                }
+            }
+        }
+        else
+        {
+            try
+            {
+                ARMNN_LOG(info) << "Asynchronous Execution...  \n";
+                std::vector<std::future<std::tuple<armnn::profiling::ProfilingGuid,
+                std::chrono::duration<double, std::milli>>>> inferenceResults;
+                inferenceResults.reserve(params.m_SimultaneousIterations);
+
+                // Create WorkingMemHandles for each inference
+                std::vector<std::unique_ptr<armnn::experimental::IWorkingMemHandle>> workingMemHandles;
+                workingMemHandles.reserve(params.m_SimultaneousIterations);
+                for (unsigned int i = 0; i < params.m_SimultaneousIterations; ++i)
+                {
+                    workingMemHandles.push_back(model.CreateWorkingMemHandle());
+                }
+
+                // Run each inference in its own thread
+                for (unsigned int i = 0; i < params.m_SimultaneousIterations; ++i)
+                {
+                    armnn::experimental::IWorkingMemHandle& workingMemHandleRef = *workingMemHandles[i].get();
+                    inferenceResults.push_back(std::async(
+                        std::launch::async, [&model, &workingMemHandleRef, &inputs, &outputs, i]() {
+                            return model.RunAsync(workingMemHandleRef, inputs[i], outputs[i]);
+                        }
+                        ));
+                }
+
+                // Check the results
+                for (unsigned int j = 0; j < inferenceResults.size(); ++j)
+                {
+                    // Get the results
+                    auto inferenceResult = inferenceResults[j].get();
+                    auto inference_duration = std::get<1>(inferenceResult);
+                    auto inferenceID = std::get<0>(inferenceResult);
+
+                    if (params.m_GenerateTensorData)
+                    {
+                        ARMNN_LOG(warning) << "The input data was generated, note that the output will not be useful";
+                    }
+
+                    // Print output tensors
+                    const auto& infosOut = model.GetOutputBindingInfos();
+                    for (size_t i = 0; i < numOutputs; i++)
+                    {
+                        const armnn::TensorInfo& infoOut = infosOut[i].second;
+                        auto outputTensorFile = params.m_OutputTensorFiles.empty()
+                                                ? ""
+                                                : params.m_OutputTensorFiles[(j * numOutputs) + i];
+
+                        TensorPrinter printer(inferenceModelParams.m_OutputBindings[i],
+                                              infoOut,
+                                              outputTensorFile,
+                                              params.m_DequantizeOutput);
+                        mapbox::util::apply_visitor(printer, outputs[j][i]);
+                    }
+
+                    ARMNN_LOG(info) << "\nInference time: " << std::setprecision(2)
+                                    << std::fixed << inference_duration.count() << " ms\n";
+
+                    // If thresholdTime == 0.0 (default), then it hasn't been supplied at command line
+                    if (params.m_ThresholdTime != 0.0)
+                    {
+                        ARMNN_LOG(info) << "Threshold time: " << std::setprecision(2)
+                                        << std::fixed << params.m_ThresholdTime << " ms";
+                        auto thresholdMinusInference = params.m_ThresholdTime - inference_duration.count();
+                        ARMNN_LOG(info) << "Threshold time - Inference time: " << std::setprecision(2)
+                                        << std::fixed << thresholdMinusInference << " ms" << "\n";
+
+                        if (thresholdMinusInference < 0)
+                        {
+                            ARMNN_LOG(fatal) << "Elapsed inference time is greater than provided threshold time. \n";
+                        }
+                    }
+                    ARMNN_LOG(info) << "Asynchronous Execution is finished for Inference ID: " << inferenceID << " \n";
+
+                }
+            }
+            catch (const armnn::Exception& e)
+            {
+                ARMNN_LOG(fatal) << "Armnn Error: " << e.what();
+                return EXIT_FAILURE;
+            }
+
         }
     }
     catch (const armnn::Exception& e)
diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
index 4e3b5e3..8f1cb0b 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkParams.cpp
@@ -145,6 +145,12 @@
 
         CheckModelFormat(m_ModelFormat);
 
+        // Check number of simultaneous iterations
+        if (m_SimultaneousIterations < 1)
+        {
+            ARMNN_LOG(fatal) << "simultaneous-iterations cannot be less than 1. ";
+        }
+
         // Check input tensor shapes
         if ((m_InputTensorShapes.size() != 0) &&
             (m_InputTensorShapes.size() != m_InputNames.size()))
@@ -159,10 +165,19 @@
                 ARMNN_LOG(fatal) << "One or more input data file paths are not valid. ";
             }
 
-            if (m_InputTensorDataFilePaths.size() != m_InputNames.size())
+            if (!m_Concurrent && m_InputTensorDataFilePaths.size() != m_InputNames.size())
             {
                 ARMNN_LOG(fatal) << "input-name and input-tensor-data must have the same amount of elements. ";
             }
+
+            if (m_InputTensorDataFilePaths.size() < m_SimultaneousIterations * m_InputNames.size())
+            {
+                ARMNN_LOG(fatal) << "There is not enough input data for " << m_SimultaneousIterations << " execution.";
+            }
+            if (m_InputTensorDataFilePaths.size() > m_SimultaneousIterations * m_InputNames.size())
+            {
+                ARMNN_LOG(fatal) << "There is more input data for " << m_SimultaneousIterations << " execution.";
+            }
         }
 
         if ((m_OutputTensorFiles.size() != 0) &&
@@ -171,6 +186,12 @@
             ARMNN_LOG(fatal) << "output-name and write-outputs-to-file must have the same amount of elements. ";
         }
 
+        if ((m_OutputTensorFiles.size() != 0)
+            && m_OutputTensorFiles.size() != m_SimultaneousIterations * m_OutputNames.size())
+        {
+            ARMNN_LOG(fatal) << "There is not enough output data for " << m_SimultaneousIterations << " execution.";
+        }
+
         if (m_InputTypes.size() == 0)
         {
             //Defaults the value of all inputs to "float"
diff --git a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
index a19eaa9..c325df1 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkParams.hpp
@@ -23,6 +23,7 @@
 
     std::string                   m_CachedNetworkFilePath;
     std::vector<armnn::BackendId> m_ComputeDevices;
+    bool                          m_Concurrent;
     bool                          m_DequantizeOutput;
     std::string                   m_DynamicBackendsPath;
     bool                          m_EnableBf16TurboMode;
@@ -49,6 +50,7 @@
     bool                          m_PrintIntermediate;
     bool                          m_QuantizeInput;
     bool                          m_SaveCachedNetwork;
+    size_t                        m_SimultaneousIterations;
     size_t                        m_SubgraphId;
     double                        m_ThresholdTime;
     int                           m_TuningLevel;
diff --git a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
index 286c970..042087e 100644
--- a/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
+++ b/tests/ExecuteNetwork/ExecuteNetworkProgramOptions.cpp
@@ -194,6 +194,10 @@
                  "If left empty (the default), dynamic backends will not be used.",
                  cxxopts::value<std::string>(m_RuntimeOptions.m_DynamicBackendsPath))
 
+                ("n,concurrent",
+                 "If this option is enabled inferences will be executed in parallel asynchronously.",
+                 cxxopts::value<bool>(m_ExNetParams.m_Concurrent)->default_value("false")->implicit_value("true"))
+
                 ("d,input-tensor-data",
                  "Path to files containing the input data as a flat array separated by whitespace. "
                  "Several paths can be passed by separating them with a comma. If not specified, the network will be "
@@ -278,7 +282,11 @@
                 ("D,armnn-tflite-delegate",
                  "Enable Arm NN TfLite delegate. "
                  "This option is depreciated please use tflite-executor instead",
-                 cxxopts::value<bool>(m_ExNetParams.m_EnableDelegate)->default_value("false")->implicit_value("true"));
+                 cxxopts::value<bool>(m_ExNetParams.m_EnableDelegate)->default_value("false")->implicit_value("true"))
+
+               ("simultaneous-iterations",
+                "Number of simultaneous iterations to async-run the network for, default is set to 1",
+                cxxopts::value<size_t>(m_ExNetParams.m_SimultaneousIterations)->default_value("1"));
 
         m_CxxOptions.add_options("c) Optimization")
                 ("bf16-turbo-mode",
diff --git a/tests/InferenceModel.hpp b/tests/InferenceModel.hpp
index cab594e..88c704c 100644
--- a/tests/InferenceModel.hpp
+++ b/tests/InferenceModel.hpp
@@ -101,6 +101,7 @@
     std::string                     m_CachedNetworkFilePath;
     unsigned int                    m_NumberOfThreads;
     std::string                     m_MLGOTuningFilePath;
+    bool                            m_AsyncEnabled;
 
 
     Params()
@@ -118,6 +119,7 @@
         , m_CachedNetworkFilePath("")
         , m_NumberOfThreads(0)
         , m_MLGOTuningFilePath("")
+        , m_AsyncEnabled(false)
     {}
 };
 
@@ -472,14 +474,14 @@
             optNet->SerializeToDot(file);
         }
 
-
-
         armnn::Status ret;
         {
             ARMNN_SCOPED_HEAP_PROFILING("LoadNetwork");
 
             const auto loading_start_time = armnn::GetTimeNow();
-            ret = m_Runtime->LoadNetwork(m_NetworkIdentifier, std::move(optNet));
+            armnn::INetworkProperties networkProperties(false, false, params.m_AsyncEnabled);
+            std::string errorMessage;
+            ret = m_Runtime->LoadNetwork(m_NetworkIdentifier, std::move(optNet), errorMessage, networkProperties);
 
             ARMNN_LOG(info) << "Network loading time: " << std::setprecision(2)
                             << std::fixed << armnn::GetTimeDuration(loading_start_time).count() << " ms\n";
@@ -553,7 +555,6 @@
         armnn::Status ret = m_Runtime->EnqueueWorkload(m_NetworkIdentifier,
                                                        MakeInputTensors(inputContainers),
                                                        MakeOutputTensors(outputContainers));
-
         const auto duration = armnn::GetTimeDuration(start_time);
 
         // if profiling is enabled print out the results
@@ -572,6 +573,63 @@
         }
     }
 
+    std::tuple<armnn::profiling::ProfilingGuid, std::chrono::duration<double, std::milli>> RunAsync(
+        armnn::experimental::IWorkingMemHandle& workingMemHandleRef,
+        const std::vector<TContainer>& inputContainers,
+        std::vector<TContainer>& outputContainers)
+    {
+        for (unsigned int i = 0; i < outputContainers.size(); ++i)
+        {
+            const unsigned int expectedOutputDataSize = GetOutputSize(i);
+
+            mapbox::util::apply_visitor([expectedOutputDataSize, i](auto&& value)
+            {
+                const unsigned int actualOutputDataSize   = armnn::numeric_cast<unsigned int>(value.size());
+                if (actualOutputDataSize < expectedOutputDataSize)
+                {
+                    unsigned int outputIndex = i;
+                    throw armnn::Exception(
+                            fmt::format("Not enough data for output #{0}: expected "
+                            "{1} elements, got {2}", outputIndex, expectedOutputDataSize, actualOutputDataSize));
+                }
+            },
+            outputContainers[i]);
+        }
+
+        std::shared_ptr<armnn::IProfiler> profiler = m_Runtime->GetProfiler(m_NetworkIdentifier);
+        if (profiler)
+        {
+            profiler->EnableProfiling(m_EnableProfiling);
+        }
+
+        // Start timer to record inference time in EnqueueWorkload (in milliseconds)
+        const auto start_time = armnn::GetTimeNow();
+
+        armnn::Status ret = m_Runtime->Execute(workingMemHandleRef,
+                                               MakeInputTensors(inputContainers),
+                                               MakeOutputTensors(outputContainers));
+        auto inferenceID = workingMemHandleRef.GetInferenceId();
+
+        const auto duration = armnn::GetTimeDuration(start_time);
+
+        // if profiling is enabled print out the results
+        if (profiler && profiler->IsProfilingEnabled())
+        {
+            profiler->Print(std::cout);
+        }
+
+        if (ret == armnn::Status::Failure)
+        {
+            throw armnn::Exception(
+                fmt::format("IRuntime::Execute asynchronously failed for network #{0} on inference #{1}",
+                            m_NetworkIdentifier, inferenceID));
+        }
+        else
+        {
+            return std::make_tuple(inferenceID, duration);
+        }
+    }
+
     const armnn::BindingPointInfo& GetInputBindingInfo(unsigned int inputIndex = 0u) const
     {
         CheckInputIndexIsValid(inputIndex);
@@ -618,6 +676,11 @@
         return quantizationParams;
     }
 
+    std::unique_ptr<armnn::experimental::IWorkingMemHandle> CreateWorkingMemHandle()
+    {
+        return m_Runtime->CreateWorkingMemHandle(m_NetworkIdentifier);
+    }
+
 private:
     armnn::NetworkId m_NetworkIdentifier;
     std::shared_ptr<armnn::IRuntime> m_Runtime;