IVGCVSW-5790 Merge async prototype

 * Added thread-safe execution mechanism for armnn (usage sketch below)
 * Removed duplicate function bool Compare(T a, T b, float tolerance)
 * Added StridedSliceAsyncEndToEndTest
 * Fixed memory leak
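
Minimal usage sketch of the new experimental API (illustrative only: the
network 'net', the CpuRef backend choice and the pre-populated inputTensors
and outputTensors containers are assumptions, not part of this change):

    using namespace armnn;
    using namespace armnn::experimental;

    IRuntimePtr runtime = IRuntime::Create(IRuntime::CreationOptions());

    // 'net' is an INetworkPtr assumed to have been built elsewhere.
    std::vector<BackendId> backends = { Compute::CpuRef };
    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());

    NetworkId networkId = 0;
    std::string errorMessage;
    INetworkProperties networkProperties;
    std::unique_ptr<IAsyncNetwork> asyncNetwork =
        runtime->CreateAsyncNetwork(networkId, std::move(optNet), errorMessage, networkProperties);

    // Each thread that wants to run inferences creates its own working memory handle.
    std::unique_ptr<IWorkingMemHandle> workingMemHandle = asyncNetwork->CreateWorkingMemHandle();

    // Thread safe: blocks until any other execution using the same handle has completed.
    asyncNetwork->Execute(inputTensors, outputTensors, *workingMemHandle);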

Signed-off-by: Mike Kelly <mike.kelly@arm.com>
Change-Id: I2d367fc77ee7c01b8953138543e76af5e691211f
diff --git a/Android.mk b/Android.mk
index aa89ff9..806d81b 100644
--- a/Android.mk
+++ b/Android.mk
@@ -108,6 +108,7 @@
         profiling/server/src/timelineDecoder/TimelineCaptureCommandHandler.cpp \
         profiling/server/src/timelineDecoder/TimelineDecoder.cpp \
         profiling/server/src/timelineDecoder/TimelineDirectoryCaptureCommandHandler.cpp \
+        src/armnn/AsyncNetwork.cpp \
         src/armnn/BackendHelper.cpp \
         src/armnn/BackendRegistry.cpp \
         src/armnn/Descriptors.cpp \
@@ -134,6 +135,7 @@
         src/armnn/TypesUtils.cpp \
         src/armnn/Utils.cpp \
         src/armnn/WallClockTimer.cpp \
+        src/armnn/WorkingMemHandle.cpp \
         src/armnnUtils/DataLayoutIndexed.cpp \
         src/armnnUtils/DotSerializer.cpp \
         src/armnnUtils/FloatingPointConverter.cpp \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4e75c28..62417be 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -242,12 +242,14 @@
     include/armnn/Descriptors.hpp
     include/armnn/DescriptorsFwd.hpp
     include/armnn/Exceptions.hpp
+    include/armnn/IAsyncNetwork.hpp
     include/armnn/ILayerSupport.hpp
     include/armnn/ILayerVisitor.hpp
     include/armnn/INetwork.hpp
     include/armnn/IProfiler.hpp
     include/armnn/IRuntime.hpp
     include/armnn/IStrategy.hpp
+    include/armnn/IWorkingMemHandle.hpp
     include/armnn/LayerSupport.hpp
     include/armnn/LayerVisitorBase.hpp
     include/armnn/Logging.hpp
@@ -406,6 +408,8 @@
     src/armnn/layers/TransposeLayer.cpp
     src/armnn/layers/UnmapLayer.cpp
     src/armnn/layers/UnmapLayer.hpp
+    src/armnn/AsyncNetwork.cpp
+    src/armnn/AsyncNetwork.hpp
     src/armnn/BackendRegistry.cpp
     src/armnn/BackendSettings.hpp
     src/armnn/BackendHelper.cpp
@@ -477,6 +481,9 @@
     src/armnn/Utils.cpp
     src/armnn/WallClockTimer.cpp
     src/armnn/WallClockTimer.hpp
+    src/armnn/WorkingMemDescriptor.hpp
+    src/armnn/WorkingMemHandle.cpp
+    src/armnn/WorkingMemHandle.hpp
     src/armnn/optimizations/AddBroadcastReshapeLayer.hpp
     src/armnn/optimizations/AddDebug.hpp
     src/armnn/optimizations/All.hpp
diff --git a/include/armnn/ArmNN.hpp b/include/armnn/ArmNN.hpp
index 4b945b9..ac4d33f 100644
--- a/include/armnn/ArmNN.hpp
+++ b/include/armnn/ArmNN.hpp
@@ -7,6 +7,7 @@
 #include "BackendId.hpp"
 #include "Descriptors.hpp"
 #include "Exceptions.hpp"
+#include "IAsyncNetwork.hpp"
 #include "INetwork.hpp"
 #include "IRuntime.hpp"
 #include "LstmParams.hpp"
diff --git a/include/armnn/IAsyncNetwork.hpp b/include/armnn/IAsyncNetwork.hpp
new file mode 100644
index 0000000..7ef83bb
--- /dev/null
+++ b/include/armnn/IAsyncNetwork.hpp
@@ -0,0 +1,51 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/NetworkFwd.hpp>
+
+#include "INetwork.hpp"
+#include "IProfiler.hpp"
+#include "IWorkingMemHandle.hpp"
+#include "Tensor.hpp"
+#include "Types.hpp"
+
+#include <mutex>
+
+namespace armnn
+{
+
+namespace experimental
+{
+
+class IAsyncNetwork
+{
+public:
+    virtual ~IAsyncNetwork() {};
+
+    virtual TensorInfo GetInputTensorInfo(LayerBindingId layerId) const = 0;
+    virtual TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const = 0;
+
+    /// Thread safe execution of the network. Returns once execution is complete.
+    /// Will block until this and any other thread using the same workingMemHandle object complete.
+    virtual Status Execute(const InputTensors& inputTensors,
+                           const OutputTensors& outputTensors,
+                           IWorkingMemHandle& workingMemHandle) = 0;
+
+    /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
+    /// overlapped Execution by calling this function from different threads.
+    virtual std::unique_ptr<IWorkingMemHandle> CreateWorkingMemHandle() = 0;
+
+    /// Get the profiler used for this network
+    virtual std::shared_ptr<IProfiler> GetProfiler() const = 0;
+
+    /// Register a debug callback function to be used with this network
+    virtual void RegisterDebugCallback(const DebugCallbackFunction& func) = 0;
+};
+
+} // end experimental namespace
+
+} // end armnn namespace
diff --git a/include/armnn/INetwork.hpp b/include/armnn/INetwork.hpp
index bceb074..2db6d5d 100644
--- a/include/armnn/INetwork.hpp
+++ b/include/armnn/INetwork.hpp
@@ -704,6 +704,12 @@
     std::unique_ptr<NetworkImpl> pNetworkImpl;
 };
 
+namespace experimental
+{
+class AsyncNetwork;
+class WorkingMemHandle;
+}
+
 struct BackendSettings;
 struct OptimizationResult;
 class OptimizedNetworkImpl;
@@ -723,6 +729,10 @@
 
 protected:
     friend class LoadedNetwork;
+
+    friend class experimental::AsyncNetwork;
+    friend class experimental::WorkingMemHandle;
+
     friend Graph& GetGraphForTesting(IOptimizedNetwork* optNetPtr);
     friend ModelOptions& GetModelOptionsForTesting(IOptimizedNetwork* optNetPtr);
     friend IOptimizedNetworkPtr Optimize(const INetwork& inNetwork,
diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp
index 9122089..9f70329 100644
--- a/include/armnn/IRuntime.hpp
+++ b/include/armnn/IRuntime.hpp
@@ -5,6 +5,7 @@
 #pragma once
 
 #include "BackendOptions.hpp"
+#include "IAsyncNetwork.hpp"
 #include "INetwork.hpp"
 #include "IProfiler.hpp"
 #include "Tensor.hpp"
@@ -37,6 +38,8 @@
     virtual ~INetworkProperties() {}
 };
 
+using namespace armnn::experimental;
+
 class IRuntime
 {
 public:
@@ -142,6 +145,20 @@
                        std::string& errorMessage,
                        const INetworkProperties& networkProperties);
 
+    /// This is an experimental function.
+    /// Creates an executable network. This network is thread safe, allowing multiple networks to be
+    /// loaded simultaneously via different threads.
+    /// Note that the network is never registered with the runtime so it does not need to be 'Unloaded'.
+    /// @param [out] networkIdOut Unique identifier for the network is returned in this reference.
+    /// @param [in] network Complete network to load into the IRuntime.
+    /// @param [out] errorMessage Error message if there were any errors.
+    /// @param [in] networkProperties The INetworkProperties that govern how the network should operate.
+    /// @return A unique pointer to the created IAsyncNetwork.
+    std::unique_ptr<IAsyncNetwork> CreateAsyncNetwork(NetworkId& networkIdOut,
+                                                      IOptimizedNetworkPtr network,
+                                                      std::string& errorMessage,
+                                                      const INetworkProperties& networkProperties);
+
     TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
     TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
 
diff --git a/include/armnn/IWorkingMemHandle.hpp b/include/armnn/IWorkingMemHandle.hpp
new file mode 100644
index 0000000..921b7e1
--- /dev/null
+++ b/include/armnn/IWorkingMemHandle.hpp
@@ -0,0 +1,46 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <mutex>
+
+namespace armnn
+{
+
+namespace experimental
+{
+
+struct WorkingMemDescriptor;
+
+class IWorkingMemHandle
+{
+public:
+    virtual ~IWorkingMemHandle() {};
+
+    /// Allocate the backing memory required for execution. If this is not called, then allocation will be
+    /// deferred to execution time. The mutex must be locked.
+    virtual void Allocate() = 0;
+
+    /// Free the backing memory required for execution. The mutex must be locked.
+    virtual void Free() = 0;
+
+    /// IsAllocated returns true if the backing memory is currently allocated. The mutex must be locked.
+    virtual bool IsAllocated() = 0;
+
+    /// Get a mutex which can be used for synchronizing access to the WorkingMemHandle object.
+    virtual std::mutex& GetMutex() = 0;
+
+    /// Get the WorkingMemDescriptor for a Layer. The mutex must be locked.
+    virtual WorkingMemDescriptor& GetWorkingMemDescriptor(LayerGuid id) = 0;
+
+    /// Get the WorkingMemDescriptor at an index. The WorkingMemDescriptors are stored in the same order as
+    /// the Workloads in a topologically sorted graph. The mutex must be locked.
+    virtual WorkingMemDescriptor& GetWorkingMemDescriptorAt(unsigned int id) = 0;
+};
+
+} // end experimental namespace
+
+} // end armnn namespace
diff --git a/include/armnn/NetworkFwd.hpp b/include/armnn/NetworkFwd.hpp
index 619839e..6c2970f 100644
--- a/include/armnn/NetworkFwd.hpp
+++ b/include/armnn/NetworkFwd.hpp
@@ -6,8 +6,17 @@
 
 namespace armnn
 {
+
 struct LstmInputParams;
 struct QuantizedLstmInputParams;
+
+namespace experimental
+{
+
+class IAsyncNetwork;
+
+} // end experimental namespace
+
 class INetwork;
 class IOptimizedNetwork;
 class Graph;
@@ -15,4 +24,5 @@
 class IOutputSlot;
 class IConnectableLayer;
 class IDataLayer;
-}
+
+} // end armnn namespace
diff --git a/include/armnn/backends/IWorkload.hpp b/include/armnn/backends/IWorkload.hpp
index 0bd8d2d..a4827eb 100644
--- a/include/armnn/backends/IWorkload.hpp
+++ b/include/armnn/backends/IWorkload.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd. All rights reserved.
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
@@ -9,6 +9,15 @@
 namespace armnn
 {
 
+namespace experimental
+{
+
+struct WorkingMemDescriptor;
+
+} // end experimental namespace
+
+using namespace armnn::experimental;
+
 /// Workload interface to enqueue a layer computation.
 class IWorkload {
 public:
@@ -18,9 +27,11 @@
 
     virtual void Execute() const = 0;
 
+    virtual void ExecuteAsync(WorkingMemDescriptor& desc) = 0;
+
     virtual profiling::ProfilingGuid GetGuid() const = 0;
 
-    virtual void RegisterDebugCallback(const DebugCallbackFunction & /*func*/) {}
+    virtual void RegisterDebugCallback(const DebugCallbackFunction& /*func*/) {}
 };
 
 } //namespace armnn
diff --git a/src/armnn/AsyncNetwork.cpp b/src/armnn/AsyncNetwork.cpp
new file mode 100644
index 0000000..4698bcf
--- /dev/null
+++ b/src/armnn/AsyncNetwork.cpp
@@ -0,0 +1,611 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "AsyncNetwork.hpp"
+#include "Graph.hpp"
+#include "Layer.hpp"
+#include "Profiling.hpp"
+
+#include <armnn/BackendRegistry.hpp>
+#include <armnn/Logging.hpp>
+#include <armnn/utility/Assert.hpp>
+
+#include <armnn/backends/IMemoryManager.hpp>
+#include <backendsCommon/CpuTensorHandle.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+#include <backendsCommon/MemCopyWorkload.hpp>
+#include <LabelsAndEventClasses.hpp>
+
+#include <fmt/format.h>
+
+namespace armnn
+{
+
+namespace experimental
+{
+
+void AddLayerStructure(std::unique_ptr<profiling::TimelineUtilityMethods>& timelineUtils,
+                       const Layer& layer,
+                       profiling::ProfilingGuid networkGuid)
+{
+    // Add layer to the post-optimisation network structure
+    std::string layerName = layer.GetNameStr().empty() ? "<Unnamed>" : layer.GetNameStr();
+    timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(),
+                                               networkGuid,
+                                               layerName,
+                                               profiling::LabelsAndEventClasses::LAYER_GUID);
+    for (auto&& input : layer.GetInputSlots())
+    {
+        const IOutputSlot* source = input.GetConnectedOutputSlot();
+        ARMNN_ASSERT(source != NULL);
+        timelineUtils->CreateConnectionRelationship(profiling::ProfilingRelationshipType::RetentionLink,
+                                                    source->GetOwningLayerGuid(),
+                                                    layer.GetGuid());
+    }
+}
+
+void AddWorkloadStructure(std::unique_ptr<profiling::TimelineUtilityMethods>& timelineUtils,
+                          std::unique_ptr<IWorkload>& workload,
+                          const Layer& layer)
+{
+    // Add workload to the post-optimisation network structure
+    timelineUtils->CreateTypedEntity(workload->GetGuid(), profiling::LabelsAndEventClasses::WORKLOAD_GUID);
+    timelineUtils->MarkEntityWithLabel(workload->GetGuid(),
+                                       layer.GetBackendId().Get(),
+                                       profiling::LabelsAndEventClasses::BACKENDID_GUID);
+
+    // Link the workload to the layer
+    timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink,
+                                      layer.GetGuid(),
+                                      workload->GetGuid(),
+                                      profiling::LabelsAndEventClasses::CHILD_GUID);
+}
+
+TensorInfo AsyncNetwork::GetInputTensorInfo(LayerBindingId layerId) const
+{
+    for (auto&& inputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetInputLayers())
+    {
+        ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot");
+        if (inputLayer->GetBindingId() == layerId)
+        {
+            return inputLayer->GetOutputSlot(0).GetTensorInfo();
+        }
+    }
+
+    throw InvalidArgumentException(fmt::format("No input layer is associated with id {0}}", layerId));
+}
+
+TensorInfo AsyncNetwork::GetOutputTensorInfo(LayerBindingId layerId) const
+{
+    for (auto&& outputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetOutputLayers())
+    {
+        ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot");
+        ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected");
+        if (outputLayer->GetBindingId() == layerId)
+        {
+            return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo();
+        }
+    }
+
+    throw InvalidArgumentException(fmt::format("No output layer is associated with id {0}}", layerId));
+}
+
+// Gather the input tensor handles for a layer, creating new handles where the producing layer's
+// outputs have not yet been visited.
+void AsyncNetwork::CollectInputTensorHandles(
+        std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles,
+        std::vector<ITensorHandle*>& inputs,
+        const armnn::Layer* layer,
+        const TensorHandleFactoryRegistry& registry,
+        const bool isMemoryManaged)
+{
+    for (auto&& inputSlot : layer->GetInputSlots())
+    {
+        // The graph must be well-formed at this point.
+        ARMNN_ASSERT(inputSlot.GetConnection());
+        auto outputSlot = inputSlot.GetConnectedOutputSlot();
+        auto key = outputSlot->GetOwningLayer().GetGuid();
+        auto search = tensorHandles.find(key);
+
+        if (search == tensorHandles.end())
+        {
+            ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
+            const TensorInfo& tensorInfo = outputSlot->GetTensorInfo();
+
+            ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
+            ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId);
+            ARMNN_ASSERT(handleFactory);
+            std::unique_ptr<ITensorHandle> tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
+            ITensorHandle* tensorPtr = tensor.release();
+            inputs.push_back(tensorPtr);
+        }
+        else
+        {
+            unsigned int index = outputSlot->CalculateIndexOnOwner();
+            inputs.push_back(search->second[index]);
+        }
+    }
+}
+
+void AsyncNetwork::CreateOutputTensorHandles(
+        std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles,
+        std::vector<ITensorHandle*>& outputs,
+        const armnn::Layer* layer,
+        const TensorHandleFactoryRegistry& registry,
+        const bool isMemoryManaged)
+{
+    auto guid = layer->GetGuid();
+    std::vector<ITensorHandle*> tensorHandleVectors;
+    tensorHandleVectors.reserve(layer->GetNumOutputSlots());
+
+    for (unsigned int idx = 0; idx < layer->GetNumOutputSlots(); idx++)
+    {
+        const OutputSlot& slot = layer->GetOutputSlot(idx);
+        ITensorHandleFactory::FactoryId factoryId = slot.GetTensorHandleFactoryId();
+        const TensorInfo& tensorInfo = slot.GetTensorInfo();
+
+        ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
+        ITensorHandleFactory* handleFactory = registry.GetFactory(factoryId);
+        ARMNN_ASSERT(handleFactory);
+        std::unique_ptr<ITensorHandle> tensor = handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
+        ITensorHandle* tensorPtr = tensor.release();
+        outputs.push_back(tensorPtr);
+        tensorHandleVectors.push_back(tensorPtr);
+    }
+    tensorHandles.insert({guid, tensorHandleVectors});
+}
+
+const IWorkloadFactory& AsyncNetwork::GetWorkloadFactory(const Layer& layer) const
+{
+    const IWorkloadFactory* workloadFactory = nullptr;
+
+    auto it = m_WorkloadFactories.find(layer.GetBackendId());
+    if (it == m_WorkloadFactories.end())
+    {
+        throw RuntimeException(
+                        fmt::format("No workload factory for {0} to be used for layer: {1}}",
+                                    layer.GetBackendId().Get(),
+                                    layer.GetNameStr()),
+                                    CHECK_LOCATION());
+    }
+
+    workloadFactory = it->second.first.get();
+
+    ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");
+
+    std::string reasonIfUnsupported;
+    ARMNN_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer, {}, reasonIfUnsupported),
+                     "Factory does not support layer");
+    IgnoreUnused(reasonIfUnsupported);
+    return *workloadFactory;
+}
+
+void AsyncNetwork::EnqueueInput(const BindableLayer& layer, const ConstTensor& inputTensor, WorkingMemHandle& context)
+{
+    if (layer.GetType() != LayerType::Input)
+    {
+        throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
+    }
+    LayerGuid id = layer.GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetGuid();
+    WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id);
+    ARMNN_ASSERT_MSG(descriptor.m_Outputs.size() == 1, "Can only handle Input Layer with one output");
+
+    MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags();
+    if (m_NetworkProperties.m_ImportEnabled)  // Try to import the input tensor
+    {
+        if (CheckFlag(importFlags, MemorySource::Malloc))
+        {
+            // This assumes a CPU Tensor handle
+            std::unique_ptr<ITensorHandle> tensorHandle =
+                    std::make_unique<ConstPassthroughCpuTensorHandle>(inputTensor.GetInfo(),
+                                                                      inputTensor.GetMemoryArea());
+
+            void* mem = tensorHandle->Map(false);
+            if (descriptor.m_Outputs[0]->Import(mem, MemorySource::Malloc))
+            {
+                tensorHandle->Unmap();
+                return;
+            }
+            tensorHandle->Unmap();
+            throw MemoryImportException("EnqueueInput: Memory Import failed");
+        }
+        else
+        {
+            throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import");
+        }
+    }
+    else
+    {
+        std::unique_ptr<ITensorHandle> tensorHandle =
+                std::make_unique<ConstPassthroughCpuTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
+
+        auto copyFunc = [](void* dst, const void* src, size_t size)
+        {
+            memcpy(dst, src, size);
+        };
+
+        for (const auto& input : descriptor.m_Inputs)
+        {
+            CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc);
+        }
+    }
+}
+
+void AsyncNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle)
+{
+    if (layer.GetType() != LayerType::Output)
+    {
+        throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
+    }
+    ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");
+
+    LayerGuid id = layer.GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayerGuid();
+    WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id);
+
+    ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0];
+    ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
+
+    // Try import the output tensor.
+    // Note: We can only import the output pointer if all of the following hold true:
+    // a) The imported pointer is aligned sufficiently
+    // b) The tensor has zero padding
+    // c) There is only one connection to the OutputSlot and it is to an OutputLayer.
+    // d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
+    // e) m_IsExportEnabled must be set to true
+    if (m_NetworkProperties.m_ExportEnabled &&
+        (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
+    {
+        if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
+        {
+            MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
+            if (CheckFlag(importFlags, MemorySource::Malloc))
+            {
+                std::unique_ptr<ITensorHandle> tensorHandle =
+                        std::make_unique<PassthroughCpuTensorHandle>(outputTensor.GetInfo(),
+                                                                     outputTensor.GetMemoryArea());
+
+                void* mem = tensorHandle->Map(false);
+                bool importOk = inputTensorHandle->Import(mem, MemorySource::Malloc);
+                tensorHandle->Unmap();
+
+                if (importOk)
+                {
+                    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute");
+                    descriptor.m_Inputs[0]->Map(true);
+                    descriptor.m_Inputs[0]->Unmap();
+                }
+                else
+                {
+                    throw MemoryExportException("EnqueueOutput: Memory Export failed");
+                }
+            }
+            else
+            {
+                throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export");
+            }
+        }
+        else
+        {
+            throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer");
+        }
+    }
+    else
+    {
+        auto copyFunc = [](void* dst, const void* src, size_t size)
+        {
+            memcpy(dst, src, size);
+        };
+
+        std::unique_ptr<ITensorHandle> tensorHandle =
+                std::make_unique<PassthroughCpuTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
+
+        CopyTensorContentsGeneric(descriptor.m_Outputs[0], tensorHandle.get(), copyFunc);
+    }
+}
+
+AsyncNetwork::AsyncNetwork(std::unique_ptr<IOptimizedNetwork> net,
+                           const INetworkProperties& networkProperties,
+                           profiling::ProfilingService& profilingService) :
+    m_OptimizedNetwork(std::move(net)),
+    m_NetworkProperties(networkProperties),
+    m_ProfilingService(profilingService)
+{
+    // Create a profiler and register it for the current thread.
+    m_Profiler = std::make_shared<IProfiler>();
+    ProfilerManager::GetInstance().RegisterProfiler(m_Profiler.get());
+
+    Graph &order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+
+    // First create tensor handlers, backends and workload factories.
+    // Handlers are created before workloads are,
+    // because workload creation can modify some of the handlers
+    // (for example the splitter and concat layers).
+    for (auto &&layer : order)
+    {
+        auto const &backendId = layer->GetBackendId();
+        if (m_Backends.count(backendId) == 0)
+        {
+            auto createBackend = BackendRegistryInstance().GetFactory(backendId);
+            auto it = m_Backends.emplace(std::make_pair(backendId, createBackend()));
+
+            IBackendInternal* backend = it.first->second.get();
+
+            if (backend->SupportsTensorAllocatorAPI())
+            {
+                backend->RegisterTensorHandleFactories(m_TensorHandleFactoryRegistry);
+
+                auto workloadFactory = backend->CreateWorkloadFactory(m_TensorHandleFactoryRegistry);
+                m_WorkloadFactories.emplace(
+                        std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr)));
+            }
+            else
+            {
+                IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager();
+                auto workloadFactory = backend->CreateWorkloadFactory(memoryManager);
+
+                m_WorkloadFactories.emplace(
+                        std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager)));
+            }
+        }
+    }
+
+    profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
+    std::unique_ptr<profiling::TimelineUtilityMethods> timelineUtils =
+            profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
+    if (timelineUtils)
+    {
+        timelineUtils->CreateTypedEntity(networkGuid, profiling::LabelsAndEventClasses::NETWORK_GUID);
+    }
+
+    // Then create workloads.
+    for (auto &&layer : order)
+    {
+        if (timelineUtils)
+        {
+            // Add layer to the post-optimisation network structure
+            AddLayerStructure(timelineUtils, *layer, networkGuid);
+        }
+
+        const IWorkloadFactory &workloadFactory = GetWorkloadFactory(*layer);
+
+        switch (layer->GetType())
+        {
+            case LayerType::Input:
+            case LayerType::Output:
+            {
+                // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
+                break;
+            }
+            default:
+            {
+                auto workload = layer->CreateWorkload(workloadFactory);
+
+                if (!workload)
+                {
+                    const char* const layerName =
+                            layer->GetNameStr().length() != 0 ? layer->GetName() : "<Unnamed>";
+                    throw InvalidArgumentException(
+                            fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')",
+                                        layerName,
+                                        static_cast<int>(layer->GetType()),
+                                        layer->GetBackendId().Get()
+                    ));
+                }
+
+                if (timelineUtils)
+                {
+                    // Add workload to the post-optimisation network structure
+                    AddWorkloadStructure(timelineUtils, workload, *layer);
+                }
+
+                m_WorkloadQueue.push_back(move(workload));
+                // Release the constant data in the layer.
+                layer->ReleaseConstantData();
+                break;
+            }
+        }
+    }
+
+    if (timelineUtils)
+    {
+        // Commit to send the post-optimisation network structure
+        timelineUtils->Commit();
+    }
+
+    // Now that the intermediate tensor memory has been set up, do any post allocation configuration for each workload.
+    // PostAllocationConfigure will now need to be handled in ExecuteAsync(WorkingMemDescriptor).
+    for (auto &workload : m_WorkloadQueue)
+    {
+        workload->PostAllocationConfigure();
+    }
+}
+
+Status AsyncNetwork::Execute(const InputTensors& inputTensors,
+                             const OutputTensors& outputTensors,
+                             IWorkingMemHandle& iWorkingMemHandle)
+{
+    const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
+
+    // Walk graph to determine the order of execution.
+    if (graph.GetNumLayers() < 2)
+    {
+        ARMNN_LOG(warning) << "AsyncNetwork::Execute()::Less than two nodes in graph";
+        return Status::Failure;
+    }
+
+    if (graph.GetNumInputs() != inputTensors.size())
+    {
+        throw InvalidArgumentException("Number of inputs provided does not match network.");
+    }
+
+    std::unique_ptr<profiling::TimelineUtilityMethods> timelineUtils =
+            profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
+    profiling::ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid();
+    if (timelineUtils)
+    {
+        // Add inference timeline trace if profiling is enabled.
+        profiling::ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
+        timelineUtils->CreateTypedEntity(inferenceGuid, profiling::LabelsAndEventClasses::INFERENCE_GUID);
+        timelineUtils->CreateRelationship(profiling::ProfilingRelationshipType::RetentionLink,
+                                          networkGuid,
+                                          inferenceGuid,
+                                          profiling::LabelsAndEventClasses::EXECUTION_OF_GUID);
+        timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
+    }
+
+    bool executionSucceeded = true;
+
+    if (timelineUtils)
+    {
+        // Add end of life of the inference timeline if profiling is enabled.
+        timelineUtils->RecordEvent(inferenceGuid, profiling::LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
+        timelineUtils->Commit();
+    }
+    WorkingMemHandle& workingMemHandle = dynamic_cast<WorkingMemHandle&>(iWorkingMemHandle);
+    std::lock_guard<std::mutex> lockGuard(workingMemHandle.GetMutex());
+
+    if (!workingMemHandle.IsAllocated())
+    {
+        workingMemHandle.Allocate();
+    }
+
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
+        unsigned int i = 0;
+
+        for (const BindableLayer* inputLayer : graph.GetInputLayers())
+        {
+            EnqueueInput(*inputLayer, inputTensors[i].second, workingMemHandle);
+            ++i;
+        }
+    }
+
+    auto Fail = [&](const std::exception& error)
+    {
+        ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
+        executionSucceeded = false;
+    };
+    profiling::ProfilingDynamicGuid workloadInferenceID(0);
+
+    try
+    {
+        for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i)
+        {
+            auto& workload = m_WorkloadQueue[i];
+            if (timelineUtils)
+            {
+                workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
+                                                                                                inferenceGuid);
+            }
+            workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i));
+
+            if (timelineUtils)
+            {
+                timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
+            }
+        }
+    }
+    catch (const RuntimeException& error)
+    {
+        Fail(error);
+    }
+    catch (const std::runtime_error& error)
+    {
+        Fail(error);
+    }
+    // For each output to the network, call EnqueueOutput with the data passed by the user.
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
+        unsigned int i = 0;
+
+        for (const BindableLayer* outputLayer : graph.GetOutputLayers())
+        {
+            EnqueueOutput(*outputLayer, outputTensors[i].second, workingMemHandle);
+            ++i;
+        }
+    }
+    return executionSucceeded ? Status::Success : Status::Failure;
+}
+
+/// Get the profiler used for this network
+std::shared_ptr<IProfiler> AsyncNetwork::GetProfiler() const
+{
+    return m_Profiler;
+}
+
+void AsyncNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
+{
+    for (auto&& workloadPtr: m_WorkloadQueue)
+    {
+        workloadPtr.get()->RegisterDebugCallback(func);
+    }
+}
+
+/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
+/// overlapped Execution by calling this function from different threads.
+std::unique_ptr<IWorkingMemHandle> AsyncNetwork::CreateWorkingMemHandle()
+{
+    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
+    std::unordered_map<LayerGuid, std::vector<ITensorHandle*> > tensorHandles;
+    std::vector<WorkingMemDescriptor> workingMemDescriptors;
+    std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
+
+    for (auto&& layer : order)
+    {
+        if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::Output)
+        {
+            continue;
+        }
+        WorkingMemDescriptor workingMemDescriptor;
+        // Look for the layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer.
+        // If Export is enabled, disable memory management so we can export, otherwise we do a copy.
+        if((layer->GetNumOutputSlots() == 1) &&
+           (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
+           (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
+        {
+            CollectInputTensorHandles(tensorHandles,
+                                      workingMemDescriptor.m_Inputs,
+                                      layer,
+                                      m_TensorHandleFactoryRegistry,
+                                      !m_NetworkProperties.m_ExportEnabled);
+            CreateOutputTensorHandles(tensorHandles,
+                                      workingMemDescriptor.m_Outputs,
+                                      layer,
+                                      m_TensorHandleFactoryRegistry,
+                                      !m_NetworkProperties.m_ExportEnabled);
+        }
+        else
+        {
+            CollectInputTensorHandles(tensorHandles,
+                                      workingMemDescriptor.m_Inputs,
+                                      layer,
+                                      m_TensorHandleFactoryRegistry);
+            CreateOutputTensorHandles(tensorHandles,
+                                      workingMemDescriptor.m_Outputs,
+                                      layer,
+                                      m_TensorHandleFactoryRegistry);
+        }
+        workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor});
+        workingMemDescriptors.push_back(workingMemDescriptor);
+    }
+    return std::make_unique<WorkingMemHandle>(workingMemDescriptors, workingMemDescriptorMap);
+}
+
+void AsyncNetwork::FreeWorkingMemory()
+{
+    // Inform the memory managers to release memory in their respective memory groups
+    for (auto&& workloadFactory : m_WorkloadFactories)
+    {
+        IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
+        if (memoryManager)
+        {
+            memoryManager->Release();
+        }
+    }
+    m_TensorHandleFactoryRegistry.ReleaseMemory();
+}
+
+} // end experimental namespace
+
+} // end armnn namespace
diff --git a/src/armnn/AsyncNetwork.hpp b/src/armnn/AsyncNetwork.hpp
new file mode 100644
index 0000000..9c525c5
--- /dev/null
+++ b/src/armnn/AsyncNetwork.hpp
@@ -0,0 +1,106 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/IAsyncNetwork.hpp>
+#include <armnn/Tensor.hpp>
+#include <armnn/Types.hpp>
+
+#include "LayerFwd.hpp"
+#include "Network.hpp"
+#include "Profiling.hpp"
+#include "WorkingMemHandle.hpp"
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <backendsCommon/TensorHandleFactoryRegistry.hpp>
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+#include <ProfilingService.hpp>
+#include <TimelineUtilityMethods.hpp>
+
+#include <unordered_map>
+
+namespace armnn
+{
+
+namespace experimental
+{
+
+class AsyncNetwork final : public IAsyncNetwork
+{
+public:
+    using WorkloadQueue = std::vector<std::unique_ptr<IWorkload>>;
+
+    AsyncNetwork(std::unique_ptr<IOptimizedNetwork> net,
+                 const INetworkProperties &networkProperties,
+                 profiling::ProfilingService &profilingService);
+
+    ~AsyncNetwork() { FreeWorkingMemory(); }
+
+    TensorInfo GetInputTensorInfo(LayerBindingId layerId) const override;
+    TensorInfo GetOutputTensorInfo(LayerBindingId layerId) const override;
+
+    /// Thread safe execution of the network. Returns once execution is complete.
+    /// Will block until this and any other thread using the same workingMemHandle object complete.
+    virtual Status Execute(const InputTensors& inputTensors,
+                           const OutputTensors& outputTensors,
+                           IWorkingMemHandle& workingMemHandle) override;
+
+    /// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
+    /// overlapped Execution by calling this function from different threads.
+    std::unique_ptr<IWorkingMemHandle> CreateWorkingMemHandle() override;
+
+    /// Get the profiler used for this network
+    std::shared_ptr<IProfiler> GetProfiler() const override;
+
+    /// Register a debug callback function to be used with this network
+    void RegisterDebugCallback(const DebugCallbackFunction& func) override;
+
+private:
+    void FreeWorkingMemory();
+
+    void CollectInputTensorHandles(std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles,
+                                   std::vector<ITensorHandle*>& inputs,
+                                   const armnn::Layer* layer,
+                                   const TensorHandleFactoryRegistry& registry,
+                                   const bool isMemoryManaged = false);
+
+    void CreateOutputTensorHandles(std::unordered_map<LayerGuid, std::vector<ITensorHandle*> >& tensorHandles,
+                                   std::vector<ITensorHandle*>& outputs,
+                                   const armnn::Layer* layer,
+                                   const TensorHandleFactoryRegistry& registry,
+                                   const bool isMemoryManaged = false);
+
+    void EnqueueInput(const BindableLayer& layer, const ConstTensor& inputTensor, WorkingMemHandle& handle);
+
+    void EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle);
+
+    using BackendPtrMap = std::unordered_map<BackendId, IBackendInternalUniquePtr>;
+
+    using WorkloadFactoryWithMemoryManager =
+            std::pair<IBackendInternal::IWorkloadFactoryPtr, IBackendInternal::IMemoryManagerSharedPtr>;
+
+    using WorkloadFactoryMap = std::unordered_map<BackendId, WorkloadFactoryWithMemoryManager>;
+
+    const IWorkloadFactory& GetWorkloadFactory(const Layer& layer) const;
+
+    BackendPtrMap m_Backends;
+    WorkloadFactoryMap m_WorkloadFactories;
+
+    std::unique_ptr<IOptimizedNetwork> m_OptimizedNetwork;
+    INetworkProperties m_NetworkProperties;
+    WorkloadQueue m_WorkloadQueue;
+    std::shared_ptr<IProfiler> m_Profiler;
+
+    TensorHandleFactoryRegistry m_TensorHandleFactoryRegistry;
+
+    /// Profiling Service Instance
+    profiling::ProfilingService& m_ProfilingService;
+};
+
+} // end experimental namespace
+
+} // end armnn namespace
diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp
index 9cc7b2c..5dc1ef9 100644
--- a/src/armnn/Runtime.cpp
+++ b/src/armnn/Runtime.cpp
@@ -64,6 +64,14 @@
     return pRuntimeImpl->LoadNetwork(networkIdOut, std::move(network), errorMessage, networkProperties);
 }
 
+std::unique_ptr<IAsyncNetwork> IRuntime::CreateAsyncNetwork(NetworkId& networkIdOut,
+                                                            IOptimizedNetworkPtr network,
+                                                            std::string& errorMessage,
+                                                            const INetworkProperties& networkProperties)
+{
+    return pRuntimeImpl->CreateAsyncNetwork(networkIdOut, std::move(network), errorMessage, networkProperties);
+}
+
 TensorInfo IRuntime::GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const
 {
     return pRuntimeImpl->GetInputTensorInfo(networkId, layerId);
@@ -165,6 +173,43 @@
     return Status::Success;
 }
 
+std::unique_ptr<IAsyncNetwork> RuntimeImpl::CreateAsyncNetwork(NetworkId& networkIdOut,
+                                                               IOptimizedNetworkPtr network,
+                                                               std::string&,
+                                                               const INetworkProperties& networkProperties)
+{
+    IOptimizedNetwork* rawNetwork = network.release();
+
+    networkIdOut = GenerateNetworkId();
+
+    for (auto&& context : m_BackendContexts)
+    {
+        context.second->BeforeLoadNetwork(networkIdOut);
+    }
+
+    unique_ptr<AsyncNetwork> asyncNetwork = std::make_unique<AsyncNetwork>(
+            std::unique_ptr<IOptimizedNetwork>(rawNetwork),
+            networkProperties,
+            m_ProfilingService);
+
+    if (!asyncNetwork)
+    {
+        return nullptr;
+    }
+
+    for (auto&& context : m_BackendContexts)
+    {
+        context.second->AfterLoadNetwork(networkIdOut);
+    }
+
+    if (m_ProfilingService.IsProfilingEnabled())
+    {
+        m_ProfilingService.IncrementCounterValue(armnn::profiling::NETWORK_LOADS);
+    }
+
+    return asyncNetwork;
+}
+
 Status RuntimeImpl::UnloadNetwork(NetworkId networkId)
 {
     bool unloadOk = true;
diff --git a/src/armnn/Runtime.hpp b/src/armnn/Runtime.hpp
index 2c7e07f..150012e 100644
--- a/src/armnn/Runtime.hpp
+++ b/src/armnn/Runtime.hpp
@@ -4,6 +4,7 @@
 //
 #pragma once
 
+#include "AsyncNetwork.hpp"
 #include "LoadedNetwork.hpp"
 #include "DeviceSpec.hpp"
 
@@ -55,6 +56,13 @@
     TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
     TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const;
 
+    // Create an Asynchronous Network from the IOptimizedNetworkPtr
+    std::unique_ptr<IAsyncNetwork> CreateAsyncNetwork(NetworkId& networkIdOut,
+                                                      IOptimizedNetworkPtr network,
+                                                      std::string& errorMessage,
+                                                      const INetworkProperties& networkProperties);
+
     // Evaluates network using input in inputTensors, outputs filled into outputTensors.
     Status EnqueueWorkload(NetworkId networkId,
         const InputTensors& inputTensors,
diff --git a/src/armnn/WorkingMemDescriptor.hpp b/src/armnn/WorkingMemDescriptor.hpp
new file mode 100644
index 0000000..688082e
--- /dev/null
+++ b/src/armnn/WorkingMemDescriptor.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/backends/ITensorHandle.hpp>
+
+#include <vector>
+
+namespace armnn
+{
+
+namespace experimental
+{
+
+struct WorkingMemDescriptor
+{
+    std::vector<ITensorHandle*> m_Inputs;
+    std::vector<ITensorHandle*> m_Outputs;
+
+    ~WorkingMemDescriptor() = default;
+    WorkingMemDescriptor() = default;
+};
+
+} // end experimental namespace
+
+} // end armnn namespace
diff --git a/src/armnn/WorkingMemHandle.cpp b/src/armnn/WorkingMemHandle.cpp
new file mode 100644
index 0000000..7a901b2
--- /dev/null
+++ b/src/armnn/WorkingMemHandle.cpp
@@ -0,0 +1,49 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "backendsCommon/CpuTensorHandle.hpp"
+#include "WorkingMemHandle.hpp"
+#include "Network.hpp"
+
+namespace armnn
+{
+
+namespace experimental
+{
+
+WorkingMemHandle::WorkingMemHandle(std::vector<WorkingMemDescriptor> workingMemDescriptors,
+                                   std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap) :
+    m_WorkingMemDescriptors(workingMemDescriptors),
+    m_WorkingMemDescriptorMap(workingMemDescriptorMap),
+    m_IsAllocated(false),
+    m_Mutex()
+{}
+
+void WorkingMemHandle::FreeWorkingMemory()
+{
+    for (auto workingMemDescriptor : m_WorkingMemDescriptors)
+    {
+        for (auto input : workingMemDescriptor.m_Inputs)
+        {
+            if (input)
+            {
+                delete input;
+                input = nullptr;
+            }
+        }
+        for (auto output : workingMemDescriptor.m_Outputs)
+        {
+            if (output)
+            {
+                delete output;
+                output = nullptr;
+            }
+        }
+    }
+}
+
+} // end experimental namespace
+
+} // end armnn namespace
diff --git a/src/armnn/WorkingMemHandle.hpp b/src/armnn/WorkingMemHandle.hpp
new file mode 100644
index 0000000..090f180
--- /dev/null
+++ b/src/armnn/WorkingMemHandle.hpp
@@ -0,0 +1,119 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "Layer.hpp"
+#include "Network.hpp"
+#include "WorkingMemDescriptor.hpp"
+
+#include <armnn/IWorkingMemHandle.hpp>
+#include <armnn/Tensor.hpp>
+
+#include <unordered_map>
+
+namespace armnn
+{
+
+namespace experimental
+{
+
+class WorkingMemHandle final : public IWorkingMemHandle
+{
+
+public:
+    WorkingMemHandle(std::vector<WorkingMemDescriptor> workingMemDescriptors,
+                     std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap);
+
+    ~WorkingMemHandle() { FreeWorkingMemory(); }
+
+    /// Allocate the backing memory required for execution. If this is not called, then allocation will be
+    /// deferred to execution time. The mutex must be locked.
+    void Allocate() override
+    {
+        if (m_IsAllocated)
+        {
+            return;
+        }
+        m_IsAllocated = true;
+
+        // Iterate through all WorkingMemDescriptors calling allocate() on each input and output in turn
+        for (auto workingMemDescriptor : m_WorkingMemDescriptors)
+        {
+            for (auto& input : workingMemDescriptor.m_Inputs)
+            {
+                input->Allocate();
+            }
+            for (auto& output : workingMemDescriptor.m_Outputs)
+            {
+                output->Allocate();
+            }
+        }
+    }
+
+    /// Free the backing memory required for execution. The mutex must be locked.
+    void Free() override
+    {
+        if (!m_IsAllocated)
+        {
+            return;
+        }
+        m_IsAllocated = false;
+
+        // Iterate through all WorkingMemDescriptors calling Unmap() on each input and output in turn
+        for (auto workingMemDescriptor : m_WorkingMemDescriptors)
+        {
+            for (auto& input : workingMemDescriptor.m_Inputs)
+            {
+                input->Unmap();
+            }
+            for (auto& output : workingMemDescriptor.m_Outputs)
+            {
+                output->Unmap();
+            }
+        }
+    }
+
+    /// IsAllocated returns true if the backing memory is currently allocated. The mutex must be locked.
+    bool IsAllocated() override
+    {
+        return m_IsAllocated;
+    }
+
+    /// Get a mutex which can be used for synchronizing access to the WorkingMemHandle object.
+    std::mutex& GetMutex() override
+    {
+        return m_Mutex;
+    }
+
+    /// Get the WorkingMemDescriptor for a Layer. The mutex must be locked.
+    WorkingMemDescriptor& GetWorkingMemDescriptor(LayerGuid id) override
+    {
+        auto result = m_WorkingMemDescriptorMap.find(id);
+        ARMNN_ASSERT(result != m_WorkingMemDescriptorMap.end());
+        return result->second;
+    }
+
+    /// Get the WorkingMemDescriptor at an index. The WorkingMemDescriptors are stored in the same order as
+    /// the Workloads in a topologically sorted graph. The mutex must be locked.
+    WorkingMemDescriptor& GetWorkingMemDescriptorAt(unsigned int id) override
+    {
+        return m_WorkingMemDescriptors[id];
+    }
+
+private:
+    void FreeWorkingMemory();
+
+    std::shared_ptr<ProfilerImpl> m_Profiler;
+
+    std::vector<WorkingMemDescriptor> m_WorkingMemDescriptors;
+    std::unordered_map<LayerGuid, WorkingMemDescriptor> m_WorkingMemDescriptorMap;
+    bool m_IsAllocated;
+    std::mutex m_Mutex;
+};
+
+} // end experimental namespace
+
+} // end armnn namespace
diff --git a/src/backends/backendsCommon/MemCopyWorkload.cpp b/src/backends/backendsCommon/MemCopyWorkload.cpp
index 7bdc05e..813adef 100644
--- a/src/backends/backendsCommon/MemCopyWorkload.cpp
+++ b/src/backends/backendsCommon/MemCopyWorkload.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -40,7 +40,7 @@
 
 
 CopyMemGenericWorkload::CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor,
-                                                         const WorkloadInfo& info)
+                                               const WorkloadInfo& info)
     : BaseWorkload<MemCopyQueueDescriptor>(descriptor, info)
 {
     GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
@@ -61,4 +61,21 @@
     }
 }
 
+void CopyMemGenericWorkload::ExecuteAsync(WorkingMemDescriptor& descriptor)
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CopyMemGeneric_Execute_WorkingMemDescriptor");
+    std::vector<TensorHandlePair> tensorHandlePairs;
+    GatherTensorHandlePairs(descriptor, tensorHandlePairs);
+
+    auto copyFunc = [](void* dst, const void* src, size_t size)
+    {
+        memcpy(dst, src, size);
+    };
+
+    for (const auto& pair : tensorHandlePairs)
+    {
+        CopyTensorContentsGeneric(pair.first, pair.second, copyFunc);
+    }
+}
+
 } //namespace armnn
diff --git a/src/backends/backendsCommon/MemCopyWorkload.hpp b/src/backends/backendsCommon/MemCopyWorkload.hpp
index 6529286..12664fd 100644
--- a/src/backends/backendsCommon/MemCopyWorkload.hpp
+++ b/src/backends/backendsCommon/MemCopyWorkload.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
@@ -19,6 +19,7 @@
 public:
     CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& descriptor) override;
 
 private:
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
diff --git a/src/backends/backendsCommon/MemSyncWorkload.cpp b/src/backends/backendsCommon/MemSyncWorkload.cpp
index b29c46e..fe04a30 100644
--- a/src/backends/backendsCommon/MemSyncWorkload.cpp
+++ b/src/backends/backendsCommon/MemSyncWorkload.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -14,7 +14,7 @@
 {
 
 SyncMemGenericWorkload::SyncMemGenericWorkload(const MemSyncQueueDescriptor& descriptor,
-                                                         const WorkloadInfo& info)
+                                               const WorkloadInfo& info)
     : BaseWorkload<MemSyncQueueDescriptor>(descriptor, info)
 {
     m_TensorHandle = descriptor.m_Inputs[0];
@@ -27,4 +27,11 @@
     m_TensorHandle->Unmap();
 }
 
+void SyncMemGenericWorkload::ExecuteAsync(WorkingMemDescriptor& descriptor)
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute_WorkingMemDescriptor");
+    descriptor.m_Inputs[0]->Map(true);
+    descriptor.m_Inputs[0]->Unmap();
+}
+
 } //namespace armnn
diff --git a/src/backends/backendsCommon/MemSyncWorkload.hpp b/src/backends/backendsCommon/MemSyncWorkload.hpp
index 0d44788..8142f18 100644
--- a/src/backends/backendsCommon/MemSyncWorkload.hpp
+++ b/src/backends/backendsCommon/MemSyncWorkload.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
@@ -19,6 +19,7 @@
 public:
     SyncMemGenericWorkload(const MemSyncQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& descriptor) override;
 
 private:
     ITensorHandle* m_TensorHandle;
diff --git a/src/backends/backendsCommon/Workload.hpp b/src/backends/backendsCommon/Workload.hpp
index 482f9bd..940b878 100644
--- a/src/backends/backendsCommon/Workload.hpp
+++ b/src/backends/backendsCommon/Workload.hpp
@@ -1,11 +1,12 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
 
 #include "WorkloadData.hpp"
 #include "WorkloadInfo.hpp"
+#include "WorkingMemDescriptor.hpp"
 
 #include <armnn/backends/IWorkload.hpp>
 #include <Profiling.hpp>
@@ -36,6 +37,8 @@
         m_Data.Validate(info);
     }
 
+    void ExecuteAsync(WorkingMemDescriptor&) override {};
+
     void PostAllocationConfigure() override {}
 
     const QueueDescriptor& GetData() const { return m_Data; }
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index d3857b8..9d36f52 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -51,6 +51,7 @@
     SpaceToDepthEndToEndTestImpl.cpp
     SpaceToDepthEndToEndTestImpl.hpp
     SplitterEndToEndTestImpl.hpp
+    StridedSliceAsyncEndToEndTest.hpp
     TensorCopyUtils.cpp
     TensorCopyUtils.hpp
     WorkloadFactoryHelper.hpp
diff --git a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
index 9ce4201..3a757d0 100644
--- a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
+++ b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
@@ -4,6 +4,8 @@
 //
 #pragma once
 
+#include "CommonTestUtils.hpp"
+
 #include <armnn/Descriptors.hpp>
 #include <armnn/INetwork.hpp>
 #include <armnn/IRuntime.hpp>
@@ -105,23 +107,6 @@
     );
 }
 
-// Utility template for comparing tensor elements
-template<DataType ArmnnType, typename T = ResolveType<ArmnnType>>
-bool Compare(T a, T b, float tolerance = 0.000001f)
-{
-    if (ArmnnType == DataType::Boolean)
-    {
-        // NOTE: Boolean is represented as uint8_t (with zero equals
-        // false and everything else equals true), therefore values
-        // need to be casted to bool before comparing them
-        return static_cast<bool>(a) == static_cast<bool>(b);
-    }
-
-    // NOTE: All other types can be cast to float and compared with
-    // a certain level of tolerance
-    return std::fabs(static_cast<float>(a) - static_cast<float>(b)) <= tolerance;
-}
-
 // Utility function to find the number of instances of a substring within a string.
 int SubStringCounter(std::string& string, std::string&& substring)
 {
diff --git a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
new file mode 100644
index 0000000..2ccd2b1
--- /dev/null
+++ b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
@@ -0,0 +1,178 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <ResolveType.hpp>
+
+#include <armnn/IWorkingMemHandle.hpp>
+#include <armnn/INetwork.hpp>
+
+#include <backendsCommon/test/CommonTestUtils.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+#include <vector>
+
+namespace armnn
+{
+
+namespace experimental
+{
+
+template<DataType ArmnnIType, DataType ArmnnOType,
+        typename TInput = ResolveType<ArmnnIType>, typename TOutput = ResolveType<ArmnnOType>>
+void AsyncEndToEndTestImpl(INetworkPtr network,
+                           const std::map<int, std::vector<TInput>>& inputTensorData,
+                           const std::map<int, std::vector<TOutput>>& expectedOutputData,
+                           std::vector<BackendId> backends,
+                           float tolerance = 0.000001f)
+{
+    // Create Runtime in which test will run
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Optimize the Network
+    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());
+
+    // Create the AsyncNetwork from the optimised network; optNet is moved into it
+    NetworkId networkId = 0;
+    std::string errorMessage;
+    const INetworkProperties networkProperties;
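+    // CreateAsyncNetwork consumes the optimised network; errorMessage is
+    // expected to be populated if creation fails.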
+    auto asyncNetwork = runtime->CreateAsyncNetwork(networkId, std::move(optNet), errorMessage, networkProperties);
+
+    InputTensors inputTensors;
+    inputTensors.reserve(inputTensorData.size());
+    for (auto&& it : inputTensorData)
+    {
+        inputTensors.push_back({it.first,
+                                ConstTensor(asyncNetwork->GetInputTensorInfo(it.first), it.second.data())});
+    }
+
+    OutputTensors outputTensors;
+    outputTensors.reserve(expectedOutputData.size());
+    std::map<int, std::vector<TOutput>> outputStorage;
+    for (auto&& it : expectedOutputData)
+    {
+        std::vector<TOutput> out(it.second.size());
+        outputStorage.emplace(it.first, out);
+        outputTensors.push_back({it.first,
+                                 Tensor(asyncNetwork->GetOutputTensorInfo(it.first),
+                                        outputStorage.at(it.first).data())});
+    }
+
+    // Create WorkingMemHandle for this async network
+    std::unique_ptr<IWorkingMemHandle> workingMemHandle = asyncNetwork->CreateWorkingMemHandle();
+    IWorkingMemHandle& workingMemHandleRef = *workingMemHandle.get();
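+    // The per-inference working memory lives in this handle rather than in the
+    // AsyncNetwork itself, so concurrent executions are expected to create one
+    // handle each.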
+
+    // Run the async network
+    asyncNetwork->Execute(inputTensors, outputTensors, workingMemHandleRef);
+
+    // Checks the results.
+    for (auto&& it : expectedOutputData)
+    {
+        std::vector<TOutput> out = outputStorage.at(it.first);
+        for (unsigned int i = 0; i < out.size(); ++i)
+        {
+            BOOST_CHECK(Compare<ArmnnOType>(it.second[i], out[i], tolerance));
+        }
+    }
+}
+
+template<armnn::DataType ArmnnType>
+INetworkPtr CreateStridedSliceNetwork(const TensorShape& inputShape,
+                                      const TensorShape& outputShape,
+                                      const std::vector<int>& beginData,
+                                      const std::vector<int>& endData,
+                                      const std::vector<int>& stridesData,
+                                      int beginMask = 0,
+                                      int endMask = 0,
+                                      int shrinkAxisMask = 0,
+                                      int ellipsisMask = 0,
+                                      int newAxisMask = 0,
+                                      const float qScale = 1.0f,
+                                      const int32_t qOffset = 0)
+{
+    using namespace armnn;
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    TensorInfo inputTensorInfo(inputShape, ArmnnType, qScale, qOffset);
+    TensorInfo outputTensorInfo(outputShape, ArmnnType, qScale, qOffset);
+
+    armnn::StridedSliceDescriptor stridedSliceDescriptor;
+    stridedSliceDescriptor.m_Begin = beginData;
+    stridedSliceDescriptor.m_End = endData;
+    stridedSliceDescriptor.m_Stride = stridesData;
+    stridedSliceDescriptor.m_BeginMask = beginMask;
+    stridedSliceDescriptor.m_EndMask = endMask;
+    stridedSliceDescriptor.m_ShrinkAxisMask = shrinkAxisMask;
+    stridedSliceDescriptor.m_EllipsisMask = ellipsisMask;
+    stridedSliceDescriptor.m_NewAxisMask = newAxisMask;
+
+    IConnectableLayer* input = net->AddInputLayer(0, "Input_Layer");
+    IConnectableLayer* stridedSlice = net->AddStridedSliceLayer(stridedSliceDescriptor, "stridedSlice");
+    IConnectableLayer* output = net->AddOutputLayer(0);
+
+    Connect(input, stridedSlice, inputTensorInfo, 0, 0);
+    Connect(stridedSlice, output, outputTensorInfo, 0, 0);
+
+    return net;
+}
+
+template<armnn::DataType ArmnnType>
+void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends)
+{
+    using namespace armnn;
+    using T = ResolveType<ArmnnType>;
+
+    const TensorShape& inputShape = {3, 2, 3, 1};
+    const TensorShape& outputShape = {1, 2, 3, 1};
+    const std::vector<int>& beginData = {1, 0, 0, 0};
+    const std::vector<int>& endData = {2, 2, 3, 1};
+    const std::vector<int>& stridesData = {1, 1, 1, 1};
+    int beginMask = 0;
+    int endMask = 0;
+    int shrinkAxisMask = 0;
+    int ellipsisMask = 0;
+    int newAxisMask = 0;
+
+    // Builds up the structure of the network
+    INetworkPtr net = CreateStridedSliceNetwork<ArmnnType>(inputShape,
+                                                           outputShape,
+                                                           beginData,
+                                                           endData,
+                                                           stridesData,
+                                                           beginMask,
+                                                           endMask,
+                                                           shrinkAxisMask,
+                                                           ellipsisMask,
+                                                           newAxisMask);
+
+    BOOST_TEST_CHECKPOINT("create a network");
+
+    // Creates structures for input & output.
+    std::vector<T> inputData{
+            1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
+
+            3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
+
+            5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
+    };
+
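+    // With begin = {1, 0, 0, 0}, end = {2, 2, 3, 1} and unit strides, the slice
+    // selects the middle 1x2x3x1 block of the input.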
+    std::vector<T> outputExpected{
+            3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f
+    };
+
+    std::map<int, std::vector<T>> inputTensorData = {{0, inputData}};
+    std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}};
+
+    AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(std::move(net), inputTensorData, expectedOutputData, backends);
+}
+
+} // namespace experimental
+
+} // namespace armnn
+
diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp
index b697481..521854b 100644
--- a/src/backends/reference/test/RefEndToEndTests.cpp
+++ b/src/backends/reference/test/RefEndToEndTests.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -25,6 +25,7 @@
 #include <backendsCommon/test/ResizeEndToEndTestImpl.hpp>
 #include <backendsCommon/test/SpaceToDepthEndToEndTestImpl.hpp>
 #include <backendsCommon/test/SplitterEndToEndTestImpl.hpp>
+#include <backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp>
 #include <backendsCommon/test/TransposeConvolution2dEndToEndTestImpl.hpp>
 
 #include <boost/test/unit_test.hpp>
@@ -1336,6 +1337,10 @@
     StridedSliceInvalidSliceEndToEndTest(defaultBackends);
 }
 
+BOOST_AUTO_TEST_CASE(RefAsyncFP32StridedSlicedEndToEndTest)
+{
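+    // Exercises IAsyncNetwork::Execute with a working memory handle on the
+    // reference backend.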
+    armnn::experimental::StridedSlicedEndToEndTest<armnn::DataType::Float32>(defaultBackends);
+}
 #endif
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/workloads/RefStridedSliceWorkload.cpp b/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
index 6a29439..ce807ee 100644
--- a/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
+++ b/src/backends/reference/workloads/RefStridedSliceWorkload.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -35,4 +35,24 @@
                  GetDataTypeSize(inputDataType));
 }
 
+void RefStridedSliceWorkload::ExecuteAsync(WorkingMemDescriptor& descriptor)
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefStridedSliceWorkload_Execute_WorkingMemDescriptor");
+
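+    // Unlike Execute(), the tensor handles come from the caller-supplied
+    // WorkingMemDescriptor rather than m_Data, so each invocation can operate
+    // on its own buffers.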
+    const TensorInfo& inputInfo  = GetTensorInfo(descriptor.m_Inputs[0]);
+    const TensorInfo& outputInfo = GetTensorInfo(descriptor.m_Outputs[0]);
+
+    DataType inputDataType  = inputInfo.GetDataType();
+    DataType outputDataType = outputInfo.GetDataType();
+
+    ARMNN_ASSERT(inputDataType == outputDataType);
+    IgnoreUnused(outputDataType);
+
+    StridedSlice(inputInfo,
+                 m_Data.m_Parameters,
+                 descriptor.m_Inputs[0]->Map(),
+                 descriptor.m_Outputs[0]->Map(),
+                 GetDataTypeSize(inputDataType));
+}
+
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefStridedSliceWorkload.hpp b/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
index 44aabc0..3e253ed 100644
--- a/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
+++ b/src/backends/reference/workloads/RefStridedSliceWorkload.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -15,6 +15,7 @@
 public:
     RefStridedSliceWorkload(const StridedSliceQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& descriptor) override;
 };
 
 } // namespace armnn