IVGCVSW-6527 Support the new memory API in loaded network

 * Enable external memory management for the Neon and Ref backends
 * Change m_TensorMemoryVector to hold shared pointers
 * Change the InputLayer's backend Id to match the backend Id of the connected layer

Signed-off-by: Finn Williams <finn.williams@arm.com>
Change-Id: I2216a724028312eb101b290df3f224177826b1a0
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index 7fb14d0..03e5ad5 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -131,11 +131,14 @@
 
     profiler->EnableNetworkDetailsToStdOut(networkProperties.m_OutputNetworkDetailsMethod);
 
-    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
     //First create tensor handlers, backends and workload factories.
     //Handlers are created before workloads are.
     //Because workload creation can modify some of the handlers,
     //(for example the splitter and concat layers).
+
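+    // Track whether any backend in this network uses externally managed memory and/or the internal memory manager.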
+    bool useExternalMemoryManager = false;
+    bool useInternalMemoryManager = false;
+    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
     for (auto&& layer : order)
     {
         auto const& backendId = layer->GetBackendId();
@@ -154,25 +157,44 @@
                 throw BackendCapabilityException(er);
             }
 
+            if (networkProperties.m_AsyncEnabled &&
+                !HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},
+                backend->GetCapabilities()))
+            {
+                std::string er = backend->GetId();
+                er += " does not support ExternallyManagedMemory\n";
+                er += "AsyncEnabled networks require all backends to support ExternallyManagedMemory";
+                throw BackendCapabilityException(er);
+            }
+
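+            // Use externally managed memory for this backend if it advertises the capability and the
+            // network was loaded with async execution or external memory management enabled.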
+            if (HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},
+                              backend->GetCapabilities()) &&
+                (m_NetworkProperties.m_ExternalMemoryManagementEnabled || m_NetworkProperties.m_AsyncEnabled))
+            {
+                m_SupportsExternallyManagedMemory[backend->GetId()] = true;
+                useExternalMemoryManager = true;
+            }
+            else
+            {
+                m_SupportsExternallyManagedMemory[backend->GetId()] = false;
+                useInternalMemoryManager = true;
+            }
+
+            IBackendInternal::IWorkloadFactoryPtr workloadFactory;
             if (backend->SupportsTensorAllocatorAPI())
             {
-                auto workloadFactory = backend->CreateWorkloadFactory(
+                workloadFactory = backend->CreateWorkloadFactory(
                     m_TensorHandleFactoryRegistry,
                     m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(),
                     static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
                     static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
-                m_WorkloadFactories.emplace(
-                    std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr)));
             }
             else
             {
-                IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager();
-                auto workloadFactory = backend->CreateWorkloadFactory(
-                    memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
-
-                m_WorkloadFactories.emplace(
-                    std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager)));
+                m_BackendMemoryMangers.emplace_back(backend->CreateMemoryManager());
+                workloadFactory = backend->CreateWorkloadFactory(
+                        m_BackendMemoryMangers.back(), m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
             }
+            m_WorkloadFactories[backendId] = std::move(workloadFactory);
         }
     }
 
@@ -181,6 +203,7 @@
         for (auto&& layer : order)
         {
             auto& workloadFactory = GetWorkloadFactory(*layer);
+            bool supportsExternalManager = m_SupportsExternallyManagedMemory[layer->GetBackendId()];
 
             switch (layer->GetType())
             {
@@ -191,7 +214,12 @@
                     // to false when creating TensorHandles
                     layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
                                                workloadFactory,
-                                               !m_NetworkProperties.m_ImportEnabled);
+                                               !supportsExternalManager && !m_NetworkProperties.m_ImportEnabled);
+                    break;
+                }
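+                // Constant layer outputs always get memory managed tensor handles; constant memory is
+                // handled separately from the intermediate tensors (see AllocateAndExecuteConstantWorkloads*).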
+                case LayerType::Constant:
+                {
+                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, true);
                     break;
                 }
                 default:
@@ -199,16 +227,18 @@
                     // Look for a layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
                     // If Export is enabled disable memory management so we can export, otherwise we do a copy
                     if ((layer->GetNumOutputSlots() == 1) &&
-                        (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
-                        (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
+                       (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
+                       (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
                     {
                         layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
                                                    workloadFactory,
-                                                   !m_NetworkProperties.m_ExportEnabled);
+                                                   !supportsExternalManager && !m_NetworkProperties.m_ExportEnabled);
                     }
                     else
                     {
-                        layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
+                        layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
+                                                   workloadFactory,
+                                                   !supportsExternalManager);
                     }
                 }
             }
@@ -251,7 +281,8 @@
                     // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
                     break;
                 }
-                default: {
+                default:
+                {
                     auto workload = layer->CreateWorkload(workloadFactory);
 
                     if (!workload)
@@ -272,11 +303,16 @@
 
                     // For async networks ConstantWorkloads are managed exclusively by LoadedNetwork
                     // and are separated out from the other workloads
-                    if (networkProperties.m_AsyncEnabled && layer->GetType() == LayerType::Constant)
+                    if ((networkProperties.m_AsyncEnabled || useExternalMemoryManager) &&
+                        layer->GetType() == LayerType::Constant)
                     {
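+                        // Keep the constant's output tensor handle so it can be allocated and populated up front.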
+                        m_ConstantTensorHandles[layer->GetGuid()] =
+                                layer->GetOutputSlot(0).GetOutputHandler().GetData();
                         m_ConstantWorkloads[layer->GetGuid()] = std::move(workload);
-                    } else {
-                        m_WorkloadQueue.push_back(move(workload));
+                    }
+                    else
+                    {
+                        m_WorkloadQueue.push_back(std::move(workload));
                     }
 
                     // release the constant data in the layer..
@@ -289,7 +325,7 @@
 
     for (auto&& workloadFactory : m_WorkloadFactories)
     {
-        workloadFactory.second.first->AfterWorkloadsCreated();
+        workloadFactory.second->AfterWorkloadsCreated();
     }
 
     if (timelineUtils)
@@ -298,28 +334,90 @@
         timelineUtils->Commit();
     }
 
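+    // With external memory management, profile the lifetime of every intermediate tensor and let each
+    // backend's registered memory optimizer strategy (or the fallback m_ConstantStrategy) pack the
+    // resulting memory blocks into bins.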
+    if (useExternalMemoryManager)
+    {
+        if (networkProperties.m_AsyncEnabled)
+        {
+            CreateMemoryProfileAsync();
+        }
+        else
+        {
+            CreateMemoryProfile();
+        }
+
+        auto backendStrategyMap = BackendRegistryInstance().GetMemoryOptimizerStrategies();
+        for (auto& backendMemoryProfile : m_MemBlockMap)
+        {
+            const BackendId& backendId = backendMemoryProfile.first;
+            if (backendStrategyMap.find(backendId) != backendStrategyMap.end())
+            {
+                m_MemBinMap[backendId] = backendStrategyMap[backendId]->Optimize(backendMemoryProfile.second);
+            }
+            else
+            {
+                m_MemBinMap[backendId] = m_ConstantStrategy->Optimize(backendMemoryProfile.second);
+            }
+        }
+
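+        // For synchronous execution the LoadedNetwork owns the external memory manager;
+        // for async execution each WorkingMemHandle creates its own in CreateWorkingMemHandle().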
+        if (!networkProperties.m_AsyncEnabled)
+        {
+            m_ExternalMemoryManager = CreateExternalMemoryManger(m_TensorMemory);
+
+            // Sort m_TensorMemory, so its order matches m_Tensorhandles
+            std::sort(m_TensorMemory.begin(), m_TensorMemory.end(),
+                      [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
+                         const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
+                      {
+                          return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
+                      });
+        }
+    }
+
+    // Now that the intermediate tensor memory has been set up,
+    // do any post allocation configuration for each workload.
     if (!networkProperties.m_AsyncEnabled)
     {
-        // Set up memory.
-        m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
+        if (useInternalMemoryManager)
+        {
+            // Set up memory.
+            m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
+        }
 
-        // Now that the intermediate tensor memory has been set-up,
-        // do any post allocation configuration for each workload.
-        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_PostAllocationConfigure");
         for (auto &workload : m_WorkloadQueue)
         {
             workload->PostAllocationConfigure();
         }
     }
-    else
+
+    if (useExternalMemoryManager)
     {
-        AllocateAndExecuteConstantWorkloads();
+        if (!networkProperties.m_AsyncEnabled)
+        {
+            AllocateAndExecuteConstantWorkloads();
+        }
+        else
+        {
+            AllocateAndExecuteConstantWorkloadsAsync();
+        }
     }
 }
 
 void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
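+    // Allocate each constant tensor handle and run its workload once to populate the constant data.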
+    for (auto& pair : m_ConstantWorkloads)
+    {
+        auto tensorHandle = m_ConstantTensorHandles[pair.first];
+        tensorHandle->Allocate();
+        pair.second->Execute();
+    }
+}
+
+void LoadedNetwork::AllocateAndExecuteConstantWorkloadsAsync()
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
     Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
     for (auto&& layer : order)
     {
@@ -343,7 +441,6 @@
     }
 }
 
-
 void LoadedNetwork::SendNetworkStructure()
 {
     ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_SendNetworkStructure");
@@ -429,7 +526,7 @@
                                            CHECK_LOCATION());
     }
 
-    workloadFactory = it->second.first.get();
+    workloadFactory = it->second.get();
 
     ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");
 
@@ -780,9 +877,19 @@
     {
         return;
     }
-    for (auto&& workloadFactory : m_WorkloadFactories)
+
+    if (m_ExternalMemoryManager)
     {
-        IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
+        m_ExternalMemoryManager->Allocate();
+
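+        // Import the allocated memory into the tensor handles; m_TensorMemory is sorted to match m_Tensorhandles.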
+        for (unsigned int i = 0; i < m_TensorMemory.size(); ++i)
+        {
+            m_Tensorhandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second);
+        }
+    }
+
+    for (auto&& memoryManager : m_BackendMemoryMangers)
+    {
         if (memoryManager)
         {
             memoryManager->Acquire();
@@ -795,14 +902,20 @@
 void LoadedNetwork::FreeWorkingMemory()
 {
     std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
+
     if (!m_IsWorkingMemAllocated)
     {
         return;
     }
-    // Informs the memory managers to release memory in it's respective memory group
-    for (auto&& workloadFactory : m_WorkloadFactories)
+
+    if (m_ExternalMemoryManager)
     {
-        IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
+        m_ExternalMemoryManager->Deallocate();
+    }
+
+    // Informs the memory managers to release memory in its respective memory group
+    for (auto&& memoryManager : m_BackendMemoryMangers)
+    {
         if (memoryManager)
         {
             memoryManager->Release();
@@ -1392,37 +1505,16 @@
 std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
 {
     Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
-    std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > tensorHandleMap;
+
+    // Tensors that will need to be allocated internally within armnn
+    std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles;
+    // Tensors that will be allocated externally by the user
+    std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles;
+
     std::vector<WorkingMemDescriptor> workingMemDescriptors;
     std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
-    TensorHandleFactoryRegistry tensorHandleFactoryRegistry;
-    WorkloadFactoryMap workloadFactoryMap;
 
-    std::vector<std::shared_ptr<IMemoryManager>> memoryManagers;
-
-    for (auto const& backend : m_Backends)
-    {
-        if (backend.second->SupportsTensorAllocatorAPI())
-        {
-            backend.second->RegisterTensorHandleFactories(
-                tensorHandleFactoryRegistry,
-                static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
-                static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
-            memoryManagers.emplace_back(tensorHandleFactoryRegistry.GetMemoryManagers().back());
-        }
-        else
-        {
-            std::shared_ptr<IMemoryManager> memoryManager = backend.second->CreateMemoryManager();
-            auto workloadFactory = backend.second->CreateWorkloadFactory(
-                    memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
-
-            workloadFactoryMap.emplace(
-                    std::make_pair(backend.first, std::make_pair(std::move(workloadFactory), memoryManager)));
-            memoryManagers.emplace_back(memoryManager);
-        }
-    }
-
-    auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot, bool isMemoryManaged)
+    auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot)
     {
         ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
         const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
@@ -1431,28 +1523,30 @@
         {
             BackendId id = layer->GetBackendId();
             ARMNN_NO_DEPRECATE_WARN_BEGIN
-            return workloadFactoryMap.at(id).first->CreateTensorHandle(tensorInfo, isMemoryManaged);
+            return m_WorkloadFactories.at(id)->CreateTensorHandle(tensorInfo, false);
             ARMNN_NO_DEPRECATE_WARN_END
         }
         else
         {
-            ITensorHandleFactory* handleFactory = tensorHandleFactoryRegistry.GetFactory(factoryId);
+            ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
             ARMNN_ASSERT(handleFactory);
-            return handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
+            return handleFactory->CreateTensorHandle(tensorInfo, false);
         }
     };
 
     struct HandleInfo
     {
-        unsigned int m_ReferenceCount = 0;
-        bool isInputLayerHandle = false;
-        bool isOutputLayerHandle = false;
+        ITensorHandle* m_TensorHandle;
+
+        bool m_IsInputLayerHandle = false;
+        bool m_IsOutputLayerHandle = false;
 
         WorkingMemHandle::InputMemDescriptorCoords m_InputMemDescriptorCoords;
         WorkingMemHandle::OutputMemDescriptorCoords m_OutputMemDescriptorCoords;
     };
 
-    std::unordered_map<const ITensorHandle*, HandleInfo> handleReferenceCounts;
+    std::unordered_map<const OutputSlot*, HandleInfo> outputToHandleInfoMap;
+
     unsigned int layerIndex = 0;
     for (auto&& layer : order)
     {
@@ -1508,27 +1602,33 @@
                 }
             }
 
-            tensorHandleMap[layer->GetGuid()].emplace_back(GetTensorHandle(layer, slot, isMemoryManaged));
-            ITensorHandle* tensorHandle = tensorHandleMap[layer->GetGuid()].back().get();
+            ITensorHandle* tensorHandle;
+            if (isMemoryManaged)
+            {
+                managedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
+                tensorHandle = managedTensorHandles.back().get();
+            }
+            else
+            {
+                unmanagedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
+                tensorHandle = unmanagedTensorHandles.back().get();
+            }
 
             workingMemDescriptor.m_Outputs.push_back(tensorHandle);
-            tensorHandle->Manage();
-            unsigned int numConnections = slot.GetNumConnections();
-            ARMNN_ASSERT(numConnections != 0);
 
-            HandleInfo& handleInfo = handleReferenceCounts[tensorHandle];
-            handleInfo.m_ReferenceCount = numConnections;
+            HandleInfo& handleInfo = outputToHandleInfoMap[&slot];
+            handleInfo.m_TensorHandle = tensorHandle;
 
             // Store the coordinates of the current layer's OutputSlot that is connected to the OutputLayer
             if (isConnectedToOutputLayer)
             {
-                handleInfo.isOutputLayerHandle = true;
+                handleInfo.m_IsOutputLayerHandle = true;
                 handleInfo.m_OutputMemDescriptorCoords.m_OutputSlotCoords = {layerIndex, slotIndex};
             }
             // Store the LayerBindingId of the InputLayer
             if (isInputLayer)
             {
-                handleInfo.isInputLayerHandle = true;
+                handleInfo.m_IsInputLayerHandle = true;
                 LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
                 handleInfo.m_InputMemDescriptorCoords.m_LayerBindingId = bindingId;
             }
@@ -1557,20 +1657,19 @@
                 {
                     LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
 
-                    HandleInfo& handleInfo = handleReferenceCounts[tensorHandle];
-                    handleInfo.isOutputLayerHandle = true;
+                    HandleInfo& handleInfo = outputToHandleInfoMap[outputSlot];
+                    handleInfo.m_TensorHandle = tensorHandle;
+                    handleInfo.m_IsOutputLayerHandle = true;
                     handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId);
                     handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0});
                 }
                 continue;
             }
 
-            auto search = tensorHandleMap.find(key);
-            unsigned int index = outputSlot->CalculateIndexOnOwner();
-            ITensorHandle* inputTensorHandle = search->second[index].get();
-            workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
+            HandleInfo& handleInfo = outputToHandleInfoMap.at(outputSlot);
 
-            HandleInfo& handleInfo = handleReferenceCounts.at(inputTensorHandle);
+            ITensorHandle* inputTensorHandle = handleInfo.m_TensorHandle;
+            workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
 
             // Store the LayerBindingId of the OutputLayer
             if (isOutputLayer)
@@ -1581,25 +1680,18 @@
             }
             // In this case the layer is not an Output Layer but shares its input tensorhandle with an OutputLayer
             // It will need to be updated as well, if we swap out the tensorhandle
-            else if (handleInfo.isOutputLayerHandle)
+            else if (handleInfo.m_IsOutputLayerHandle)
             {
                 handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, slot.GetSlotIndex()});
             }
 
             // Store the coordinates of the InputSlots connected to the InputLayer
             // There can be more than one InputSlot connected to an InputLayer, so we use a vector
-            if (handleInfo.isInputLayerHandle)
+            if (handleInfo.m_IsInputLayerHandle)
             {
                 std::pair<LayerGuid, unsigned int> connectionLocation{layerIndex, slot.GetSlotIndex()};
                 handleInfo.m_InputMemDescriptorCoords.m_InputSlotCoords.emplace_back(connectionLocation);
             }
-
-            --handleInfo.m_ReferenceCount;
-            if (handleInfo.m_ReferenceCount == 0u)
-            {
-                // Stop managing lifetime of tensor handle
-                inputTensorHandle->Allocate();
-            }
         }
         workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor});
 
@@ -1612,17 +1704,29 @@
         }
     }
 
+    std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory;
+
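+    // Create this WorkingMemHandle's own external memory manager. tensorMemory receives one entry per
+    // externally managed tensor and is imported into the managed handles in WorkingMemHandle::Allocate().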
+    auto externalMemoryManager = CreateExternalMemoryManger(tensorMemory);
+
+    // Sort tensorMemory, so its order matches the outputSlot order
+    std::sort(tensorMemory.begin(), tensorMemory.end(),
+              [](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
+                 const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
+              {
+                  return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
+              });
+
     std::vector<WorkingMemHandle::InputMemDescriptorCoords> inputConnectionsInfo;
     std::vector<WorkingMemHandle::OutputMemDescriptorCoords> outputConnectionsInfo;
 
-    for (const auto& handleInfo: handleReferenceCounts)
+    for (const auto& handleInfo: outputToHandleInfoMap)
     {
-        if (handleInfo.second.isOutputLayerHandle)
+        if (handleInfo.second.m_IsOutputLayerHandle)
         {
             outputConnectionsInfo.emplace_back(handleInfo.second.m_OutputMemDescriptorCoords);
         }
 
-        if (handleInfo.second.isInputLayerHandle)
+        if (handleInfo.second.m_IsInputLayerHandle)
         {
             inputConnectionsInfo.emplace_back(handleInfo.second.m_InputMemDescriptorCoords);
         }
@@ -1633,8 +1737,10 @@
                                               outputConnectionsInfo,
                                               workingMemDescriptors,
                                               workingMemDescriptorMap,
-                                              memoryManagers,
-                                              std::move(tensorHandleMap));
+                                              std::move(externalMemoryManager),
+                                              std::move(tensorMemory),
+                                              std::move(managedTensorHandles),
+                                              std::move(unmanagedTensorHandles));
 }
 
 void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
@@ -1645,6 +1751,312 @@
     }
 }
 
+
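+// Profiles the lifetime of each intermediate tensor (one block per OutputSlot) for async execution,
+// where the intermediate tensor handles are created per WorkingMemHandle rather than per network.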
+void LoadedNetwork::CreateMemoryProfileAsync()
+{
+    struct PartialBlock
+    {
+        unsigned int m_StartOfLife;
+        unsigned int m_Lifetime;
+
+        size_t m_MemSize;
+        unsigned int m_Index;
+
+        BackendId m_BackendId;
+    };
+
+    auto align = [](size_t numToAlign)
+    {
+        const size_t alignment = sizeof(float);
+        return ((numToAlign + alignment - 1) / alignment) * alignment;
+    };
+
+    std::unordered_map<const OutputSlot*, PartialBlock> memBlockTrackerMap;
+
+    const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
+    const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
+
+    unsigned int timestep = 0;
+    unsigned int outputIndex = 0;
+    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+
+    for (auto&& layer : order)
+    {
+        const LayerType& layerType = layer->GetType();
+        // Don't manage memory if importing.
+        if (layerType == LayerType::Input && inputImportingEnabled)
+        {
+            continue;
+        }
+        // Don't manage memory if importing.
+        if (layerType == LayerType::Output && outputImportingEnabled
+            && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
+        {
+            continue;
+        }
+        // Constant layer memory cannot be shared, so it must persist for the lifetime of execution;
+        // it is managed separately.
+        if (layerType == LayerType::Constant)
+        {
+            continue;
+        }
+
+        BackendId backendId = layer->GetBackendId();
+        for (auto& outputSlot : layer->GetOutputSlots())
+        {
+            if (!m_SupportsExternallyManagedMemory[backendId])
+            {
+                continue;
+            }
+
+            PartialBlock partialBlock;
+
+            partialBlock.m_StartOfLife = timestep;
+
+            size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
+            partialBlock.m_MemSize = alignedSize;
+            partialBlock.m_Index = outputIndex++;
+            partialBlock.m_Lifetime = outputSlot.GetNumConnections();
+            partialBlock.m_BackendId = backendId;
+
+            if (partialBlock.m_Lifetime == 0)
+            {
+                m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+                                                                     partialBlock.m_StartOfLife,
+                                                                     partialBlock.m_MemSize,
+                                                                     0,
+                                                                     partialBlock.m_Index);
+            }
+            else
+            {
+                memBlockTrackerMap[&outputSlot] = partialBlock;
+            }
+        }
+
+        for (auto& inputSlot : layer->GetInputSlots())
+        {
+            const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
+            const LayerType& owningLayerType = connectedInputLayer.GetType();
+
+            if (owningLayerType == LayerType::Constant)
+            {
+                continue;
+            }
+            if (inputImportingEnabled && owningLayerType == LayerType::Input)
+            {
+                continue;
+            }
+
+            auto outputSlot = inputSlot.GetConnectedOutputSlot();
+
+            PartialBlock& partialBlock = memBlockTrackerMap.at(outputSlot);
+
+            auto& lifetime = partialBlock.m_Lifetime;
+            --lifetime;
+
+            if (lifetime == 0)
+            {
+                m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+                                                                     timestep,
+                                                                     partialBlock.m_MemSize,
+                                                                     0,
+                                                                     partialBlock.m_Index);
+            }
+        }
+        ++timestep;
+    }
+}
+
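+// Profiles the lifetime of each intermediate tensor, keyed on its root ITensorHandle so that
+// sub-tensors are accounted against their parent tensor's memory block.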
+void LoadedNetwork::CreateMemoryProfile()
+{
+    // Finds the first TensorHandle ancestor of a SubTensorHandle. If the ITensorHandle provided
+    // is a TensorHandle, the function just returns it
+    auto TraceSubTensorHandleAncestry = [](ITensorHandle* const subTensorHandle)
+    {
+        ITensorHandle* ancestor = subTensorHandle;
+        while (ancestor && ancestor->GetParent())
+        {
+            ancestor = ancestor->GetParent();
+        }
+        return ancestor;
+    };
+
+    struct PartialBlock
+    {
+        unsigned int m_StartOfLife;
+        unsigned int m_Lifetime;
+
+        size_t m_MemSize;
+        unsigned int m_Index;
+
+        BackendId m_BackendId;
+    };
+
+    auto align = [](size_t numToAlign)
+    {
+        const size_t alignment = sizeof(float);
+        return ((numToAlign + alignment - 1) / alignment) * alignment;
+    };
+
+    std::unordered_map<ITensorHandle*, PartialBlock> memBlockTrackerMap;
+
+    const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
+    const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
+
+    unsigned int timestep = 0;
+    unsigned int outputIndex = 0;
+    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
+
+    for (auto&& layer : order)
+    {
+        const LayerType& layerType = layer->GetType();
+        // Don't manage memory if importing.
+        if (layerType == LayerType::Input && inputImportingEnabled)
+        {
+            continue;
+        }
+        // Don't manage memory if importing.
+        if (layerType == LayerType::Output && outputImportingEnabled
+            && layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
+        {
+            continue;
+        }
+        // Constant layer memory cannot be shared, so it must persist for the lifetime of execution;
+        // it is managed separately.
+        if (layerType == LayerType::Constant)
+        {
+            continue;
+        }
+
+        BackendId backendId = layer->GetBackendId();
+        for (auto& outputSlot : layer->GetOutputSlots())
+        {
+            if (!m_SupportsExternallyManagedMemory[backendId])
+            {
+                continue;
+            }
+
+            ITensorHandle* tensorHandle = outputSlot.GetOutputHandler().GetData();
+            tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
+
+            if (memBlockTrackerMap.find(tensorHandle) == memBlockTrackerMap.end())
+            {
+                PartialBlock partialBlock;
+
+                partialBlock.m_StartOfLife = timestep;
+
+                size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
+                partialBlock.m_MemSize = alignedSize;
+                partialBlock.m_Index = outputIndex++;
+                partialBlock.m_Lifetime = outputSlot.GetNumConnections();
+                partialBlock.m_BackendId = backendId;
+
+                if (partialBlock.m_Lifetime == 0)
+                {
+                    m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+                                                                         partialBlock.m_StartOfLife,
+                                                                         partialBlock.m_MemSize,
+                                                                         0,
+                                                                         partialBlock.m_Index);
+                }
+                else
+                {
+                    memBlockTrackerMap[tensorHandle] = partialBlock;
+                }
+                m_Tensorhandles.push_back(tensorHandle);
+
+            }
+            else
+            {
+                memBlockTrackerMap.at(tensorHandle).m_Lifetime += outputSlot.GetNumConnections();
+            }
+        }
+
+        for (auto& inputSlot : layer->GetInputSlots())
+        {
+            const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
+            const LayerType& owningLayerType = connectedInputLayer.GetType();
+
+            if (owningLayerType == LayerType::Constant)
+            {
+                continue;
+            }
+            if (inputImportingEnabled && owningLayerType == LayerType::Input)
+            {
+                continue;
+            }
+            if (!m_SupportsExternallyManagedMemory[connectedInputLayer.GetBackendId()])
+            {
+                continue;
+            }
+
+            auto outputSlot = inputSlot.GetConnectedOutputSlot();
+
+            ITensorHandle* tensorHandle = outputSlot->GetOutputHandler().GetData();
+            tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
+
+            PartialBlock& partialBlock = memBlockTrackerMap.at(tensorHandle);
+
+            auto& lifetime = partialBlock.m_Lifetime;
+            --lifetime;
+
+            if (lifetime == 0)
+            {
+                m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
+                                                                     timestep,
+                                                                     partialBlock.m_MemSize,
+                                                                     0,
+                                                                     partialBlock.m_Index);
+            }
+        }
+        ++timestep;
+    }
+
+}
+
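+// Converts the optimized memory bins into BufferStorage/TensorMemory records and registers them with a
+// MemoryManager, using the backend's registered custom allocator where available.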
+std::unique_ptr<MemoryManager> LoadedNetwork::CreateExternalMemoryManger(
+        std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>>& tensorMemoryVec)
+{
+    std::unique_ptr<MemoryManager> memoryManager = std::make_unique<MemoryManager>();
+    auto allocatorMap = BackendRegistryInstance().GetAllocators();
+
+    for (auto& backend : m_MemBinMap)
+    {
+        std::vector<BufferStorage> bufferStorageVec;
+
+        std::shared_ptr<ICustomAllocator> backendAllocator;
+        if (allocatorMap.find(backend.first) != allocatorMap.end())
+        {
+            backendAllocator = allocatorMap[backend.first];
+        }
+        else
+        {
+            backendAllocator = m_Backends[backend.first]->GetDefaultAllocator();
+        }
+
+        for (auto& memBin : backend.second)
+        {
+            BufferStorage bufferStorage;
+            bufferStorage.m_BufferSize = memBin.m_MemSize;
+            bufferStorage.m_TensorMemoryVector.reserve(memBin.m_MemBlocks.size());
+
+            for (auto& memBlock : memBin.m_MemBlocks)
+            {
+                auto tensorMemory = std::make_shared<TensorMemory>(TensorMemory{memBlock.m_Offset, memBlock.m_Index});
+
+                tensorMemoryVec.emplace_back(tensorMemory, backendAllocator->GetMemorySourceType());
+                bufferStorage.m_TensorMemoryVector.emplace_back(tensorMemory);
+            }
+
+            bufferStorageVec.emplace_back(std::move(bufferStorage));
+        }
+
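+        // 4 is the type alignment, matching the sizeof(float) alignment used when profiling block sizes.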
+        memoryManager->StoreMemToAllocate(bufferStorageVec, backendAllocator, 4);
+    }
+
+    return memoryManager;
+}
+
 LayerBindingId LoadedNetwork::ValidateImportedInputID(ImportedInputId id)
 {
     try
diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp
index 71ceaa3..35c482c 100644
--- a/src/armnn/LoadedNetwork.hpp
+++ b/src/armnn/LoadedNetwork.hpp
@@ -10,9 +10,15 @@
 
 #include <armnn/Tensor.hpp>
 #include <armnn/backends/IBackendInternal.hpp>
+#include <armnn/backends/IMemoryOptimizerStrategy.hpp>
 #include <backendsCommon/TensorHandleFactoryRegistry.hpp>
 #include <backendsCommon/Workload.hpp>
 #include <backendsCommon/WorkloadFactory.hpp>
+#include <backendsCommon/DefaultAllocator.hpp>
+#include <backendsCommon/MemoryManager.hpp>
+#include <backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.hpp>
+
 #include <ProfilingService.hpp>
 #include <TimelineUtilityMethods.hpp>
 
@@ -89,16 +95,16 @@
     profiling::ProfilingGuid GetNetworkGuid();
 
 private:
-    using WorkloadFactoryWithMemoryManager =
-    std::pair<IBackendInternal::IWorkloadFactoryPtr, IBackendInternal::IMemoryManagerSharedPtr>;
 
-    using WorkloadFactoryMap = std::unordered_map<BackendId, WorkloadFactoryWithMemoryManager>;
 
     void AllocateWorkingMemory(std::lock_guard<std::mutex>& lock);
     void AllocateAndExecuteConstantWorkloads();
+    void AllocateAndExecuteConstantWorkloadsAsync();
 
-    std::unordered_map<LayerGuid, ITensorHandle* > m_ConstantTensorHandles;
-    std::unordered_map<LayerGuid, std::unique_ptr<IWorkload> > m_ConstantWorkloads;
+    std::unordered_map<LayerGuid, std::unique_ptr<IWorkload>> m_ConstantWorkloads;
+    std::unordered_map<LayerGuid, ITensorHandle*> m_ConstantTensorHandles;
+
+    std::unique_ptr<IMemoryOptimizerStrategy> m_ConstantStrategy = std::make_unique<SingleAxisPriorityList>();
 
     LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
                   const INetworkProperties& networkProperties,
@@ -120,9 +126,18 @@
     inline LayerBindingId ValidateImportedInputID(ImportedInputId id);
     inline LayerBindingId ValidateImportedOutputID(ImportedOutputId id);
 
+    void CreateMemoryProfile();
+    void CreateMemoryProfileAsync();
+
+    std::unique_ptr<MemoryManager> CreateExternalMemoryManger(
+            std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>>& tensorMemory);
+
     using BackendPtrMap = std::unordered_map<BackendId, IBackendInternalUniquePtr>;
 
-    BackendPtrMap       m_Backends;
+    BackendPtrMap  m_Backends;
+    std::vector<IBackendInternal::IMemoryManagerSharedPtr> m_BackendMemoryMangers;
+
+    using WorkloadFactoryMap = std::unordered_map<BackendId, IBackendInternal::IWorkloadFactoryPtr>;
     WorkloadFactoryMap  m_WorkloadFactories;
 
     std::unique_ptr<IOptimizedNetwork> m_OptimizedNetwork;
@@ -171,6 +186,17 @@
 
     ImportedInputId m_CurImportedInputId = 0;
     ImportedInputId m_CurImportedOutputId = 0;
+
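+    // Per-backend lifetime blocks produced by CreateMemoryProfile() / CreateMemoryProfileAsync().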
+    std::unordered_map<BackendId, std::vector<MemBlock>> m_MemBlockMap;
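+    // Per-backend memory bins produced by the memory optimizer strategies.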
+    std::unordered_map<BackendId, std::vector<MemBin>> m_MemBinMap;
+
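+    // Root tensor handles for the externally managed intermediate tensors; index-aligned with m_TensorMemory.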
+    std::vector<ITensorHandle*> m_Tensorhandles;
+
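+    // Memory (and its source) to be imported into m_Tensorhandles once m_ExternalMemoryManager has allocated.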
+    std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> m_TensorMemory;
+
+    std::unique_ptr<MemoryManager> m_ExternalMemoryManager;
+
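+    // Per-backend flag indicating whether that backend's tensor memory is externally managed.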
+    std::unordered_map<BackendId, bool> m_SupportsExternallyManagedMemory;
 };
 
 }
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index e00dbfc..17a1da1 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -934,6 +934,11 @@
     {
         auto layer = *it;
 
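+        // InputLayers are assigned a backend in a second pass below, based on the layer they are connected to.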
+        if (layer->GetType() == LayerType::Input)
+        {
+            continue;
+        }
+
         DataType dataTypeIn  = layer->GetNumInputSlots() == 0 ? DataType::Float32 :
             layer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo().GetDataType();
         DataType dataTypeOut = layer->GetNumOutputSlots() == 0 ? DataType::Float32 :
@@ -1027,6 +1032,17 @@
         }
     }
 
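+    // Second pass: set each InputLayer's backend to match the backend of the layer it is connected to.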
+    for (auto it = firstLayer; it != lastLayer; ++it)
+    {
+        auto layer = *it;
+
+        if (layer->GetType() == LayerType::Input)
+        {
+            BackendId connectedBackendId = layer->GetOutputSlot(0).GetConnection(0)->GetOwningLayer().GetBackendId();
+            layer->SetBackendId(connectedBackendId);
+        }
+    }
+
     return result;
 }
 
diff --git a/src/armnn/WorkingMemHandle.cpp b/src/armnn/WorkingMemHandle.cpp
index e2ad52a..2cb47fb 100644
--- a/src/armnn/WorkingMemHandle.cpp
+++ b/src/armnn/WorkingMemHandle.cpp
@@ -17,16 +17,20 @@
 
 WorkingMemHandle::WorkingMemHandle(NetworkId networkId,
         std::vector<InputMemDescriptorCoords> inputLayerInfo,
-        std::vector<OutputMemDescriptorCoords> ouputLayerInfo,
+        std::vector<OutputMemDescriptorCoords> outputLayerInfo,
         std::vector<WorkingMemDescriptor> workingMemDescriptors,
         std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap,
-        std::vector<std::shared_ptr<IMemoryManager>> memoryManagers,
-        std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > ownedTensorHandles)
+        std::unique_ptr<MemoryManager> memoryManager,
+        std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory,
+        std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles,
+        std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles)
     : m_NetworkId(networkId)
     , m_WorkingMemDescriptors(workingMemDescriptors)
     , m_WorkingMemDescriptorMap(workingMemDescriptorMap)
-    , m_MemoryManagers(memoryManagers)
-    , m_OwnedTensorHandles(std::move(ownedTensorHandles))
+    , m_MemoryManager(std::move(memoryManager))
+    , m_TensorMemory(std::move(tensorMemory))
+    , m_ManagedTensorHandles(std::move(managedTensorHandles))
+    , m_UnmanagedTensorHandles(std::move(unmanagedTensorHandles))
     , m_InputSize(numeric_cast<DifferenceType>(inputLayerInfo.size()))
     , m_IsAllocated(false)
 {
@@ -54,7 +58,7 @@
         }
     }
     size_t bindingIdCount = inputLayerInfo.size();
-    for (const auto& outputInfo : ouputLayerInfo)
+    for (const auto& outputInfo : outputLayerInfo)
     {
         for (auto bindingId : outputInfo.m_LayerBindingIds)
         {
@@ -88,6 +92,7 @@
         }
     }
     m_BindingIdVec = std::vector<LayerBindingId>(bindingIdCount);
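+    // m_UnmanagedTensorHandles only needs to own its handles; referencing it here avoids unused-member warnings.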
+    IgnoreUnused(m_UnmanagedTensorHandles);
 }
 
 void WorkingMemHandle::Allocate()
@@ -98,9 +103,11 @@
     }
     m_IsAllocated = true;
 
-    for (auto& mgr : m_MemoryManagers)
+    m_MemoryManager->Allocate();
+
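+    // Import the freshly allocated memory into each managed tensor handle.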
+    for (unsigned int i = 0; i < m_TensorMemory.size(); ++i)
     {
-        mgr->Acquire();
+        m_ManagedTensorHandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second);
     }
 }
 
@@ -112,10 +119,7 @@
     }
     m_IsAllocated = false;
 
-    for (auto& mgr : m_MemoryManagers)
-    {
-        mgr->Release();
-    }
+    m_MemoryManager->Deallocate();
 }
 
 void WorkingMemHandle::MemSyncOutputs()
diff --git a/src/armnn/WorkingMemHandle.hpp b/src/armnn/WorkingMemHandle.hpp
index 9078a8d..bca1d2d 100644
--- a/src/armnn/WorkingMemHandle.hpp
+++ b/src/armnn/WorkingMemHandle.hpp
@@ -14,6 +14,7 @@
 
 #include <unordered_map>
 #include <mutex>
+#include <backendsCommon/MemoryManager.hpp>
 
 namespace armnn
 {
@@ -45,11 +46,13 @@
 
     WorkingMemHandle(NetworkId networkId,
                      std::vector<InputMemDescriptorCoords> inputLayerInfo,
-                     std::vector<OutputMemDescriptorCoords> ouputLayerInfo,
+                     std::vector<OutputMemDescriptorCoords> outputLayerInfo,
                      std::vector<WorkingMemDescriptor> workingMemDescriptors,
                      std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap,
-                     std::vector<std::shared_ptr<IMemoryManager>> memoryManagers,
-                     std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > > ownedTensorHandles);
+                     std::unique_ptr<MemoryManager> memoryManager,
+                     std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory,
+                     std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles,
+                     std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles);
 
     ~WorkingMemHandle()
     { Free(); }
@@ -128,11 +131,17 @@
     std::vector<WorkingMemDescriptor> m_WorkingMemDescriptors;
     std::unordered_map<LayerGuid, WorkingMemDescriptor> m_WorkingMemDescriptorMap;
 
-    // Vector of IMemoryManagers that manage the WorkingMemHandle's memory
-    std::vector<std::shared_ptr<IMemoryManager>> m_MemoryManagers;
-    // TensorHandles owned by this WorkingMemHandle
-    // constant tensor's can be shared by multiple WorkingMemHandles and so will not be stored here
-    std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle> > >  m_OwnedTensorHandles;
+    std::unique_ptr<MemoryManager> m_MemoryManager;
+
+    // Memory to be imported into the tensorHandles after allocation
+    std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> m_TensorMemory;
+
+
+    // Tensors that will need to be allocated internally within armnn
+    std::vector<std::unique_ptr<ITensorHandle>> m_ManagedTensorHandles;
+
+    // Tensors that will be allocated externally by the user
+    std::vector<std::unique_ptr<ITensorHandle>> m_UnmanagedTensorHandles;
 
     std::unordered_map<LayerBindingId, bool> m_InputValidationMap;
     std::unordered_map<LayerBindingId, bool> m_OutputValidationMap;
diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp
index 8416a8d..3cea1b5 100644
--- a/src/armnn/test/OptimizerTests.cpp
+++ b/src/armnn/test/OptimizerTests.cpp
@@ -714,7 +714,8 @@
                 case armnn::LayerType::Input:
                 {
                     auto inputLayer = PolymorphicDowncast<const InputLayer*>(layer);
-                    CHECK((inputLayer->GetBackendId() == "MockBackend"));
+                    const auto connectedLayerBackendId = inputLayer->GetOutputSlot(0).GetOwningLayer().GetBackendId();
+                    CHECK((inputLayer->GetBackendId() == connectedLayerBackendId));
                     break;
                 }
                 case armnn::LayerType::Output:
diff --git a/src/backends/backendsCommon/DefaultAllocator.hpp b/src/backends/backendsCommon/DefaultAllocator.hpp
index 2451db3..cf0f177 100644
--- a/src/backends/backendsCommon/DefaultAllocator.hpp
+++ b/src/backends/backendsCommon/DefaultAllocator.hpp
@@ -22,12 +22,12 @@
     void* allocate(size_t size, size_t alignment = 0) override
     {
         IgnoreUnused(alignment);
-        return ::operator new(size);
+        return ::operator new(size_t(size));
     }
 
     void free(void* ptr) override
     {
-        std::free(ptr);
+        ::operator delete(ptr);
     }
 
     armnn::MemorySource GetMemorySourceType() override
diff --git a/src/backends/backendsCommon/MemoryManager.cpp b/src/backends/backendsCommon/MemoryManager.cpp
index 1c109c3..77cab27 100644
--- a/src/backends/backendsCommon/MemoryManager.cpp
+++ b/src/backends/backendsCommon/MemoryManager.cpp
@@ -11,7 +11,7 @@
 {
 
 void MemoryManager::StoreMemToAllocate(std::vector<BufferStorage> bufferStorageVector,
-                                       ICustomAllocator* customAllocator,
+                                       std::shared_ptr<ICustomAllocator> customAllocator,
                                        const size_t typeAlignment)
 {
     IgnoreUnused(typeAlignment);
diff --git a/src/backends/backendsCommon/MemoryManager.hpp b/src/backends/backendsCommon/MemoryManager.hpp
index cbd6fcf..5113b231 100644
--- a/src/backends/backendsCommon/MemoryManager.hpp
+++ b/src/backends/backendsCommon/MemoryManager.hpp
@@ -2,6 +2,7 @@
 // Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
+#pragma once
 
 #include <armnn/backends/ICustomAllocator.hpp>
 
@@ -10,7 +11,7 @@
 struct Allocator
 {
     /// Pointer to @ICustomAllocator.
-    ICustomAllocator* m_CustomAllocator{};
+    std::shared_ptr<ICustomAllocator> m_CustomAllocator{};
     /// Value which the size of each buffer (actual data size + padding) has to be a multiple of.
     size_t m_Alignment = 0 ;
 };
@@ -19,16 +20,16 @@
 {
     /// Number of bytes the value is away from the @BufferStorage.m_Buffer.
     size_t m_Offset{};
-    /// Pointer to the tensor value.
-    void* m_Data = nullptr;
     /// Identifier to be used by the @LoadedNetwork to order the tensors.
     unsigned int m_OutputSlotId{};
+    /// Pointer to the tensor value.
+    void* m_Data = nullptr;
 };
 
 struct BufferStorage
 {
     /// Vector of pointer to @TensorMemory.
-    std::vector<TensorMemory*> m_TensorMemoryVector;
+    std::vector<std::shared_ptr<TensorMemory>> m_TensorMemoryVector;
     /// Total size of the buffer.
     size_t m_BufferSize;
     /// Pointer to the first element of the buffer.
@@ -43,7 +44,7 @@
     /// @param[in] customAllocator - Pointer to @ICustomAllocator.
     /// @param[in] typeAlignment - Optional parameter. Value of which the size of each value has to be multiple of.
     void StoreMemToAllocate(std::vector<BufferStorage> bufferStorageVector,
-                            ICustomAllocator* customAllocator,
+                            std::shared_ptr<ICustomAllocator> customAllocator,
                             size_t typeAlignment = 0);
 
     /// Allocate the amount of memory indicated by @m_BufferSize, and
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index a77ec06..56c9d65 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -17,6 +17,7 @@
     MapWorkload.cpp \
     MemCopyWorkload.cpp \
     MemImportWorkload.cpp \
+    MemoryManager.cpp \
     MemSyncWorkload.cpp \
     OptimizationViews.cpp \
     TensorHandleFactoryRegistry.cpp \
@@ -25,7 +26,8 @@
     WorkloadFactory.cpp \
     WorkloadUtils.cpp \
     memoryOptimizerStrategyLibrary/strategies/ConstantMemoryStrategy.cpp \
-    memoryOptimizerStrategyLibrary/strategies/StrategyValidator.cpp \
+    memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp \
+    memoryOptimizerStrategyLibrary/strategies/StrategyValidator.cpp
 
 
 # COMMON_TEST_SOURCES contains the list of files to be included
@@ -104,7 +106,8 @@
     test/layerTests/TransposeConvolution2dTestImpl.cpp \
     test/layerTests/UnidirectionalSequenceLstmTestImpl.cpp \
     memoryOptimizerStrategyLibrary/test/ConstMemoryStrategyTests.cpp \
-    memoryOptimizerStrategyLibrary/test/ValidatorStrategyTests.cpp
+    memoryOptimizerStrategyLibrary/test/ValidatorStrategyTests.cpp \
+    memoryOptimizerStrategyLibrary/test/SingleAxisPriorityListTests.cpp
 
 ifeq ($(ARMNN_REF_ENABLED),1)
 COMMON_TEST_SOURCES += \
diff --git a/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp b/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp
index 3afa061..738b713 100644
--- a/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp
+++ b/src/backends/backendsCommon/memoryOptimizerStrategyLibrary/strategies/SingleAxisPriorityList.cpp
@@ -155,9 +155,9 @@
 
         // The indexes don't match we need at least two words
         // Zero the bits to the right of curBlock->m_EndOfLife
-        remainder = (curBlock->m_EndOfLife +1 - lastWordIndex * wordSize);
+        remainder = (curBlock->m_EndOfLife - lastWordIndex * wordSize);
 
-        size_t lastWord = (1u << remainder) - 1;
+        size_t lastWord = (1ul << remainder) - 1;
         lastWord = lastWord << (wordSize - remainder);
 
         if(firstWordIndex + 1 == lastWordIndex)
diff --git a/src/backends/backendsCommon/test/CompatibilityTests.cpp b/src/backends/backendsCommon/test/CompatibilityTests.cpp
index d18a8fb..3685f75 100644
--- a/src/backends/backendsCommon/test/CompatibilityTests.cpp
+++ b/src/backends/backendsCommon/test/CompatibilityTests.cpp
@@ -181,7 +181,7 @@
                           {"ProtectedContentAllocation", false},
                           {"ConstantTensorsAsInputs", true},
                           {"PreImportIOTensors", true},
-                          {"ExternallyManagedMemory", false},
+                          {"ExternallyManagedMemory", true},
                           {"MultiAxisPacking", false}});
 }
 
@@ -200,7 +200,7 @@
                           {"ProtectedContentAllocation", false},
                           {"ConstantTensorsAsInputs", false},
                           {"PreImportIOTensors", false},
-                          {"ExternallyManagedMemory", false},
+                          {"ExternallyManagedMemory", true},
                           {"MultiAxisPacking", false}});
 }
 
@@ -219,7 +219,7 @@
                           {"ProtectedContentAllocation", true},
                           {"ConstantTensorsAsInputs", false},
                           {"PreImportIOTensors", false},
-                          {"ExternallyManagedMemory", false},
+                          {"ExternallyManagedMemory", true},
                           {"MultiAxisPacking", false}});
 }
 
diff --git a/src/backends/backendsCommon/test/MemoryManagerTests.cpp b/src/backends/backendsCommon/test/MemoryManagerTests.cpp
index c873499..662a5c2 100644
--- a/src/backends/backendsCommon/test/MemoryManagerTests.cpp
+++ b/src/backends/backendsCommon/test/MemoryManagerTests.cpp
@@ -59,17 +59,18 @@
 
     // Create mock up bufferStorageVector with 2 BufferStorage with the same TensorMemory
     size_t numTensors = 5;
-    std::vector<TensorMemory*> tensorMemoryPointerVector(numTensors);
-    std::vector<TensorMemory> tensorMemoryVector;
+    std::vector<std::shared_ptr<TensorMemory>> tensorMemoryPointerVector(numTensors);
+    std::vector<std::shared_ptr<TensorMemory>> tensorMemoryVector;
     tensorMemoryVector.reserve(numTensors);
 
     std::vector<size_t> offsets(numTensors);
     std::iota(std::begin(offsets), std::end(offsets), 0);
 
-    for (uint32_t idx = 0; idx < tensorMemoryPointerVector.size(); ++idx)
+    for (uint32_t idx = 0; idx < tensorMemoryPointerVector.size(); ++idx)
     {
-        tensorMemoryVector.emplace_back(TensorMemory{offsets[idx], nullptr, 0});
-        tensorMemoryPointerVector[idx] = &tensorMemoryVector[idx];
+        tensorMemoryVector.emplace_back(std::make_shared<TensorMemory>(TensorMemory{offsets[idx], 0, nullptr}));
+
+        tensorMemoryPointerVector[idx] = tensorMemoryVector[idx];
     }
 
     std::vector<BufferStorage> bufferStorageVector;
@@ -77,30 +78,31 @@
     bufferStorageVector.emplace_back(BufferStorage{tensorMemoryPointerVector, numTensors});
 
     // Create an instance of the SampleCustomAllocator
-    SampleCustomAllocator customAllocator = SampleCustomAllocator();
-    customAllocator.m_Values = {10, 11, 12, 13, 14};
-    // Check that the test was set up correctly
-    CHECK(customAllocator.m_Values.size() == numTensors);
+    std::shared_ptr<SampleCustomAllocator> customAllocator = std::make_shared<SampleCustomAllocator>();
 
+    customAllocator->m_Values = {10, 11, 12, 13, 14};
+    // Check that the test was set up correctly
+    CHECK(customAllocator->m_Values.size() == numTensors);
+
+    size_t bufferVecSize = bufferStorageVector.size();
     // Utilise 3 functions in the MemoryManager. Check the counters and the pointer to the values are correct.
     MemoryManager memoryManager;
-    memoryManager.StoreMemToAllocate(bufferStorageVector, &customAllocator);
+    memoryManager.StoreMemToAllocate(bufferStorageVector, customAllocator);
 
     memoryManager.Allocate();
-    CHECK(customAllocator.m_CounterAllocate == bufferStorageVector.size());
-    for (const auto& bufferStorage : bufferStorageVector)
+    CHECK(customAllocator->m_CounterAllocate == bufferVecSize);
+
+    uint32_t idx = 0;
+    for (auto tensorMemory : tensorMemoryVector)
     {
-        uint32_t idx = 0;
-        for (auto tensorMemory : bufferStorage.m_TensorMemoryVector)
-        {
-            auto value = reinterpret_cast<uint8_t *>(tensorMemory->m_Data);
-            CHECK(customAllocator.m_Values[idx] == *value);
-            idx += 1;
-        }
+        auto value = reinterpret_cast<uint8_t *>(tensorMemory->m_Data);
+        CHECK(customAllocator->m_Values[idx] == *value);
+        idx += 1;
     }
 
     memoryManager.Deallocate();
-    CHECK(customAllocator.m_CounterFree == bufferStorageVector.size());
+    CHECK(customAllocator->m_CounterFree == bufferStorageVector.size());
 }
 }
 
diff --git a/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp b/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp
index 012737e..b0ee9be 100644
--- a/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp
+++ b/src/backends/backendsCommon/test/OptimizedNetworkTests.cpp
@@ -138,7 +138,7 @@
         // the other layers are supported by CpuRef.
         // If NEON is not enabled, all layers are supported by CpuRef.
 #if defined(ARMCOMPUTENEON_ENABLED)
-        if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output)
+        if (layer->GetType() == armnn::LayerType::Output)
         {
             CHECK(layer->GetBackendId() == armnn::Compute::CpuAcc);
         }
@@ -337,7 +337,7 @@
         // the other layers are supported by CpuRef.
         // If neither NEON, nor CL is enabled, all layers are supported by CpuRef.
 #if defined(ARMCOMPUTENEON_ENABLED)
-        if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output)
+        if (layer->GetType() == armnn::LayerType::Output)
         {
             CHECK(layer->GetBackendId() == armnn::Compute::CpuAcc);
         }
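
The relaxed expectation for Input layers is consistent with input layers now taking the backend id of the layer they feed instead of being pinned to CpuAcc. The following standalone sketch illustrates that assignment rule; LayerStub and AssignInputBackends are hypothetical stand-ins, not Arm NN's Graph/Layer API:

    #include <cassert>
    #include <string>
    #include <vector>

    // Hypothetical, simplified stand-in for a layer that carries a backend id.
    struct LayerStub
    {
        std::string m_Type;
        std::string m_BackendId;
        std::vector<LayerStub*> m_Outputs; // layers this one feeds
    };

    // Give each input layer the backend of the first layer it is connected to.
    void AssignInputBackends(std::vector<LayerStub*>& layers)
    {
        for (auto* layer : layers)
        {
            if (layer->m_Type == "Input" && !layer->m_Outputs.empty())
            {
                layer->m_BackendId = layer->m_Outputs.front()->m_BackendId;
            }
        }
    }

    int main()
    {
        LayerStub norm{"Normalization", "CpuRef", {}};
        LayerStub input{"Input", "CpuAcc", {&norm}};
        std::vector<LayerStub*> layers{&input, &norm};

        AssignInputBackends(layers);

        // The input now reports its consumer's backend, which is why the test
        // no longer expects Input layers to be CpuAcc.
        assert(input.m_BackendId == "CpuRef");
        return 0;
    }
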
diff --git a/src/backends/cl/ClBackend.hpp b/src/backends/cl/ClBackend.hpp
index 7597d09..99fe906 100644
--- a/src/backends/cl/ClBackend.hpp
+++ b/src/backends/cl/ClBackend.hpp
@@ -29,7 +29,7 @@
                                                      {"ProtectedContentAllocation", true},
                                                      {"ConstantTensorsAsInputs", false},
                                                      {"PreImportIOTensors", false},
-                                                     {"ExternallyManagedMemory", false},
+                                                     {"ExternallyManagedMemory", true},
                                                      {"MultiAxisPacking", false},
                                                      {"SingleAxisPacking", true}
                                              });
diff --git a/src/backends/neon/NeonBackend.hpp b/src/backends/neon/NeonBackend.hpp
index 68d60a4..e53bacb 100644
--- a/src/backends/neon/NeonBackend.hpp
+++ b/src/backends/neon/NeonBackend.hpp
@@ -10,14 +10,14 @@
 {
 
 // add new capabilities here..
-const BackendCapabilities cpuAccCapabilities("GpuAcc",
+const BackendCapabilities cpuAccCapabilities("CpuAcc",
                                              {
                                                      {"NonConstWeights", false},
                                                      {"AsyncExecution", false},
                                                      {"ProtectedContentAllocation", false},
                                                      {"ConstantTensorsAsInputs", false},
                                                      {"PreImportIOTensors", false},
-                                                     {"ExternallyManagedMemory", false},
+                                                     {"ExternallyManagedMemory", true},
                                                      {"MultiAxisPacking", false},
                                                      {"SingleAxisPacking", true}
                                              });
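
Besides correcting the capability-set name from "GpuAcc" to "CpuAcc", the flipped ExternallyManagedMemory flag is the kind of capability a network loader would query before routing a backend's tensors through an external memory manager. A minimal standalone sketch of such a check follows, with a plain std::map standing in for the real BackendCapabilities type:

    #include <cassert>
    #include <map>
    #include <string>

    // Plain map standing in for a backend's capability set
    // (the real type is BackendCapabilities / BackendOptions).
    using CapabilityMap = std::map<std::string, bool>;

    // True when the named capability is present and enabled.
    bool HasCapabilityStub(const CapabilityMap& caps, const std::string& name)
    {
        auto it = caps.find(name);
        return it != caps.end() && it->second;
    }

    int main()
    {
        const CapabilityMap cpuAccCaps =
        {
            {"NonConstWeights",         false},
            {"PreImportIOTensors",      false},
            {"ExternallyManagedMemory", true},  // flipped to true in this change
        };

        // A loader would only hand a backend's intermediate tensors to an external
        // memory manager when this flag is reported as true.
        assert(HasCapabilityStub(cpuAccCaps, "ExternallyManagedMemory"));
        assert(!HasCapabilityStub(cpuAccCaps, "NonConstWeights"));
        return 0;
    }
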
diff --git a/src/backends/neon/NeonTensorHandle.hpp b/src/backends/neon/NeonTensorHandle.hpp
index ae8aa5d..dd4c257 100644
--- a/src/backends/neon/NeonTensorHandle.hpp
+++ b/src/backends/neon/NeonTensorHandle.hpp
@@ -29,7 +29,8 @@
     NeonTensorHandle(const TensorInfo& tensorInfo)
                      : m_ImportFlags(static_cast<MemorySourceFlags>(MemorySource::Malloc)),
                        m_Imported(false),
-                       m_IsImportEnabled(false)
+                       m_IsImportEnabled(false),
+                       m_TypeAlignment(GetDataTypeSize(tensorInfo.GetDataType()))
     {
         armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo);
     }
@@ -39,7 +40,8 @@
                      MemorySourceFlags importFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc))
                      : m_ImportFlags(importFlags),
                        m_Imported(false),
-                       m_IsImportEnabled(false)
+                       m_IsImportEnabled(false),
+                       m_TypeAlignment(GetDataTypeSize(tensorInfo.GetDataType()))
 
     {
         armnn::armcomputetensorutils::BuildArmComputeTensor(m_Tensor, tensorInfo, dataLayout);
@@ -117,9 +120,7 @@
         {
             if (source == MemorySource::Malloc && m_IsImportEnabled)
             {
-                // Checks the 16 byte memory alignment
-                constexpr uintptr_t alignment = sizeof(size_t);
-                if (reinterpret_cast<uintptr_t>(memory) % alignment)
+                if (reinterpret_cast<uintptr_t>(memory) % m_TypeAlignment)
                 {
                     throw MemoryImportException("NeonTensorHandle::Import Attempting to import unaligned memory");
                 }
@@ -263,6 +264,7 @@
     MemorySourceFlags m_ImportFlags;
     bool m_Imported;
     bool m_IsImportEnabled;
+    const uintptr_t m_TypeAlignment;
 };
 
 class NeonSubTensorHandle : public IAclTensorHandle
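
The import path above now derives its alignment requirement from GetDataTypeSize() instead of a hard-coded sizeof(size_t). The sketch below restates that rule in isolation; DataTypeStub and GetDataTypeSizeStub are illustrative stand-ins for Arm NN's DataType and GetDataTypeSize:

    #include <cassert>
    #include <cstdint>

    // Illustrative stand-ins for DataType / GetDataTypeSize(): bytes per element.
    enum class DataTypeStub { Float32, Float16, QAsymmU8 };

    std::uintptr_t GetDataTypeSizeStub(DataTypeStub type)
    {
        switch (type)
        {
            case DataTypeStub::Float32:  return 4;
            case DataTypeStub::Float16:  return 2;
            case DataTypeStub::QAsymmU8: return 1;
        }
        return 1;
    }

    // True when an imported pointer is suitably aligned for the tensor's data type.
    bool IsAlignedForImport(const void* memory, DataTypeStub type)
    {
        const std::uintptr_t alignment = GetDataTypeSizeStub(type);
        return reinterpret_cast<std::uintptr_t>(memory) % alignment == 0;
    }

    int main()
    {
        alignas(16) static std::uint8_t buffer[32];

        // A 4-byte-aligned pointer is acceptable for Float32 ...
        assert(IsAlignedForImport(buffer + 4, DataTypeStub::Float32));
        // ... an odd offset is not, but it is fine for 1-byte element types.
        assert(!IsAlignedForImport(buffer + 3, DataTypeStub::Float32));
        assert(IsAlignedForImport(buffer + 3, DataTypeStub::QAsymmU8));
        return 0;
    }
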
diff --git a/src/backends/reference/RefBackend.hpp b/src/backends/reference/RefBackend.hpp
index 6114ce6..da04f22 100644
--- a/src/backends/reference/RefBackend.hpp
+++ b/src/backends/reference/RefBackend.hpp
@@ -16,7 +16,7 @@
                                                     {"ProtectedContentAllocation", false},
                                                     {"ConstantTensorsAsInputs", true},
                                                     {"PreImportIOTensors", true},
-                                                    {"ExternallyManagedMemory", false},
+                                                    {"ExternallyManagedMemory", true},
                                                     {"MultiAxisPacking", false},
                                                     {"SingleAxisPacking", true}
                                              });
diff --git a/src/backends/reference/RefTensorHandle.cpp b/src/backends/reference/RefTensorHandle.cpp
index b9e566e..5229e9d 100644
--- a/src/backends/reference/RefTensorHandle.cpp
+++ b/src/backends/reference/RefTensorHandle.cpp
@@ -122,7 +122,7 @@
         if (m_IsImportEnabled && source == MemorySource::Malloc)
         {
             // Check memory alignment
-            constexpr uintptr_t alignment = sizeof(size_t);
+            uintptr_t alignment = GetDataTypeSize(m_TensorInfo.GetDataType());
             if (reinterpret_cast<uintptr_t>(memory) % alignment)
             {
                 if (m_Imported)
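
From the application side, checking alignment against the element size means an imported buffer only has to respect the tensor's data type; over-aligning the allocation keeps an import valid for any element size. A hedged sketch using only standard C++17 facilities (this is not an Arm NN allocation API):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>
    #include <new>

    int main()
    {
        // 16-byte alignment satisfies the per-data-type check for every element
        // size an Arm NN tensor is likely to use (1, 2, 4 or 8 bytes).
        constexpr std::size_t alignment = 16;
        constexpr std::size_t sizeBytes = 1024;

        void* memory = ::operator new(sizeBytes, std::align_val_t(alignment));
        assert(reinterpret_cast<std::uintptr_t>(memory) % alignment == 0);

        // ... this is where the buffer would be handed to ITensorHandle::Import()
        // with MemorySource::Malloc ...

        ::operator delete(memory, std::align_val_t(alignment));
        return 0;
    }
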
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 75008bc..36dcd21 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -113,10 +113,14 @@
 std::unique_ptr<ITensorHandle> RefWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
                                                                       const bool isMemoryManaged) const
 {
-    // For Ref it is okay to make the TensorHandle memory managed as it can also store a pointer
-    // to unmanaged memory. This also ensures memory alignment.
-    IgnoreUnused(isMemoryManaged);
-    return std::make_unique<RefTensorHandle>(tensorInfo, m_MemoryManager);
+    if (isMemoryManaged)
+    {
+        return std::make_unique<RefTensorHandle>(tensorInfo, m_MemoryManager);
+    }
+    else
+    {
+        return std::make_unique<RefTensorHandle>(tensorInfo, static_cast<unsigned int>(MemorySource::Malloc));
+    }
 }
 
 std::unique_ptr<ITensorHandle> RefWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
@@ -126,7 +130,13 @@
-    // For Ref it is okay to make the TensorHandle memory managed as it can also store a pointer
-    // to unmanaged memory. This also ensures memory alignment.
-    IgnoreUnused(isMemoryManaged, dataLayout);
+    IgnoreUnused(dataLayout);
-    return std::make_unique<RefTensorHandle>(tensorInfo, m_MemoryManager);
+
+    if (isMemoryManaged)
+    {
+        return std::make_unique<RefTensorHandle>(tensorInfo, m_MemoryManager);
+    }
+    else
+    {
+        return std::make_unique<RefTensorHandle>(tensorInfo, static_cast<unsigned int>(MemorySource::Malloc));
+    }
 }
 
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor,
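
A condensed standalone sketch of the decision CreateTensorHandle now makes: memory-managed handles keep using the internal memory manager, while unmanaged ones are constructed with Malloc import flags. HandleStub and CreateTensorHandleStub are stand-ins, not the real RefTensorHandle constructors:

    #include <cassert>
    #include <memory>
    #include <string>

    // Stand-in for the two construction modes of RefTensorHandle.
    struct HandleStub
    {
        std::string m_Mode;
    };

    // Mirrors the new factory behaviour: managed handles use the internal memory
    // manager; unmanaged ones are built with Malloc import flags instead.
    std::unique_ptr<HandleStub> CreateTensorHandleStub(bool isMemoryManaged)
    {
        if (isMemoryManaged)
        {
            return std::make_unique<HandleStub>(HandleStub{"memory-managed"});
        }
        return std::make_unique<HandleStub>(HandleStub{"import-ready (Malloc)"});
    }

    int main()
    {
        assert(CreateTensorHandleStub(true)->m_Mode == "memory-managed");
        assert(CreateTensorHandleStub(false)->m_Mode == "import-ready (Malloc)");
        return 0;
    }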