//
// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "LoadedNetwork.hpp"
#include "Layer.hpp"
#include "Graph.hpp"
#include "Profiling.hpp"
#include "HeapProfiling.hpp"
#include "WorkingMemHandle.hpp"
#include <armnn/BackendHelper.hpp>
#include <armnn/BackendRegistry.hpp>
#include <armnn/Logging.hpp>
#include <armnn/backends/TensorHandle.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <armnn/backends/MemCopyWorkload.hpp>
#include <armnn/profiling/ArmNNProfiling.hpp>
#include <armnn/utility/Assert.hpp>
#include <backendsCommon/MemSyncWorkload.hpp>
#include <common/include/Processes.hpp>
#include <fmt/format.h>
namespace armnn
{
using namespace std;
using namespace arm::pipe;
namespace
{
template <typename ExceptionType>
std::string ToErrorMessage(const char * prefix, const ExceptionType & error)
{
std::stringstream ss;
ss << prefix << " " << error.what();
return ss.str();
}
void AddLayerStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
const Layer& layer,
ProfilingGuid networkGuid)
{
// Add layer to the post-optimisation network structure
std::string layerName = layer.GetNameStr().empty() ? "<Unnamed>" : layer.GetNameStr();
timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(),
networkGuid,
layerName,
LabelsAndEventClasses::LAYER_GUID);
for (auto&& input : layer.GetInputSlots())
{
const IOutputSlot* source = input.GetConnectedOutputSlot();
ARMNN_ASSERT(source != nullptr);
timelineUtils->CreateConnectionRelationship(ProfilingRelationshipType::RetentionLink,
source->GetOwningLayerGuid(),
layer.GetGuid());
}
}
void AddWorkloadStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
std::unique_ptr<IWorkload>& workload,
const Layer& layer)
{
// Add workload to the post-optimisation network structure
timelineUtils->CreateTypedEntity(workload->GetGuid(), LabelsAndEventClasses::WORKLOAD_GUID);
timelineUtils->MarkEntityWithLabel(workload->GetGuid(),
layer.GetBackendId().Get(),
LabelsAndEventClasses::BACKENDID_GUID);
// Link the workload to the layer
timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
layer.GetGuid(),
workload->GetGuid(),
LabelsAndEventClasses::CHILD_GUID);
}
} // anonymous
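// Factory method: builds a LoadedNetwork from an optimized network, converting any
// construction-time exception into a logged error message and a null return value.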
std::unique_ptr<LoadedNetwork> LoadedNetwork::MakeLoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
std::string& errorMessage,
const INetworkProperties& networkProperties,
arm::pipe::IProfilingService* profilingService)
{
std::unique_ptr<LoadedNetwork> loadedNetwork;
auto Fail = [&](const std::exception& error) -> std::unique_ptr<LoadedNetwork>
{
errorMessage = ToErrorMessage("An error occurred when preparing the network workloads:", error);
ARMNN_LOG(error) << errorMessage;
return std::unique_ptr<LoadedNetwork>();
};
try
{
loadedNetwork.reset(new LoadedNetwork(std::move(net), networkProperties, profilingService));
}
catch (const armnn::RuntimeException& error)
{
return Fail(error);
}
catch (const armnn::Exception& error)
{
return Fail(error);
}
catch (const std::runtime_error& error)
{
return Fail(error);
}
return loadedNetwork;
}
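// The constructor does the heavy lifting of loading: it creates backends and workload
// factories, creates tensor handles for every layer, turns layers into workloads,
// pre-creates import handles where possible and sets up the chosen memory strategy.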
LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
const INetworkProperties& networkProperties,
arm::pipe::IProfilingService* profilingService) :
m_OptimizedNetwork(std::move(net)),
m_NetworkProperties(networkProperties),
m_TensorHandleFactoryRegistry(),
m_ProfilingService(profilingService)
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadedNetwork");
// Get the profiler and register it for the current thread.
const std::shared_ptr<IProfiler>& profiler = m_OptimizedNetwork->GetProfiler();
ProfilerManager::GetInstance().RegisterProfiler(profiler.get());
profiler->EnableProfiling(networkProperties.m_ProfilingEnabled);
profiler->EnableNetworkDetailsToStdOut(networkProperties.m_OutputNetworkDetailsMethod);
// First create the tensor handles, backends and workload factories.
// Handles are created before workloads because workload creation can modify
// some of the handles, for example those of the splitter and concat layers.
bool useExternalMemoryManager = false;
bool useInternalMemoryManager = false;
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
// Ensure topological order
order.SetLayersOutOfOrder();
order.TopologicalSort();
if (!networkProperties.m_AsyncEnabled)
{
m_IsInputImported = std::vector<bool>(order.GetNumInputs(), false);
m_IsOutputImported = std::vector<bool>(order.GetNumOutputs(), false);
}
for (auto&& layer : order)
{
auto const& backendId = layer->GetBackendId();
if (m_Backends.count(backendId) == 0)
{
auto createBackend = BackendRegistryInstance().GetFactory(backendId);
auto it = m_Backends.emplace(std::make_pair(backendId, createBackend()));
IBackendInternal* backend = it.first->second.get();
if (networkProperties.m_AsyncEnabled &&
!HasCapability(BackendOptions::BackendOption{"AsyncExecution", true}, backend->GetCapabilities()))
{
std::string er = backend->GetId();
er += " does not support AsyncExecution";
throw BackendCapabilityException(er);
}
if (networkProperties.m_AsyncEnabled &&
!HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},
backend->GetCapabilities()))
{
std::string er = backend->GetId();
er += " does not support ExternallyManagedMemory\n";
er += "AsyncEnabled networks require all backends to support ExternallyManagedMemory";
throw BackendCapabilityException(er);
}
if (HasCapability(BackendOptions::BackendOption{"ExternallyManagedMemory", true},
backend->GetCapabilities())
&& (m_NetworkProperties.m_ExternalMemoryManagementEnabled || m_NetworkProperties.m_AsyncEnabled))
{
m_SupportsExternallyManagedMemory[backend->GetId()] = true;
useExternalMemoryManager = true;
}
else
{
m_SupportsExternallyManagedMemory[backend->GetId()] = false;
useInternalMemoryManager = true;
}
IBackendInternal::IWorkloadFactoryPtr workloadFactory;
if (backend->SupportsTensorAllocatorAPI())
{
workloadFactory = backend->CreateWorkloadFactory(
m_TensorHandleFactoryRegistry,
m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(),
static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
}
else
{
m_BackendMemoryMangers.emplace_back(backend->CreateMemoryManager());
workloadFactory = backend->CreateWorkloadFactory(
m_BackendMemoryMangers.back(), m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());
}
m_WorkloadFactories[backendId] = std::move(workloadFactory);
}
}
if (!networkProperties.m_AsyncEnabled)
{
for (auto&& layer : order)
{
auto& workloadFactory = GetWorkloadFactory(*layer);
bool supportsExternalManager = m_SupportsExternallyManagedMemory[layer->GetBackendId()];
switch (layer->GetType())
{
case LayerType::Input:
case LayerType::MemImport:
{
// If import is enabled (m_ImportEnabled) then IsMemoryManaged
// must be set to false when creating TensorHandles
layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
workloadFactory,
!supportsExternalManager && !m_NetworkProperties.m_ImportEnabled);
break;
}
case LayerType::Constant:
{
layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory, true);
break;
}
default:
{
// Look for a layer with 1 OutputSlot which has 1 connection and that connection is an Output Layer
// If Export is enabled disable memory management so we can export, otherwise we do a copy
if ((layer->GetNumOutputSlots() == 1) &&
(layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
(layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
{
layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
workloadFactory,
!supportsExternalManager && !m_NetworkProperties.m_ExportEnabled);
}
else
{
layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
workloadFactory,
!supportsExternalManager);
}
}
}
}
}
ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
std::unique_ptr<TimelineUtilityMethods> timelineUtils =
TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
if (timelineUtils)
{
timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);
// Mark the network with a start of life event
timelineUtils->RecordEvent(networkGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
// and with the process ID
int processID = arm::pipe::GetCurrentProcessId();
std::stringstream ss;
ss << processID;
timelineUtils->MarkEntityWithLabel(networkGuid, ss.str(), LabelsAndEventClasses::PROCESS_ID_GUID);
}
std::vector<IWorkload*> ConstWorkloads;
// Then create workloads.
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_CreateWorkloads");
for (auto&& layer: order)
{
if (timelineUtils)
{
// Add layer to the post-optimisation network structure
AddLayerStructure(timelineUtils, *layer, networkGuid);
}
const IWorkloadFactory& workloadFactory = GetWorkloadFactory(*layer);
switch (layer->GetType())
{
case LayerType::Input:
case LayerType::Output:
{
// Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
break;
}
default:
{
auto workload = layer->CreateWorkload(workloadFactory);
if (!workload)
{
const char* const layerName =
layer->GetNameStr().length() != 0 ? layer->GetName() : "<Unnamed>";
throw InvalidArgumentException(
fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')",
layerName, static_cast<int>(layer->GetType()), layer->GetBackendId().Get()
));
}
if (timelineUtils)
{
// Add workload to the post-optimisation network structure
AddWorkloadStructure(timelineUtils, workload, *layer);
}
// For async networks ConstantWorkloads are managed exclusively by LoadedNetwork
// and are separated out from the other workloads
if((networkProperties.m_AsyncEnabled || useExternalMemoryManager) &&
layer->GetType() == LayerType::Constant)
{
m_ConstantTensorHandles[layer->GetGuid()] =
layer->GetOutputSlot(0).GetOutputHandler().GetData();
m_ConstantWorkloads[layer->GetGuid()] = std::move(workload);
}
else
{
m_WorkloadQueue.push_back(std::move(workload));
if (layer->GetType() == LayerType::Constant)
{
// Place the Constant Workloads into a queue so that they can be executed first
ConstWorkloads.push_back(m_WorkloadQueue.back().get());
}
}
// Release the constant data in the layer.
layer->ReleaseConstantData();
break;
}
}
}
}
// Gather information about workloads for inputs & outputs
if (!networkProperties.m_AsyncEnabled && !m_WorkloadQueue.empty())
{
const int noOfInputs = armnn::numeric_cast<int>(order.GetNumInputs());
// Get indices of all workloads connected to each input and
// check if they support tensor handle replacement
for (const BindableLayer* layer: order.GetInputLayers())
{
const auto bindingId = layer->GetBindingId();
bool supportsReplacement = true;
for (const auto inputSlot: layer->GetOutputSlot(0).GetConnections())
{
auto workloadIndex = std::distance(order.begin(), order.GetPosInGraph(inputSlot->GetOwningLayer()));
workloadIndex -= noOfInputs;
m_InputWorkloadSlotPairs[bindingId].emplace_back(WorkloadIndices{
armnn::numeric_cast<unsigned int>(workloadIndex), inputSlot->GetSlotIndex()});
auto workload = m_WorkloadQueue[m_InputWorkloadSlotPairs[bindingId].back().m_WorkloadIndex].get();
supportsReplacement &= workload->SupportsTensorHandleReplacement();
}
ITensorHandleFactory::FactoryId factoryId = layer->GetOutputSlot(0).GetTensorHandleFactoryId();
// Get matching import factory Id
ITensorHandleFactory::FactoryId importFactoryId =
m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
ITensorHandleFactory *importFactory = m_TensorHandleFactoryRegistry.GetFactory(importFactoryId);
if (supportsReplacement && importFactory)
{
m_PreImportedInputHandles.emplace_back(
bindingId, importFactory->CreateTensorHandle(layer->GetOutputSlot(0).GetTensorInfo(), false));
}
else
{
m_PreImportedInputHandles.emplace_back(bindingId, nullptr);
}
}
// Get indices of all workloads connected to each output and
// check if they support tensor handle replacement
for (const BindableLayer* layer: order.GetOutputLayers())
{
const auto bindingId = layer->GetBindingId();
const auto outputSlot = layer->GetInputSlot(0).GetConnectedOutputSlot();
auto& indices = m_OutputWorkloadSlotPairs[bindingId];
auto workloadIndex = std::distance(order.begin(), order.GetPosInGraph(outputSlot->GetOwningLayer()));
workloadIndex -= noOfInputs;
indices.m_OutputSlotIndices = WorkloadIndices{numeric_cast<unsigned int>(workloadIndex),
outputSlot->CalculateIndexOnOwner()};
bool supportsReplacement = true;
auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
supportsReplacement &= outputWorkload->SupportsTensorHandleReplacement();
for (auto &inputSlot: outputSlot->GetConnections())
{
if(inputSlot->GetOwningLayer().GetType() != LayerType::Output)
{
auto inWorkloadIndex = std::distance(order.begin(),
order.GetPosInGraph(inputSlot->GetOwningLayer()));
inWorkloadIndex -= noOfInputs;
indices.m_InputSlotIndices.emplace_back(WorkloadIndices{numeric_cast<unsigned int>(inWorkloadIndex),
inputSlot->GetSlotIndex()});
auto inputWorkload = m_WorkloadQueue[indices.m_InputSlotIndices.back().m_WorkloadIndex].get();
supportsReplacement &= inputWorkload->SupportsTensorHandleReplacement();
}
}
ITensorHandleFactory::FactoryId factoryId = outputSlot->GetTensorHandleFactoryId();
// Get matching import factory Id
ITensorHandleFactory::FactoryId importFactoryId =
m_TensorHandleFactoryRegistry.GetMatchingImportFactoryId(factoryId);
ITensorHandleFactory *importFactory = m_TensorHandleFactoryRegistry.GetFactory(importFactoryId);
if (supportsReplacement && importFactory)
{
m_PreImportedOutputHandles.emplace_back(
bindingId, importFactory->CreateTensorHandle(outputSlot->GetTensorInfo(), false));
}
else
{
m_PreImportedOutputHandles.emplace_back(bindingId, nullptr);
}
}
}
for (auto&& workloadFactory : m_WorkloadFactories)
{
workloadFactory.second->AfterWorkloadsCreated();
}
if (timelineUtils)
{
// Commit to send the post-optimisation network structure
timelineUtils->Commit();
}
if (useExternalMemoryManager)
{
if (networkProperties.m_AsyncEnabled)
{
CreateMemoryProfileAsync();
}
else
{
CreateMemoryProfile();
}
auto backendStrategyMap = BackendRegistryInstance().GetMemoryOptimizerStrategies();
for (auto& backendMemoryProfile : m_MemBlockMap)
{
const BackendId& backendId = backendMemoryProfile.first;
if (backendStrategyMap.find(backendId) != backendStrategyMap.end())
{
m_MemBinMap[backendId] = backendStrategyMap[backendId]->Optimize(backendMemoryProfile.second);
}
else
{
m_MemBinMap[backendId] = m_ConstantStrategy->Optimize(backendMemoryProfile.second);
}
}
if (!networkProperties.m_AsyncEnabled)
{
m_ExternalMemoryManager = CreateExternalMemoryManger(m_TensorMemory);
// Sort m_TensorMemory so that its order matches m_Tensorhandles
std::sort(m_TensorMemory.begin(), m_TensorMemory.end(),
[](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
{
return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
});
}
}
// Now that the intermediate tensor memory has been set-up,
// do any post allocation configuration for each workload.
if (!networkProperties.m_AsyncEnabled)
{
if (useInternalMemoryManager)
{
// Set up memory.
m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();
}
for (auto &workload : m_WorkloadQueue)
{
workload->PostAllocationConfigure();
}
}
if (useExternalMemoryManager)
{
if (!networkProperties.m_AsyncEnabled)
{
AllocateAndExecuteConstantWorkloads();
}
else
{
AllocateAndExecuteConstantWorkloadsAsync();
}
}
// If synchronous, execute all constant layer workloads
if (!networkProperties.m_AsyncEnabled)
{
for (auto workload: ConstWorkloads)
{
workload->Execute();
}
}
}
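// Synchronous path: allocate each constant tensor handle and execute its workload
// once, so the constant data is materialised before the first inference.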
void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
for (auto& pair : m_ConstantWorkloads)
{
auto tensorHandle = m_ConstantTensorHandles[pair.first];
tensorHandle->Allocate();
pair.second->Execute();
}
}
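// Asynchronous path: constant layers are excluded from the regular workload queue,
// so their tensor handles are created, allocated and executed once here.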
void LoadedNetwork::AllocateAndExecuteConstantWorkloadsAsync()
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_AllocateAndExecuteConstants");
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
for (auto&& layer : order)
{
if (layer->GetType() == LayerType::Constant)
{
const auto& outSlot = layer->GetOutputSlots()[0];
const auto factoryId = outSlot.GetTensorHandleFactoryId();
ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
auto& workloadFactory = GetWorkloadFactory(*layer);
layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
ITensorHandle* tensorHandle = outSlot.GetOutputHandler().GetData();
m_ConstantTensorHandles[layer->GetGuid()] = tensorHandle;
tensorHandle->Allocate();
WorkingMemDescriptor memDesc;
memDesc.m_Outputs.push_back(tensorHandle);
m_ConstantWorkloads[layer->GetGuid()]->ExecuteAsync(memDesc);
}
}
}
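// Publishes the post-optimisation network structure (layers, workloads and their
// relationships) to the profiling service as timeline events.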
void LoadedNetwork::SendNetworkStructure(arm::pipe::IProfilingService& profilingService)
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "LoadNetwork_SendNetworkStructure");
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
std::unique_ptr<TimelineUtilityMethods> timelineUtils =
TimelineUtilityMethods::GetTimelineUtils(profilingService);
timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);
for (auto&& layer : order)
{
// Add layer to the post-optimisation network structure
AddLayerStructure(timelineUtils, *layer, networkGuid);
switch (layer->GetType())
{
case LayerType::Input:
case LayerType::Output:
{
// Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput().
break;
}
default:
{
for (auto& workload : m_WorkloadQueue)
{
// Add workload to the post-optimisation network structure
AddWorkloadStructure(timelineUtils, workload, *layer);
}
break;
}
}
}
// Commit to send the post-optimisation network structure
timelineUtils->Commit();
}
ProfilingGuid LoadedNetwork::GetNetworkGuid()
{
return m_OptimizedNetwork->GetGuid();
}
TensorInfo LoadedNetwork::GetInputTensorInfo(LayerBindingId layerId) const
{
for (auto&& inputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetInputLayers())
{
ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot");
if (inputLayer->GetBindingId() == layerId)
{
return inputLayer->GetOutputSlot(0).GetTensorInfo();
}
}
throw InvalidArgumentException(fmt::format("No input layer is associated with id {}", layerId));
}
TensorInfo LoadedNetwork::GetOutputTensorInfo(LayerBindingId layerId) const
{
for (auto&& outputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetOutputLayers())
{
ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot");
ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected");
if (outputLayer->GetBindingId() == layerId)
{
return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo();
}
}
throw InvalidArgumentException(fmt::format("No output layer is associated with id {}", layerId));
}
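// Looks up the workload factory registered for the layer's backend.
// Throws if no factory exists for that backend.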
const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) const
{
const IWorkloadFactory* workloadFactory = nullptr;
auto it = m_WorkloadFactories.find(layer.GetBackendId());
if (it == m_WorkloadFactories.end())
{
throw RuntimeException(fmt::format("No workload factory for {0} to be used for layer: {1}",
layer.GetBackendId().Get(),
layer.GetNameStr()),
CHECK_LOCATION());
}
workloadFactory = it->second.get();
ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");
std::string reasonIfUnsupported;
ARMNN_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer,
{},
reasonIfUnsupported,
m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions()),
"Factory does not support layer");
IgnoreUnused(reasonIfUnsupported);
return *workloadFactory;
}
namespace {
// Non-copyable class owning accelerator-specific tensor data.
class TensorPin
{
public:
TensorPin(std::unique_ptr<ITensorHandle> handle, const TensorInfo& info, LayerBindingId id)
: m_TensorHandle(std::move(handle))
, m_TensorInfo(info)
, m_Id(id)
{
}
ITensorHandle* GetTensorHandle() const { return m_TensorHandle.get(); }
const TensorInfo& GetTensorInfo() const { return m_TensorInfo; }
LayerBindingId GetBindingId() const { return m_Id; }
private:
std::unique_ptr<ITensorHandle> m_TensorHandle;
TensorInfo m_TensorInfo;
LayerBindingId m_Id;
};
static const TensorPin& GetTensorPin(LayerBindingId id,
const std::vector<TensorPin>& pins,
char const* bindingPointDesc)
{
auto it = std::find_if(pins.begin(), pins.end(),
[id](const TensorPin& pin)
{
return pin.GetBindingId() == id;
});
if (it != pins.end())
{
return *it;
}
else
{
throw InvalidArgumentException(fmt::format("No tensor supplied for {0} {1}", bindingPointDesc, id));
}
}
// Stores data that needs to be kept accessible for the entire execution of a workload.
class WorkloadData
{
public:
WorkloadData(const InputTensors& inputTensors, const OutputTensors& outputTensors)
{
m_InputTensorPins.reserve(inputTensors.size());
m_OutputTensorPins.reserve(outputTensors.size());
for (auto inputTensorPair : inputTensors)
{
auto inputTensor = inputTensorPair.second;
std::unique_ptr<ITensorHandle> tensorHandle =
std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(),inputTensor.GetMemoryArea());
LayerBindingId layerId = inputTensorPair.first;
m_InputTensorPins.emplace_back(std::move(tensorHandle), inputTensor.GetInfo(), layerId);
}
for (auto outputTensorPair : outputTensors)
{
auto outputTensor = outputTensorPair.second;
std::unique_ptr<ITensorHandle> tensorHandle =
std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
LayerBindingId layerId = outputTensorPair.first;
m_OutputTensorPins.emplace_back(std::move(tensorHandle), outputTensor.GetInfo(), layerId);
}
}
const TensorPin& GetInputTensorPin(LayerBindingId id) const
{
return GetTensorPin(id, m_InputTensorPins, "input");
}
const TensorPin& GetOutputTensorPin(LayerBindingId id) const
{
return GetTensorPin(id, m_OutputTensorPins, "output");
}
private:
std::vector<TensorPin> m_InputTensorPins;
std::vector<TensorPin> m_OutputTensorPins;
};
}
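// Runs a synchronous inference: inputs and outputs are bound (imported or copied),
// then the input, workload and output queues are executed in order.
//
// Client-side usage sketch (binding ids and tensor infos are illustrative):
//   armnn::InputTensors inputs{{0, armnn::ConstTensor(inputInfo, inputData.data())}};
//   armnn::OutputTensors outputs{{0, armnn::Tensor(outputInfo, outputData.data())}};
//   runtime->EnqueueWorkload(networkId, inputs, outputs);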
Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors,
const OutputTensors& outputTensors,
std::vector<ImportedInputId> preImportedInputIds,
std::vector<ImportedOutputId> preImportedOutputIds)
{
const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
// Sanity check: a runnable graph needs at least an input and an output layer.
if (graph.GetNumLayers() < 2)
{
ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload(): graph has fewer than two layers, nothing to execute";
return Status::Failure;
}
// Data that must be kept alive for the entire execution of the workload.
WorkloadData workloadData(inputTensors, outputTensors);
if (graph.GetNumInputs() != inputTensors.size())
{
throw InvalidArgumentException("Number of inputs provided does not match network.");
}
// For each input to the network, call EnqueueInput with the data passed by the user.
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
m_InputQueue.clear();
m_InputQueue.reserve(graph.GetNumInputs());
if (preImportedInputIds.size() > graph.GetNumInputs())
{
throw InvalidArgumentException("Invalid number of preImportedInputIds");
}
unsigned int inputIndex = 0;
unsigned int importedInputIdIndex = 0;
std::sort(preImportedInputIds.begin(), preImportedInputIds.end());
for (const BindableLayer* inputLayer : graph.GetInputLayers())
{
if (importedInputIdIndex < preImportedInputIds.size() &&
inputIndex == preImportedInputIds[importedInputIdIndex])
{
// Only replace tensor handles if they have not already been replaced
if (!m_IsInputImported[inputIndex])
{
auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get();
for (const auto& workloadInfo: m_InputWorkloadSlotPairs[inputLayer->GetBindingId()])
{
auto workload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
workload->ReplaceInputTensorHandle(outputTensorHandle, workloadInfo.m_SlotIndex);
}
m_IsInputImported[inputIndex] = true;
}
importedInputIdIndex++;
}
else
{
if (m_IsInputImported[inputIndex])
{
OutputHandler& handler = const_cast<OutputHandler&>(inputLayer->GetOutputHandler(0));
for (const auto& workloadInfo: m_InputWorkloadSlotPairs[inputLayer->GetBindingId()])
{
auto workload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
workload->ReplaceInputTensorHandle(handler.GetData(), workloadInfo.m_SlotIndex);
}
m_IsInputImported[inputIndex] = false;
}
// The input tensor handle was not pre-imported, so enqueue the input normally
const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
}
inputIndex++;
}
}
// For each output to the network, call EnqueueOutput with the data passed by the user.
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
m_OutputQueue.clear();
m_OutputQueue.reserve(graph.GetNumOutputs());
if (preImportedOutputIds.size() > graph.GetNumOutputs())
{
throw InvalidArgumentException("Invalid number of preImportedOutputIds");
}
unsigned int outputIndex = 0;
unsigned int importedOutputIdIndex = 0;
std::sort(preImportedOutputIds.begin(), preImportedOutputIds.end());
for (const BindableLayer* outputLayer : graph.GetOutputLayers())
{
if (importedOutputIdIndex < preImportedOutputIds.size() &&
outputIndex == preImportedOutputIds[importedOutputIdIndex])
{
// Only replace tensor handles if they have not already been replaced
ITensorHandle* inputTensorHandle = m_PreImportedOutputHandles[outputIndex].m_TensorHandle.get();
if (!m_IsOutputImported[outputIndex])
{
const auto bindingId = outputLayer->GetBindingId();
const auto& indices = m_OutputWorkloadSlotPairs[bindingId];
auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
outputWorkload->ReplaceOutputTensorHandle(inputTensorHandle,
indices.m_OutputSlotIndices.m_SlotIndex);
for (const auto& workloadInfo: indices.m_InputSlotIndices)
{
auto inputWorkload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
inputWorkload->ReplaceInputTensorHandle(inputTensorHandle, workloadInfo.m_SlotIndex);
}
m_IsOutputImported[outputIndex] = true;
}
ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
MemSyncQueueDescriptor syncDesc;
syncDesc.m_Inputs.push_back(inputTensorHandle);
WorkloadInfo info;
info.m_InputTensorInfos.push_back(
outputLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo());
auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
m_OutputQueue.push_back(std::move(syncWorkload));
importedOutputIdIndex++;
}
else
{
if (m_IsOutputImported[outputIndex])
{
const auto bindingId = outputLayer->GetBindingId();
const auto& indices = m_OutputWorkloadSlotPairs[bindingId];
auto outputWorkload = m_WorkloadQueue[indices.m_OutputSlotIndices.m_WorkloadIndex].get();
const OutputHandler& outputHandler =
outputLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOutputHandler();
outputWorkload->ReplaceOutputTensorHandle(
outputHandler.GetData(), indices.m_OutputSlotIndices.m_SlotIndex);
for (const auto& workloadInfo: indices.m_InputSlotIndices)
{
auto inputWorkload = m_WorkloadQueue[workloadInfo.m_WorkloadIndex].get();
inputWorkload->ReplaceInputTensorHandle(outputHandler.GetData(), workloadInfo.m_SlotIndex);
}
m_IsOutputImported[outputIndex] = false;
}
const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
// The output tensor handle was not pre-imported, so enqueue the output normally
EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
}
outputIndex++;
}
}
std::unique_ptr<TimelineUtilityMethods> timelineUtils =
TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
ProfilingGuid inferenceGuid = m_ProfilingService->GetNextGuid();
if (timelineUtils)
{
// Add inference timeline trace if profiling is enabled.
ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
timelineUtils->CreateTypedEntity(inferenceGuid, LabelsAndEventClasses::INFERENCE_GUID);
timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
networkGuid,
inferenceGuid,
LabelsAndEventClasses::EXECUTION_OF_GUID);
timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
}
bool executionSucceeded = true;
{
if (m_ProfilingService->IsProfilingEnabled())
{
m_ProfilingService->IncrementCounterValue(INFERENCES_RUN);
}
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Execute");
ARMNN_SCOPED_HEAP_PROFILING("Executing");
executionSucceeded = Execute(timelineUtils, inferenceGuid);
}
if (timelineUtils)
{
// Add end of life of the inference timeline if profiling is enabled.
timelineUtils->RecordEvent(inferenceGuid, LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
timelineUtils->Commit();
}
return executionSucceeded ? Status::Success : Status::Failure;
}
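// Binds one user input tensor for synchronous execution: imports the user memory into
// the input layer's output handle when possible, otherwise queues a memcopy workload.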
void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
if (layer.GetType() != LayerType::Input)
{
throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
}
if (tensorHandle == nullptr)
{
throw InvalidArgumentException("EnqueueInput: tensorHandle must not be NULL");
}
InputQueueDescriptor inputQueueDescriptor;
WorkloadInfo info;
inputQueueDescriptor.m_Inputs.push_back(tensorHandle);
info.m_InputTensorInfos.push_back(tensorInfo);
ARMNN_ASSERT_MSG(layer.GetNumOutputSlots() == 1, "Can only handle Input Layer with one output");
const OutputHandler& handler = layer.GetOutputHandler();
const TensorInfo& outputTensorInfo = handler.GetTensorInfo();
ITensorHandle* outputTensorHandle = handler.GetData();
ARMNN_ASSERT_MSG(outputTensorHandle != nullptr,
"Data should have been allocated.");
inputQueueDescriptor.m_Outputs.push_back(outputTensorHandle);
info.m_OutputTensorInfos.push_back(outputTensorInfo);
MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
bool needMemCopy = true;
if (m_NetworkProperties.m_ImportEnabled) // Try to import the input tensor
{
if(CheckFlag(importFlags, m_NetworkProperties.m_InputSource))
{
needMemCopy = false;
// This assumes a CPU Tensor handle
void* mem = tensorHandle->Map(false);
if (outputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource))
{
tensorHandle->Unmap();
return; // No need for a workload since the import has been done.
}
tensorHandle->Unmap();
throw MemoryImportException("EnqueueInput: Memory Import failed");
}
}
if (needMemCopy)
{
// Create a mem copy workload for input since we did not import
std::unique_ptr<IWorkload> inputWorkload = std::make_unique<CopyMemGenericWorkload>(inputQueueDescriptor, info);
ARMNN_ASSERT_MSG(inputWorkload, "No input workload created");
std::unique_ptr<TimelineUtilityMethods> timelineUtils =
TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
if (timelineUtils)
{
// Add Input Workload to the post-optimisation network structure
AddWorkloadStructure(timelineUtils, inputWorkload, layer);
timelineUtils->Commit();
}
m_InputQueue.push_back(std::move(inputWorkload));
}
}
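// Binds one user output tensor for synchronous execution: exports into the user memory
// and queues a sync workload when possible, otherwise queues a memcopy workload.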
void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
if (layer.GetType() != LayerType::Output)
{
throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
}
if (tensorHandle == nullptr)
{
throw InvalidArgumentException("EnqueueOutput: tensorHandle must not be NULL");
}
OutputQueueDescriptor outputQueueDescriptor;
WorkloadInfo info;
outputQueueDescriptor.m_Outputs.push_back(tensorHandle);
info.m_OutputTensorInfos.push_back(tensorInfo);
ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");
// Gets the output handler from the previous node.
const OutputHandler& outputHandler = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();
const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
ITensorHandle* inputTensorHandle = outputHandler.GetData();
ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");
// Try to import the output tensor.
// Note: We can only import the output pointer if all of the following hold true:
// a) The imported pointer is aligned sufficiently
// b) The tensor has zero padding
// c) There is only one connection to the OutputSlot and it is to an OutputLayer.
// d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
// e) m_ExportEnabled must be set to true
bool needMemCopy = true;
if (m_NetworkProperties.m_ExportEnabled &&
(layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
{
if(layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
{
MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
{
needMemCopy = false;
void *mem = tensorHandle->Map(false);
bool importOk = inputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
tensorHandle->Unmap();
if (importOk)
{
// Insert synchronization workload
MemSyncQueueDescriptor syncDesc;
syncDesc.m_Inputs.push_back(inputTensorHandle);
info.m_InputTensorInfos.push_back(inputTensorInfo);
auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
m_OutputQueue.push_back(std::move(syncWorkload));
}
else
{
throw MemoryExportException("EnqueueOutput: Memory Export failed");
}
}
}
}
if (needMemCopy)
{
// If we got here then we didn't export the memory, so add an output workload which performs a memcopy.
outputQueueDescriptor.m_Inputs.push_back(inputTensorHandle);
info.m_InputTensorInfos.push_back(inputTensorInfo);
std::unique_ptr<IWorkload> outputWorkload =
std::make_unique<CopyMemGenericWorkload>(outputQueueDescriptor, info);
ARMNN_ASSERT_MSG(outputWorkload, "No output workload created");
std::unique_ptr<TimelineUtilityMethods> timelineUtils =
TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
if (timelineUtils)
{
// Add Output Workload to the post-optimisation network structure
AddWorkloadStructure(timelineUtils, outputWorkload, layer);
timelineUtils->Commit();
}
m_OutputQueue.push_back(std::move(outputWorkload));
}
}
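// Acquires the intermediate (working) memory from every memory manager and imports any
// externally managed tensor memory. Idempotent until FreeWorkingMemory is called.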
void LoadedNetwork::AllocateWorkingMemory(
#if !defined(ARMNN_DISABLE_THREADS)
std::lock_guard<std::mutex>& lock
#endif
)
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Working Memory Allocation");
#if !defined(ARMNN_DISABLE_THREADS)
// This unused parameter ensures the function can only be called while holding a valid lock
IgnoreUnused(lock);
#endif
if (m_IsWorkingMemAllocated)
{
return;
}
if (m_ExternalMemoryManager)
{
m_ExternalMemoryManager->Allocate();
for (unsigned int i = 0; i < m_TensorMemory.size(); ++i)
{
m_Tensorhandles[i]->Import(m_TensorMemory[i].first->m_Data, m_TensorMemory[i].second);
}
}
for (auto&& memoryManager : m_BackendMemoryMangers)
{
if (memoryManager)
{
memoryManager->Acquire();
}
}
m_TensorHandleFactoryRegistry.AquireMemory();
m_IsWorkingMemAllocated = true;
}
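// Releases the working memory acquired by AllocateWorkingMemory; a no-op when no
// working memory is currently allocated.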
void LoadedNetwork::FreeWorkingMemory()
{
#if !defined(ARMNN_DISABLE_THREADS)
std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
#endif
if (!m_IsWorkingMemAllocated)
{
return;
}
if (m_ExternalMemoryManager)
{
m_ExternalMemoryManager->Deallocate();
}
// Inform each memory manager to release the memory in its respective memory group
for (auto&& memoryManager : m_BackendMemoryMangers)
{
if (memoryManager)
{
memoryManager->Release();
}
}
m_TensorHandleFactoryRegistry.ReleaseMemory();
m_IsWorkingMemAllocated = false;
}
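// Executes the input, workload and output queues in order, recording per-workload
// timeline events when profiling is enabled. Returns false if a workload throws.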
bool LoadedNetwork::Execute(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
ProfilingGuid inferenceGuid)
{
bool success = true;
auto Fail = [&](const std::exception& error)
{
ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
success = false;
};
try
{
#if !defined(ARMNN_DISABLE_THREADS)
std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
AllocateWorkingMemory(lockGuard);
#else
AllocateWorkingMemory();
#endif
ProfilingDynamicGuid workloadInferenceID(0);
auto ExecuteQueue = [&timelineUtils, &workloadInferenceID, &inferenceGuid](WorkloadQueue& queue)
{
for (auto& workload : queue)
{
if(timelineUtils)
{
workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
inferenceGuid);
}
workload->Execute();
if(timelineUtils)
{
timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
}
}
};
ExecuteQueue(m_InputQueue);
ExecuteQueue(m_WorkloadQueue);
ExecuteQueue(m_OutputQueue);
}
catch (const RuntimeException& error)
{
Fail(error);
}
catch (const std::runtime_error& error)
{
Fail(error);
}
return success;
}
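// Async variant of input binding: imports the user's input memory into the given
// tensor handle, or falls back to a generic copy when import is not enabled.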
void LoadedNetwork::EnqueueInput(const ConstTensor& inputTensor, ITensorHandle* inputTensorHandle)
{
if (m_NetworkProperties.m_ImportEnabled) // Try to import the input tensor
{
MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
if (CheckFlag(importFlags, m_NetworkProperties.m_InputSource) )
{
std::unique_ptr<ITensorHandle> tensorHandle =
std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(),
inputTensor.GetMemoryArea());
void* mem = tensorHandle->Map(false);
if (inputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource))
{
tensorHandle->Unmap();
return;
}
tensorHandle->Unmap();
throw MemoryImportException("EnqueueInput: Memory Import failed");
}
else
{
throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import");
}
}
else
{
std::unique_ptr<ITensorHandle> tensorHandle =
std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
auto copyFunc = [](void* dst, const void* src, size_t size)
{
memcpy(dst, src, size);
};
CopyTensorContentsGeneric(tensorHandle.get(), inputTensorHandle, copyFunc);
}
}
// Note: We can only import the output pointer if all of the following hold true:
// a) The imported pointer is aligned sufficiently
// b) The tensor has zero padding
// c) There is only one connection to the OutputSlot and it is to an OutputLayer.
// d) The output pointer is allocated via malloc. (Other types will be supported in a later release)
// e) m_ExportEnabled must be set to true
void LoadedNetwork::ImportOutputTensor(const Tensor& outputTensor, ITensorHandle* outputTensorHandle)
{
ARMNN_ASSERT_MSG(outputTensorHandle != nullptr, "Data should have been allocated.");
MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
{
std::unique_ptr<ITensorHandle> tensorHandle =
std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(),
outputTensor.GetMemoryArea());
void* mem = tensorHandle->Map(false);
bool importOk = outputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
tensorHandle->Unmap();
if (!importOk)
{
throw MemoryExportException("ImportOutputTensor: Memory Export failed");
}
}
else
{
throw MemoryExportException("ImportOutputTensor: Memory Export failed, attempting to export Input Layer");
}
}
void CopyToOutputTensor(const Tensor& outputTensor, ITensorHandle* outputTensorHandle)
{
auto copyFunc = [](void* dst, const void* src, size_t size)
{
memcpy(dst, src, size);
};
std::unique_ptr<ITensorHandle> tensorHandle =
std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(),
outputTensor.GetMemoryArea());
CopyTensorContentsGeneric(outputTensorHandle, tensorHandle.get(), copyFunc);
}
const armnn::ConstTensor GetInputTensor(const LayerBindingId layerId, const InputTensors& inputTensors)
{
for (auto inputTensorPair : inputTensors)
{
LayerBindingId id = inputTensorPair.first;
if (id == layerId)
{
return inputTensorPair.second;
}
}
throw InvalidArgumentException("Input does not exist.");
}
const armnn::Tensor GetOutputTensor(const LayerBindingId layerId, const OutputTensors& outputTensors)
{
for (auto outputTensorPair : outputTensors)
{
LayerBindingId id = outputTensorPair.first;
if (id == layerId)
{
return outputTensorPair.second;
}
}
throw InvalidArgumentException("Output does not exist.");
}
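// Pre-imports user-allocated input buffers so later inferences can skip the input
// copy. The returned ids are passed back via the preImportedInputIds argument of
// EnqueueWorkload/Execute. Client-side sketch (API shape depends on the Arm NN
// version; names are illustrative):
//   auto importedIds = runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);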
std::vector<ImportedInputId> LoadedNetwork::ImportInputs(const InputTensors& inputTensors,
MemorySource forceImportMemorySource)
{
if (!m_NetworkProperties.m_AsyncEnabled)
{
// Cannot import if import is not enabled and forceImportMemorySource is undefined
if (forceImportMemorySource == MemorySource::Undefined)
{
throw MemoryImportException("ImportInputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
}
if (inputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumInputs())
{
throw MemoryImportException("ImportInputs: Force Import failed, incorrect number of tensors");
}
std::vector<ImportedInputId> importedInputs;
Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
unsigned int inputIndex = 0;
for (const BindableLayer* inputLayer : graph.GetInputLayers())
{
auto outputTensorHandle = m_PreImportedInputHandles[inputIndex].m_TensorHandle.get();
if (!outputTensorHandle)
{
inputIndex++;
continue;
}
auto layerBindingId = inputLayer->GetBindingId();
auto it = std::find_if(inputTensors.begin(), inputTensors.end(), [=](const auto& inputTensor)
{
return inputTensor.first == layerBindingId;
});
if (it == inputTensors.end())
{
inputIndex++;
continue;
}
const auto& inputTensor = *it;
std::unique_ptr<ITensorHandle> passThroughTensorHandle =
std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
inputTensor.second.GetMemoryArea());
if (outputTensorHandle->CanBeImported(passThroughTensorHandle->Map(), forceImportMemorySource)
&& (outputTensorHandle->Import(passThroughTensorHandle->Map(), forceImportMemorySource)))
{
importedInputs.push_back(inputIndex);
}
passThroughTensorHandle->Unmap();
inputIndex++;
}
return importedInputs;
}
else
{
// Import when import is enabled in the network properties
std::vector<ImportedInputId> importedInputs;
Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
for (auto inputTensor : inputTensors)
{
auto layerBindingId = inputTensor.first;
auto it = std::find_if(graph.GetInputLayers().begin(), graph.GetInputLayers().end(), [=](auto* layer)
{
return layer->GetBindingId() == layerBindingId;
});
if (it == graph.GetInputLayers().end())
{
throw MemoryImportException(fmt::format(
"ImportInputs: Memory Import failed, unknown LayerBindingId: {}", layerBindingId));
}
const Layer* layer = *it;
if (layer->GetType() != LayerType::Input)
{
throw InvalidArgumentException("ImportInputs: given layer not an InputLayer");
}
auto& backend = m_Backends.at(layer->GetBackendId());
if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
{
std::string er = backend->GetId();
er += " does not have PreImportIOTensors capability";
throw BackendCapabilityException(er);
}
const OutputSlot& outputSlot = layer->GetOutputSlots()[0];
ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
ARMNN_ASSERT(handleFactory);
ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
handleFactory->CreateTensorHandle(tensorInfo, false)};
ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();
if (!CheckFlag(tensorHandle->GetImportFlags(), m_NetworkProperties.m_InputSource))
{
throw MemoryImportException(
fmt::format("ImportInputs: Memory Import failed, backend: "
"{} does not support importing from source {}"
, factoryId, m_NetworkProperties.m_InputSource));
}
std::unique_ptr<ITensorHandle> passThroughTensorHandle =
std::make_unique<ConstPassthroughTensorHandle>(inputTensor.second.GetInfo(),
inputTensor.second.GetMemoryArea());
if (tensorHandle->Import(passThroughTensorHandle->Map(), m_NetworkProperties.m_InputSource))
{
importedInputs.push_back(m_CurImportedInputId++);
passThroughTensorHandle->Unmap();
}
else
{
passThroughTensorHandle->Unmap();
throw MemoryImportException("ImportInputs: Memory Import failed");
}
m_PreImportedInputHandles.push_back(std::move(importedTensorHandlePin));
}
return importedInputs;
}
}
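// Counterpart of ImportInputs: pre-imports user-allocated output buffers so results
// can be written directly into user memory without a final copy.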
std::vector<ImportedOutputId> LoadedNetwork::ImportOutputs(const OutputTensors& outputTensors,
MemorySource forceImportMemorySource)
{
if (!m_NetworkProperties.m_AsyncEnabled)
{
// Cannot import if import is not enabled and forceImportMemorySource is undefined
if (forceImportMemorySource == MemorySource::Undefined)
{
throw MemoryImportException("ImportOutputs: Memory Import failed, NetworkProperties.m_ImportEnabled");
}
// If forceImportMemorySource is defined, try to import if the memory is aligned
if (outputTensors.size() != m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetNumOutputs())
{
throw MemoryImportException("ImportOutputs: Force Import failed, incorrect number of tensors");
}
std::vector<ImportedOutputId> importedOutputs;
Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
unsigned int outputIndex = 0;
for (const BindableLayer* const outputLayer : graph.GetOutputLayers())
{
auto inputTensorHandle = m_PreImportedOutputHandles[outputIndex].m_TensorHandle.get();
if (!inputTensorHandle)
{
outputIndex++;
continue;
}
auto layerBindingId = outputLayer->GetBindingId();
auto it = std::find_if(outputTensors.begin(), outputTensors.end(), [=] (const auto& outputTensor)
{
return outputTensor.first == layerBindingId;
});
if (it == outputTensors.end())
{
outputIndex++;
continue;
}
const auto outputTensor = *it;
// Check if the output memory can be imported
if (inputTensorHandle->CanBeImported(outputTensor.second.GetMemoryArea(), forceImportMemorySource)
&& inputTensorHandle->Import(outputTensor.second.GetMemoryArea(), forceImportMemorySource))
{
importedOutputs.push_back(outputIndex);
}
outputIndex++;
}
return importedOutputs;
}
std::vector<ImportedOutputId> importedOutputs;
Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
for (const auto& outputTensor : outputTensors)
{
auto layerBindingId = outputTensor.first;
auto it = std::find_if(graph.GetOutputLayers().begin(), graph.GetOutputLayers().end(), [=](auto* layer)
{
return layer->GetBindingId() == layerBindingId;
});
if (it == graph.GetOutputLayers().end())
{
throw MemoryImportException(fmt::format("ImportOutputs: Memory Import failed, unknown LayerBindingId: {}",
layerBindingId));
}
const Layer* layer = *it;
if (layer->GetType() != LayerType::Output)
{
throw InvalidArgumentException("ImportOutputs: given layer not an OutputLayer");
}
auto& backend = m_Backends.at(layer->GetBackendId());
if (!HasCapability(BackendOptions::BackendOption{"PreImportIOTensors", true}, backend->GetCapabilities()))
{
std::string er = backend->GetId();
er += " does not have PreImportIOTensors capability";
throw BackendCapabilityException(er);
}
const InputSlot& inputSlot = layer->GetInputSlots()[0];
ITensorHandleFactory::FactoryId factoryId = inputSlot.GetConnectedOutputSlot()->GetTensorHandleFactoryId();
const TensorInfo& tensorInfo = inputSlot.GetConnectedOutputSlot()->GetTensorInfo();
ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
ARMNN_ASSERT(handleFactory);
ImportedTensorHandlePin importedTensorHandlePin{layerBindingId,
handleFactory->CreateTensorHandle(tensorInfo, false)};
ITensorHandle* tensorHandle = importedTensorHandlePin.m_TensorHandle.get();
if (!CheckFlag(tensorHandle->GetImportFlags(), m_NetworkProperties.m_OutputSource))
{
throw MemoryImportException(fmt::format("ImportInputs: Memory Import failed, backend: "
"{} does not support importing from source {}"
, factoryId, m_NetworkProperties.m_OutputSource));
}
if (tensorHandle->Import(outputTensor.second.GetMemoryArea(), m_NetworkProperties.m_OutputSource))
{
importedOutputs.push_back(m_CurImportedOutputId++);
}
else
{
throw MemoryImportException("ImportInputs: Memory Import failed");
}
m_PreImportedOutputHandles.push_back(std::move(importedTensorHandlePin));
}
return importedOutputs;
}
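// Unimports and destroys the tensor handles created by ImportInputs, invalidating the
// given ids. ClearImportedOutputs below does the same for imported outputs.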
void LoadedNetwork::ClearImportedInputs(const std::vector<ImportedInputId> inputIds)
{
for (auto id : inputIds)
{
if (id >= m_PreImportedInputHandles.size())
{
throw InvalidArgumentException(fmt::format("ClearImportedInputs::Unknown ImportedInputId: {}", id));
}
auto& importedTensorHandle = m_PreImportedInputHandles[id].m_TensorHandle;
if (!importedTensorHandle)
{
throw InvalidArgumentException(
fmt::format("ClearImportedInputs::ImportedInput with id: {} has already been deleted", id));
}
// Call Unimport then destroy the tensorHandle
importedTensorHandle->Unimport();
importedTensorHandle = {};
}
}
void LoadedNetwork::ClearImportedOutputs(const std::vector<ImportedOutputId> outputIds)
{
for (auto id : outputIds)
{
if (id >= m_PreImportedOutputHandles.size())
{
throw InvalidArgumentException(fmt::format("ClearImportedOutputs::Unknown ImportedOutputId: {}", id));
}
auto& importedTensorHandle = m_PreImportedOutputHandles[id].m_TensorHandle;
if (!importedTensorHandle)
{
throw InvalidArgumentException(
fmt::format("ClearImportedOutputs::ImportedOutput with id: {} has already been deleted", id));
}
// Call Unimport then destroy the tensorHandle
importedTensorHandle->Unimport();
importedTensorHandle = {};
}
}
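// Asynchronous/threaded execution path: runs the workloads through a caller-owned
// WorkingMemHandle so several inferences of the same network can run concurrently.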
Status LoadedNetwork::Execute(const InputTensors& inputTensors,
const OutputTensors& outputTensors,
IWorkingMemHandle& iWorkingMemHandle,
std::vector<ImportedInputId> preImportedInputs,
std::vector<ImportedOutputId> preImportedOutputs)
{
const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
if (inputTensors.size() + preImportedInputs.size() != graph.GetNumInputs())
{
if (preImportedInputs.empty())
{
throw InvalidArgumentException("LoadedNetwork::Execute: Number of inputs provided does not match network.");
}
else
{
throw InvalidArgumentException("LoadedNetwork::Execute: "
"Number of inputs + preImportedInputs provided does not match network.");
}
}
if (outputTensors.size() + preImportedOutputs.size() != graph.GetNumOutputs())
{
if (preImportedOutputs.empty())
{
throw InvalidArgumentException("LoadedNetwork::Execute: "
"Number of outputs provided does not match network.");
}
else
{
throw InvalidArgumentException("LoadedNetwork::Execute: "
"Number of outputs + preImportedOutputs provided does not match network.");
}
}
WorkingMemHandle& workingMemHandle = dynamic_cast<WorkingMemHandle&>(iWorkingMemHandle);
// Collect all the given LayerBindingIds and check them for duplicates and unknowns.
std::vector<LayerBindingId>& bindingIds = workingMemHandle.GetBindingIdVector();
unsigned int index = 0;
for (auto pair : inputTensors)
{
bindingIds[index++] = pair.first;
}
for (ImportedInputId id : preImportedInputs)
{
bindingIds[index++] = ValidateImportedInputID(id);
}
for (auto pair : outputTensors)
{
bindingIds[index++] = pair.first;
}
for (ImportedOutputId id : preImportedOutputs)
{
bindingIds[index++] = ValidateImportedOutputID(id);
}
workingMemHandle.ValidateBindingIds();
auto resetMemHandle = [&]()
{
for (ImportedInputId id: preImportedInputs)
{
const LayerBindingId layerBindingId = m_PreImportedInputHandles[id].m_LayerBindingId;
auto inputHandle = workingMemHandle.GetInputHandle(layerBindingId);
auto inputConnections = workingMemHandle.GetInputConnections(layerBindingId);
for (auto it : inputConnections)
{
*it = inputHandle;
}
}
for (ImportedOutputId id: preImportedOutputs)
{
const LayerBindingId layerBindingId = m_PreImportedOutputHandles[id].m_LayerBindingId;
auto outputHandle = workingMemHandle.GetOutputHandle(layerBindingId);
auto outputConnections = workingMemHandle.GetOutputConnection(layerBindingId);
for (auto it : outputConnections)
{
*it = outputHandle;
}
}
};
std::unique_ptr<TimelineUtilityMethods> timelineUtils =
TimelineUtilityMethods::GetTimelineUtils(*m_ProfilingService);
ProfilingGuid inferenceGuid = m_ProfilingService->GetNextGuid();
if (timelineUtils)
{
// Add inference timeline trace if profiling is enabled.
ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
timelineUtils->CreateTypedEntity(inferenceGuid,LabelsAndEventClasses::INFERENCE_GUID);
timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
networkGuid,
inferenceGuid,
LabelsAndEventClasses::EXECUTION_OF_GUID);
timelineUtils->RecordEvent(inferenceGuid,LabelsAndEventClasses::ARMNN_PROFILING_SOL_EVENT_CLASS);
}
bool executionSucceeded = true;
if (!workingMemHandle.IsAllocated())
{
workingMemHandle.Allocate();
}
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
for (auto pair : inputTensors)
{
EnqueueInput(pair.second, workingMemHandle.GetInputHandle(pair.first));
}
// Swap in the pre-imported inputs if any
for (ImportedInputId id : preImportedInputs)
{
const ImportedTensorHandlePin& importedInputPin = m_PreImportedInputHandles[id];
const LayerBindingId layerBindingId = m_PreImportedInputHandles[id].m_LayerBindingId;
const auto& preimportedHandle = importedInputPin.m_TensorHandle;
auto inputConnections = workingMemHandle.GetInputConnections(layerBindingId);
for (auto it : inputConnections)
{
*it = preimportedHandle.get();
}
}
}
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
if (m_NetworkProperties.m_ExportEnabled)
{
for (auto pair: outputTensors)
{
ImportOutputTensor(pair.second, workingMemHandle.GetOutputHandle(pair.first));
}
}
for (ImportedOutputId id : preImportedOutputs)
{
const ImportedTensorHandlePin& importedOutputPin = m_PreImportedOutputHandles[id];
const LayerBindingId layerBindingId = m_PreImportedOutputHandles[id].m_LayerBindingId;
const auto& preimportedHandle = importedOutputPin.m_TensorHandle;
auto outputConnections = workingMemHandle.GetOutputConnection(layerBindingId);
for (auto it : outputConnections)
{
*it = preimportedHandle.get();
}
}
}
auto Fail = [&](const std::exception& error)
{
ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
executionSucceeded = false;
};
ProfilingDynamicGuid workloadInferenceID(0);
try
{
for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i)
{
auto& workload = m_WorkloadQueue[i];
if (timelineUtils)
{
workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
inferenceGuid);
}
workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i));
if (timelineUtils)
{
timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
}
}
}
catch (const RuntimeException& error)
{
resetMemHandle();
Fail(error);
}
catch (const std::runtime_error& error)
{
resetMemHandle();
Fail(error);
}
catch (...)
{
resetMemHandle();
throw;
}
if (!m_NetworkProperties.m_ExportEnabled)
{
for (auto pair: outputTensors)
{
CopyToOutputTensor(pair.second, workingMemHandle.GetOutputHandle(pair.first));
}
}
else
{
ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute");
workingMemHandle.MemSyncOutputs();
}
resetMemHandle();
if (timelineUtils)
{
// Add end of life of the inference timeline now that execution has finished.
timelineUtils->RecordEvent(inferenceGuid,LabelsAndEventClasses::ARMNN_PROFILING_EOL_EVENT_CLASS);
timelineUtils->Commit();
}
return executionSucceeded ? Status::Success : Status::Failure;
}
/// Create a new unique WorkingMemHandle object. Create multiple handles if you wish to have
/// overlapped execution by calling this function from different threads.
std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
{
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
// Tensors that will need to be allocated internally within armnn
std::vector<std::unique_ptr<ITensorHandle>> managedTensorHandles;
// Tensors that will be allocated externally by the user
std::vector<std::unique_ptr<ITensorHandle>> unmanagedTensorHandles;
std::vector<WorkingMemDescriptor> workingMemDescriptors;
std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot)
{
ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();
if (factoryId == ITensorHandleFactory::LegacyFactoryId)
{
BackendId id = layer->GetBackendId();
ARMNN_NO_DEPRECATE_WARN_BEGIN
return m_WorkloadFactories.at(id)->CreateTensorHandle(tensorInfo, false);
ARMNN_NO_DEPRECATE_WARN_END
}
else
{
ITensorHandleFactory* handleFactory = m_TensorHandleFactoryRegistry.GetFactory(factoryId);
ARMNN_ASSERT(handleFactory);
return handleFactory->CreateTensorHandle(tensorInfo, false);
}
};
struct HandleInfo
{
ITensorHandle* m_TensorHandle;
bool m_IsInputLayerHandle = false;
bool m_IsOutputLayerHandle = false;
WorkingMemHandle::InputMemDescriptorCoords m_InputMemDescriptorCoords;
WorkingMemHandle::OutputMemDescriptorCoords m_OutputMemDescriptorCoords;
};
std::unordered_map<const OutputSlot*, HandleInfo> outputToHandleInfoMap;
unsigned int layerIndex = 0;
for (auto&& layer : order)
{
// The execution and management of Constant layers are handled during loaded network construction
if (layer->GetType() == LayerType::Constant)
{
continue;
}
WorkingMemDescriptor workingMemDescriptor;
bool isMemoryManaged = true;
bool isInputLayer = false;
bool isOutputLayer = false;
bool isConnectedToOutputLayer = false;
if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::MemImport)
{
// Input layers/workloads will not be executed, so the descriptor is not added to workingMemDescriptors.
// However, we still need to manage the tensorHandle.
isInputLayer = true;
isMemoryManaged = !m_NetworkProperties.m_ImportEnabled;
}
else if (layer->GetType() == LayerType::Output)
{
isOutputLayer = true;
}
unsigned int slotIndex = 0;
// Create a tensor handle for each output slot of a layer
// Once we create it, we start managing its lifetime
for (auto& slot : layer->GetOutputSlots())
{
for (unsigned int i = 0; i < slot.GetNumConnections(); ++i)
{
if ((slot.GetConnection(i)->GetOwningLayer().GetType() == LayerType::Output))
{
if (!isConnectedToOutputLayer)
{
isConnectedToOutputLayer = true;
// If export is enabled, disable memory management so that we can export; otherwise we do a copy
isMemoryManaged = !m_NetworkProperties.m_ExportEnabled;
}
else
{
// Importing in this case would likely cause unexpected behaviour, so we disallow it.
ARMNN_LOG(warning) <<
fmt::format("Layer name: '{0}' guid: '{1}' has two or more OutputLayers connected to it. "
"This will prevent importing on the connected OutputLayers.",
layer->GetName(), layer->GetGuid());
isMemoryManaged = true;
}
}
}
ITensorHandle* tensorHandle;
if (isMemoryManaged)
{
managedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
tensorHandle = managedTensorHandles.back().get();
}
else
{
unmanagedTensorHandles.emplace_back(GetTensorHandle(layer, slot));
tensorHandle = unmanagedTensorHandles.back().get();
}
workingMemDescriptor.m_Outputs.push_back(tensorHandle);
HandleInfo& handleInfo = outputToHandleInfoMap[&slot];
handleInfo.m_TensorHandle = tensorHandle;
// Store the coordinates of the current layer's OutputSlot that is connected to the OutputLayer
if (isConnectedToOutputLayer)
{
handleInfo.m_IsOutputLayerHandle = true;
handleInfo.m_OutputMemDescriptorCoords.m_OutputSlotCoords = {layerIndex, slotIndex};
}
// Store the LayerBindingId of the InputLayer
if (isInputLayer)
{
handleInfo.m_IsInputLayerHandle = true;
LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
handleInfo.m_InputMemDescriptorCoords.m_LayerBindingId = bindingId;
}
slotIndex++;
}
// Loop through the input slots in the same layer and decrement the reference counter associated
// with each tensor handle we encounter.
// Once it reaches zero, the tensor handle's lifetime has ended, and we mark its memory as available
// so that the next tensor handle with a non-overlapping lifetime can share its memory.
for (auto& slot : layer->GetInputSlots())
{
ARMNN_ASSERT(slot.GetConnection());
auto outputSlot = slot.GetConnectedOutputSlot();
auto key = outputSlot->GetOwningLayer().GetGuid();
// The execution and management of Constant layers are handled during loaded network construction
auto found = m_ConstantTensorHandles.find(key);
if (found != m_ConstantTensorHandles.end())
{
ITensorHandle* tensorHandle = found->second;
workingMemDescriptor.m_Inputs.push_back(tensorHandle);
// Odd case where a Constant layer is connected to an Output layer;
// we need to create a HandleInfo to track it
if (isOutputLayer)
{
LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
HandleInfo& handleInfo = outputToHandleInfoMap[outputSlot];
handleInfo.m_TensorHandle = tensorHandle;
handleInfo.m_IsOutputLayerHandle = true;
handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId);
handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0});
}
continue;
}
HandleInfo& handleInfo = outputToHandleInfoMap.at(outputSlot);
ITensorHandle* inputTensorHandle = handleInfo.m_TensorHandle;
workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
// Store the LayerBindingId of the OutputLayer
if (isOutputLayer)
{
LayerBindingId bindingId = static_cast<BindableLayer*>(layer)->GetBindingId();
handleInfo.m_OutputMemDescriptorCoords.m_LayerBindingIds.push_back(bindingId);
handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, 0});
}
// In this case the layer is not an Output layer but shares its input tensor handle with an OutputLayer.
// It will need to be updated as well if we swap out the tensor handle.
else if (handleInfo.m_IsOutputLayerHandle)
{
handleInfo.m_OutputMemDescriptorCoords.m_InputSlotCoords.push_back({layerIndex, slot.GetSlotIndex()});
}
// Store the coordinates of the InputSlots connected to the InputLayer
// There can be more than one InputSlot connected to an InputLayer, so we use a vector
if (handleInfo.m_IsInputLayerHandle)
{
std::pair<LayerGuid, unsigned int> connectionLocation{layerIndex, slot.GetSlotIndex()};
handleInfo.m_InputMemDescriptorCoords.m_InputSlotCoords.emplace_back(connectionLocation);
}
}
workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor});
// Input/Output layers/workloads will not be executed, so the descriptor is not added to workingMemDescriptors.
// However, we still need to manage the tensorHandle.
if (!isInputLayer)
{
workingMemDescriptors.push_back(workingMemDescriptor);
layerIndex++;
}
}
std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>> tensorMemory;
auto externalMemoryManager = CreateExternalMemoryManger(tensorMemory);
// Sort tensorMemory so its order matches the outputSlot order
std::sort(tensorMemory.begin(), tensorMemory.end(),
[](const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& lhs,
const std::pair<std::shared_ptr<TensorMemory>, MemorySource>& rhs)
{
return lhs.first->m_OutputSlotId < rhs.first->m_OutputSlotId;
});
std::vector<WorkingMemHandle::InputMemDescriptorCoords> inputConnectionsInfo;
std::vector<WorkingMemHandle::OutputMemDescriptorCoords> outputConnectionsInfo;
for (const auto& handleInfo: outputToHandleInfoMap)
{
if (handleInfo.second.m_IsOutputLayerHandle)
{
outputConnectionsInfo.emplace_back(handleInfo.second.m_OutputMemDescriptorCoords);
}
if (handleInfo.second.m_IsInputLayerHandle)
{
inputConnectionsInfo.emplace_back(handleInfo.second.m_InputMemDescriptorCoords);
}
}
return std::make_unique<WorkingMemHandle>(networkId,
inputConnectionsInfo,
outputConnectionsInfo,
workingMemDescriptors,
workingMemDescriptorMap,
std::move(externalMemoryManager),
std::move(tensorMemory),
std::move(managedTensorHandles),
std::move(unmanagedTensorHandles));
}
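// Illustrative sketch (same assumptions as the sketch above): one handle per thread enables
// overlapped execution of the same loaded network, as each handle owns its own working memory:
//
//     auto handle0 = runtime->CreateWorkingMemHandle(networkId);
//     auto handle1 = runtime->CreateWorkingMemHandle(networkId);
//     std::thread t0([&] { runtime->Execute(*handle0, inputs0, outputs0); });
//     std::thread t1([&] { runtime->Execute(*handle1, inputs1, outputs1); });
//     t0.join();
//     t1.join();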
void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
{
for (auto&& workloadPtr: m_WorkloadQueue)
{
workloadPtr->RegisterDebugCallback(func);
}
}
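// DebugCallbackFunction is a std::function taking the layer guid, the output slot index and
// the slot's ITensorHandle, so intermediate tensors can be inspected. Illustrative sketch:
//
//     loadedNetwork->RegisterDebugCallback(
//         [](LayerGuid guid, unsigned int slotIndex, ITensorHandle* tensorHandle)
//         {
//             ARMNN_LOG(info) << "Output slot " << slotIndex << " of layer guid "
//                             << static_cast<uint64_t>(guid) << ": "
//                             << tensorHandle->GetShape().GetNumElements() << " elements";
//         });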
void LoadedNetwork::CreateMemoryProfileAsync()
{
struct PartialBlock
{
unsigned int m_StartOfLife;
unsigned int m_Lifetime;
size_t m_MemSize;
unsigned int m_Index;
BackendId m_BackendId;
};
auto align = [](size_t numToAlign)
{
const size_t alignment = sizeof(float);
return ((numToAlign + alignment - 1) / alignment) * alignment;
};
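// With alignment == sizeof(float) == 4 this rounds sizes up to the next multiple of 4,
// e.g. align(10) == 12 and align(8) == 8.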
std::unordered_map<const OutputSlot*, PartialBlock> memBlockTrackerMap;
const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
unsigned int timestep = 0;
unsigned int outputIndex = 0;
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
for (auto&& layer : order)
{
const LayerType& layerType = layer->GetType();
// Don't manage memory if importing.
if (layerType == LayerType::Input && inputImportingEnabled)
{
continue;
}
// Don't manage memory if importing.
if (layerType == LayerType::Output && outputImportingEnabled
&& layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
{
continue;
}
// Because Constant layer memory cannot be shared, it must persist for the lifetime of execution;
// its management is done separately.
if (layerType == LayerType::Constant)
{
continue;
}
BackendId backendId = layer->GetBackendId();
for (auto& outputSlot : layer->GetOutputSlots())
{
if (!m_SupportsExternallyManagedMemory[backendId])
{
continue;
}
PartialBlock partialBlock;
partialBlock.m_StartOfLife = timestep;
size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
partialBlock.m_MemSize = alignedSize;
partialBlock.m_Index = outputIndex++;
partialBlock.m_Lifetime = outputSlot.GetNumConnections();
partialBlock.m_BackendId = backendId;
if (partialBlock.m_Lifetime == 0)
{
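// MemBlock arguments are (startOfLife, endOfLife, size, offset, index); a block with no
// readers is born and dies in the same timestep.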
m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
partialBlock.m_StartOfLife,
partialBlock.m_MemSize,
0,
partialBlock.m_Index);
}
else
{
memBlockTrackerMap[&outputSlot] = partialBlock;
}
}
for (auto& inputSlot : layer->GetInputSlots())
{
const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
const LayerType& owningLayerType = connectedInputLayer.GetType();
if (owningLayerType == LayerType::Constant)
{
continue;
}
if (inputImportingEnabled && owningLayerType == LayerType::Input)
{
continue;
}
// Output slots on backends without externally managed memory were never added to
// memBlockTrackerMap above, so skip them here to avoid an out_of_range exception.
if (!m_SupportsExternallyManagedMemory[connectedInputLayer.GetBackendId()])
{
continue;
}
auto outputSlot = inputSlot.GetConnectedOutputSlot();
PartialBlock& partialBlock = memBlockTrackerMap.at(outputSlot);
auto& lifetime = partialBlock.m_Lifetime;
--lifetime;
if (lifetime == 0)
{
m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
timestep,
partialBlock.m_MemSize,
0,
partialBlock.m_Index);
}
}
++timestep;
}
}
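// Note: unlike CreateMemoryProfile() below, this async variant keys blocks by OutputSlot
// rather than by ITensorHandle, because tensor handles are not created until
// CreateWorkingMemHandle() runs.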
void LoadedNetwork::CreateMemoryProfile()
{
// Finds the root TensorHandle ancestor of a SubTensorHandle. If the ITensorHandle provided
// is already a TensorHandle, the function just returns it
auto TraceSubTensorHandleAncestry = [](ITensorHandle* const subTensorHandle)
{
ITensorHandle* ancestor = subTensorHandle;
while (ancestor && ancestor->GetParent())
{
ancestor = ancestor->GetParent();
}
return ancestor;
};
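// For example, the sub-tensors a backend may create under a concat all resolve to the same
// parent TensorHandle, so their shared memory is profiled only once.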
struct PartialBlock
{
unsigned int m_StartOfLife;
unsigned int m_Lifetime;
size_t m_MemSize;
unsigned int m_Index;
BackendId m_BackendId;
};
auto align = [](size_t numToAlign)
{
const size_t alignment = sizeof(float);
return ((numToAlign + alignment - 1) / alignment) * alignment;
};
std::unordered_map<ITensorHandle*, PartialBlock> memBlockTrackerMap;
const bool inputImportingEnabled = m_NetworkProperties.m_InputSource != MemorySource::Undefined;
const bool outputImportingEnabled = m_NetworkProperties.m_OutputSource != MemorySource::Undefined;
unsigned int timestep = 0;
unsigned int outputIndex = 0;
Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
for (auto&& layer : order)
{
const LayerType& layerType = layer->GetType();
// Don't manage memory if importing.
if (layerType == LayerType::Input && inputImportingEnabled)
{
continue;
}
// Don't manage memory if importing.
if (layerType == LayerType::Output && outputImportingEnabled
&& layer->GetInputSlot(0).GetConnectedOutputSlot()->GetNumConnections() == 1)
{
continue;
}
// Because Constant layer memory cannot be shared, it must persist for the lifetime of execution;
// its management is done separately.
if (layerType == LayerType::Constant)
{
continue;
}
BackendId backendId = layer->GetBackendId();
for (auto& outputSlot : layer->GetOutputSlots())
{
if (!m_SupportsExternallyManagedMemory[backendId])
{
continue;
}
ITensorHandle* tensorHandle = outputSlot.GetOutputHandler().GetData();
tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
if (memBlockTrackerMap.find(tensorHandle) == memBlockTrackerMap.end())
{
PartialBlock partialBlock;
partialBlock.m_StartOfLife = timestep;
size_t alignedSize = align(outputSlot.GetOutputHandler().GetTensorInfo().GetNumBytes());
partialBlock.m_MemSize = alignedSize;
partialBlock.m_Index = outputIndex++;
partialBlock.m_Lifetime = outputSlot.GetNumConnections();
partialBlock.m_BackendId = backendId;
if (partialBlock.m_Lifetime == 0)
{
m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
partialBlock.m_StartOfLife,
partialBlock.m_MemSize,
0,
partialBlock.m_Index);
}
else
{
memBlockTrackerMap[tensorHandle] = partialBlock;
}
m_Tensorhandles.push_back(tensorHandle);
}
else
{
memBlockTrackerMap.at(tensorHandle).m_Lifetime += outputSlot.GetNumConnections();
}
}
for (auto& inputSlot : layer->GetInputSlots())
{
const Layer& connectedInputLayer = inputSlot.GetConnectedOutputSlot()->GetOwningLayer();
const LayerType& owningLayerType = connectedInputLayer.GetType();
if (owningLayerType == LayerType::Constant)
{
continue;
}
if (inputImportingEnabled && owningLayerType == LayerType::Input)
{
continue;
}
if (!m_SupportsExternallyManagedMemory[connectedInputLayer.GetBackendId()])
{
continue;
}
auto outputSlot = inputSlot.GetConnectedOutputSlot();
ITensorHandle* tensorHandle = outputSlot->GetOutputHandler().GetData();
tensorHandle = TraceSubTensorHandleAncestry(tensorHandle);
PartialBlock& partialBlock = memBlockTrackerMap.at(tensorHandle);
auto& lifetime = partialBlock.m_Lifetime;
--lifetime;
if (lifetime == 0)
{
m_MemBlockMap[partialBlock.m_BackendId].emplace_back(partialBlock.m_StartOfLife,
timestep,
partialBlock.m_MemSize,
0,
partialBlock.m_Index);
}
}
++timestep;
}
}
std::unique_ptr<MemoryManager> LoadedNetwork::CreateExternalMemoryManger(
std::vector<std::pair<std::shared_ptr<TensorMemory>, MemorySource>>& tensorMemoryVec)
{
std::unique_ptr<MemoryManager> memoryManager = std::make_unique<MemoryManager>();
auto allocatorMap = BackendRegistryInstance().GetAllocators();
for (auto& backend : m_MemBinMap)
{
std::vector<BufferStorage> bufferStorageVec;
std::shared_ptr<ICustomAllocator> backendAllocator;
if (allocatorMap.find(backend.first) != allocatorMap.end())
{
backendAllocator = allocatorMap[backend.first];
}
else
{
backendAllocator = m_Backends[backend.first]->GetDefaultAllocator();
}
for (auto& memBin : backend.second)
{
BufferStorage bufferStorage;
bufferStorage.m_BufferSize = memBin.m_MemSize;
bufferStorage.m_TensorMemoryVector.reserve(memBin.m_MemBlocks.size());
for (auto& memBlock : memBin.m_MemBlocks)
{
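// Each TensorMemory records the offset of this block within its parent buffer and the
// output slot it belongs to; the backing pointer is assigned later, when the
// MemoryManager allocates the buffers.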
auto tensorMemory = std::make_shared<TensorMemory>(TensorMemory{memBlock.m_Offset, memBlock.m_Index});
tensorMemoryVec.emplace_back(tensorMemory, backendAllocator->GetMemorySourceType());
bufferStorage.m_TensorMemoryVector.emplace_back(tensorMemory);
}
bufferStorageVec.emplace_back(std::move(bufferStorage));
}
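// The trailing literal is the required alignment (in bytes) passed through to the allocator.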
memoryManager->StoreMemToAllocate(bufferStorageVec, backendAllocator, 4);
}
return memoryManager;
}
LayerBindingId LoadedNetwork::ValidateImportedInputID(ImportedInputId id)
{
try
{
const auto& importedTensorHandlePin = m_PreImportedInputHandles.at(id);
if (!importedTensorHandlePin.m_TensorHandle)
{
throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute:"
"PreImportedInput: {} has been deleted", id));
}
return importedTensorHandlePin.m_LayerBindingId;
}
catch (const std::out_of_range&)
{
throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: Unknown ImportedInputId: {}", id));
}
}
LayerBindingId LoadedNetwork::ValidateImportedOutputID(ImportedOutputId id)
{
try
{
const auto& importedTensorHandlePin = m_PreImportedOutputHandles.at(id);
if (!importedTensorHandlePin.m_TensorHandle)
{
throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: "
"PreImportedOutput: {} has been deleted", id));
}
return importedTensorHandlePin.m_LayerBindingId;
}
catch (const std::out_of_range&)
{
throw InvalidArgumentException(fmt::format("LoadedNetwork::Execute: Unknown ImportedOutputId: {}", id));
}
}
} // namespace armnn