//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinaryAdd.hpp"
#include "layers/GpuFsaElementwiseBinarySub.hpp"
#include "layers/GpuFsaPooling2d.hpp"

namespace armnn
{

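// Deleter used with the type-erased PreCompiledObjectPtr below: casts the opaque blob
// pointer back to its concrete type before deleting it.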
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

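// Collects every input slot of the given layer so it can be wrapped in a single-layer SubgraphView.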
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

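// Collects every output slot of the given layer, mirroring CreateInputsFrom.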
inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

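// Builds the SubgraphView describing the region of the graph that a PreCompiled layer will replace.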
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

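// Static identifier returned by GpuFsaBackendId(), used to register and select this backend.
// Typical selection at optimize time looks roughly like this (sketch, assuming an existing
// IRuntimePtr runtime and INetworkPtr network):
//     std::vector<armnn::BackendId> backends = { armnn::GpuFsaBackend::GetIdStatic() };
//     auto optNet = armnn::Optimize(*network, backends, runtime->GetDeviceSpec());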
const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

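// Memory manager creation: prefer a user-supplied custom allocator, otherwise fall back to
// Compute Library's CLBufferAllocator.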
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

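// Variant that also builds the tensor handle factory and registers both it and the memory
// manager with the supplied registry.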
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

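// Variant taking memory source flags for import/export; Undefined flags are treated as
// Malloc so that forced import remains possible.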
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow forced import when inputFlags/outputFlags are Undefined, treat them as Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

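// Registers the GpuFsa tensor handle factory and its memory manager with the registry,
// mirroring the setup performed in CreateWorkloadFactory.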
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow forced import when inputFlags/outputFlags are Undefined, treat them as Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

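// Walks the sub-graph back to front and, for each layer this backend supports, records a
// dynamic fusion sketch in a GpuFsaPreCompiledBlob, then substitutes the layer with a
// PreCompiled layer carrying that blob.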
OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

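    // Track every layer in the sub-graph; entries are erased as layers are fused, so whatever
    // remains at the end is reported back as untouched.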
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Setup the GpuWorkloadContext which will exist for the lifetime of the Graph. This contains the TensorInfos
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
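    // Second pass over the sub-graph: build one sketch per supported layer.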
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob; this contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and setup the sketch for each supported op. Their data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
            case LayerType::ElementwiseBinary:
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                if (desc->m_Operation == BinaryOperation::Add)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinaryAddCreateOp(preCompiledBlobPtr, input0, input1);
                }
                else if (desc->m_Operation == BinaryOperation::Sub)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinarySubCreateOp(preCompiledBlobPtr, input0, input1);
                }
                break;
            }
            case (LayerType::Pooling2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const Pooling2dDescriptor*>(&base.GetParameters());
                GpuFsaPooling2dCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            default:
                // unsupported layer for GpuFsa backend
                continue;
        }

        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from the sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

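    // If nothing was fused, hand the whole sub-graph back untouched; otherwise report the
    // layers that were not substituted so the optimizer can handle them elsewhere.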
    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn