//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinaryAdd.hpp"

namespace armnn
{

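// Deleter used with PreCompiledObjectPtr: casts the type-erased blob back to T before deleting it.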
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

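// Collects pointers to all of a layer's input slots, for use when building a SubgraphView.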
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

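// Collects pointers to all of a layer's output slots, for use when building a SubgraphView.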
inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

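// Wraps the given slots and layers in a SubgraphView, ready to be substituted into the graph.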
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

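// Returns the statically allocated identifier for the GpuFsa backend.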
const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

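// Creates the backend memory manager, backed by either the user-supplied custom allocator
// or the default CL buffer allocator.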
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

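// Creates a workload factory that uses an externally supplied memory manager.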
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

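// Creates a memory manager and tensor handle factory, registers both with the registry,
// and returns a workload factory that shares the memory manager.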
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

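// As above, but also normalises the import/export MemorySourceFlags: Undefined is treated
// as Malloc so that force import can still be attempted.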
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

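// Lists the tensor handle factories this backend prefers, in priority order.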
std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

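// Registers this backend's memory manager and tensor handle factory with the registry.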
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

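// Registers the tensor handle factory as above, treating Undefined import/export flags as Malloc
// so that force import can still be attempted.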
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

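// Creates the GpuFsa backend context from the runtime creation options.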
IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

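// No backend profiling context is provided; an empty pointer is returned.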
IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

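// Returns the shared layer support object used to query which layers this backend can handle.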
IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

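// Returns the default custom allocator for this backend.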
std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

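// Replaces each supported layer in the subgraph (Convolution2d, DepthwiseConvolution2d and
// ElementwiseBinary Add) with a PreCompiled layer whose GpuFsaPreCompiledBlob holds the
// dynamic fusion workload sketch for that operator.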
OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

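    // Record every layer in the subgraph so any that are not substituted can be reported later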
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext, which will exist for the lifetime of the graph. This contains the TensorInfos
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));

        // Create a GpuFsaPreCompiledBlob; this contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and set up the sketch for each supported op. Their data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case LayerType::Convolution2d:
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case LayerType::DepthwiseConvolution2d:
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
            case LayerType::ElementwiseBinary:
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                if (desc->m_Operation == BinaryOperation::Add)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinaryAddCreateOp(preCompiledBlobPtr, input0, input1);
                }
                break;
            }
            default:
                // Unsupported layer for the GpuFsa backend
                continue;
        }

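        // Wrap the blob in a unique_ptr with a type-aware deleter so ownership passes to the PreCompiled layer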
        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

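        // Build a single-layer subgraph for this layer and substitute it with the PreCompiled layer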
        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

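    // If nothing was substituted, hand the whole subgraph back untouched; otherwise report the layers left behind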
    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn