//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaCast.hpp"
#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinaryAdd.hpp"
#include "layers/GpuFsaElementwiseBinarySub.hpp"
#include "layers/GpuFsaPooling2d.hpp"
#include "layers/GpuFsaResize.hpp"

namespace armnn
{

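// Deleter for the type-erased blob held in a PreCompiledObjectPtr: restores the concrete
// type T before deleting, so the correct destructor runs.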
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

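// Collects pointers to all of a layer's input slots, in slot order, for building a SubgraphView.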
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

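// Collects pointers to all of a layer's output slots, in slot order, for building a SubgraphView.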
inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

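// Wraps the given slots and layers in a new SubgraphView, transferring ownership of the containers.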
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

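// Returns the statically allocated BackendId identifying the GpuFsa backend.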
const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

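// Creates the memory manager, backed by the registered custom allocator if there is one,
// otherwise by a CLBufferAllocator.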
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

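// Creates a workload factory that shares an existing memory manager.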
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

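// Creates a workload factory, registering the backend's memory manager and tensor handle
// factory with the given registry.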
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

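// As above, but also normalises the import/export MemorySourceFlags: Undefined is treated
// as Malloc so that force-import can still be attempted.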
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

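// The GpuFsa backend currently exposes a single tensor handle factory preference.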
std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

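// Registers the backend's memory manager and tensor handle factory with the given registry.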
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

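// As above, but also normalises the import/export MemorySourceFlags: Undefined is treated
// as Malloc so that force-import can still be attempted.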
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

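// Creates the backend context, which manages the CL runtime state for the lifetime of the runtime.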
IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

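// No profiling context is provided for GpuFsa; an empty pointer is returned.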
IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

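// Returns the shared layer support object used to query which layers GpuFsa can handle.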
IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

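// Returns the default allocator used when no custom allocator has been registered.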
std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

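// Substitutes each supported layer in the subgraph with a PreCompiled layer wrapping a fused
// dynamic-fusion workload sketch. A minimal usage sketch (illustrative only, not part of this
// file): callers normally reach this indirectly by optimizing a network for this backend, e.g.
//
//     std::vector<BackendId> backends = { GpuFsaBackend::GetIdStatic() };
//     IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());
//
// Optimize() then invokes OptimizeSubgraphView() on every subgraph assigned to GpuFsa.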
OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

    // Record every layer in the subgraph as untouched; supported layers are erased as they are substituted
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext which will exist for the lifetime of the Graph. This contains the TensorInfos
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob, which contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and set up the sketch for each supported op. Their data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case (LayerType::Cast):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();
                GpuFsaCastCreateOp(preCompiledBlobPtr, input, output);
                break;
            }
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
            case LayerType::ElementwiseBinary:
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                if (desc->m_Operation == BinaryOperation::Add)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinaryAddCreateOp(preCompiledBlobPtr, input0, input1);
                }
                else if (desc->m_Operation == BinaryOperation::Sub)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinarySubCreateOp(preCompiledBlobPtr, input0, input1);
                }
                break;
            }
            case (LayerType::Pooling2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const Pooling2dDescriptor*>(&base.GetParameters());
                GpuFsaPooling2dCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Resize):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const ResizeDescriptor*>(&base.GetParameters());
                GpuFsaResizeCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            default:
                // Unsupported layer for the GpuFsa backend
                continue;
        }

        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn