//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaCast.hpp"
#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinaryAdd.hpp"
#include "layers/GpuFsaElementwiseBinarySub.hpp"
#include "layers/GpuFsaPooling2d.hpp"

namespace armnn
{

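// Deleter used with the type-erased PreCompiledObjectPtr blobs created in
// OptimizeSubgraphView: casts the blob back to its concrete type before deleting it.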
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

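// Helpers to gather a layer's input and output slots so a single-layer
// SubgraphView can be built around it for substitution.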
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

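// Wrap the collected slots and layers in a SubgraphView, ready to be passed to AddSubstitution.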
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

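// Prefer a user-registered custom allocator; otherwise fall back to the default
// arm_compute CLBufferAllocator.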
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

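// This overload also registers the memory manager and tensor handle factory with
// the TensorHandleFactoryRegistry so the runtime can resolve them by id later.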
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

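// Profiling is not currently implemented for the GpuFsa backend, so return an empty context.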
IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

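    // Remember every layer in the subgraph; each layer that is successfully replaced
    // is erased again below, and whatever remains is reported as untouched.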
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext, which will exist for the lifetime of the graph; it holds the TensorInfos
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);

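    // Walk the subgraph in reverse, lowering each supported layer to a dynamic fusion
    // sketch that is then wrapped in a PreCompiled layer.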
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob; this contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and set up the sketch for each supported op. Its data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case (LayerType::Cast):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();
                GpuFsaCastCreateOp(preCompiledBlobPtr, input, output);
                break;
            }
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
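            // Only Add and Sub have dynamic fusion support here; other binary operations
            // are expected to have been rejected earlier by GpuFsaLayerSupport.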
            case LayerType::ElementwiseBinary:
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                if (desc->m_Operation == BinaryOperation::Add)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinaryAddCreateOp(preCompiledBlobPtr, input0, input1);
                }
                else if (desc->m_Operation == BinaryOperation::Sub)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinarySubCreateOp(preCompiledBlobPtr, input0, input1);
                }
                break;
            }
            case (LayerType::Pooling2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const Pooling2dDescriptor*>(&base.GetParameters());
                GpuFsaPooling2dCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            default:
                // Unsupported layer for the GpuFsa backend: free the unused blob and leave the layer in place
                delete preCompiledBlobPtr;
                continue;
        }

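        // Hand the raw blob to a unique_ptr carrying the matching typed deleter;
        // the PreCompiled layer created below takes ownership of it.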
        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from the sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

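        // Build a single-layer subgraph around the original layer and substitute it
        // with the PreCompiled layer in the optimization views.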
        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

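    // If nothing was substituted, return the whole subgraph untouched; otherwise
    // report any layers that were not replaced.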
    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn