//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaActivation.hpp"
#include "layers/GpuFsaBatchMatMul.hpp"
#include "layers/GpuFsaCast.hpp"
#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinary.hpp"
#include "layers/GpuFsaPooling2d.hpp"
#include "layers/GpuFsaResize.hpp"
#include "layers/GpuFsaSoftmax.hpp"

namespace armnn
{
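
// Deleter for objects held behind opaque void pointers: casts back to the concrete type so the right
// destructor runs. Used below as the custom deleter for the PreCompiledObjectPtr wrapping a GpuFsaPreCompiledBlob.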
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}
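
// Collects pointers to all of a layer's input slots, ready for wrapping the layer in a single-layer SubgraphView.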
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}
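
// Collects pointers to all of a layer's output slots; the output-side counterpart of CreateInputsFrom.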
inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}
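
// Builds a SubgraphView from the given slots and layers; used below to describe the region replaced by a fused layer.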
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}
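
// Returns the backend's static identifier; constructed once on first use and shared by all callers.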
const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}
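
// The memory manager is backed by the registered custom allocator if one exists, otherwise by ACL's CLBufferAllocator.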
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}
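
// Creates a workload factory that shares an externally owned memory manager.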
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}
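
// Creates and registers a memory manager and tensor handle factory, then returns a workload factory built on them.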
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}
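
// As above, but also normalises Undefined input/output import flags so that force import remains possible.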
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc.
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}
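
// The GpuFsa backend currently offers a single tensor handle factory, so the preference list has one entry.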
std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}
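
// Registers this backend's memory manager and tensor handle factory with the given registry.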
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}
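
// As above, but additionally normalises Undefined input/output import flags to Malloc.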
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc.
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}
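
// Creates the backend context that holds backend-wide state for the lifetime of the runtime.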
IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}
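
// Profiling is not supported by this backend yet, so an empty profiling context is returned.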
IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}
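
// The layer support object is stateless, so one shared instance is reused across all calls.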
IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}
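
// Returns the allocator used when the application has not registered a custom allocator.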
std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}
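
// Walks the subgraph and, for each operator the GpuFsa backend supports, records the op in a dynamic-fusion
// GpuWorkloadSketch, then substitutes the original layer with a PreCompiled layer wrapping the resulting
// GpuFsaPreCompiledBlob. Unsupported layers are left untouched and reported back to the optimizer.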
OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;
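
    // Track every layer in the subgraph; fused layers are erased below so only genuinely untouched ones remain.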
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext, which will exist for the lifetime of the graph. This contains the TensorInfos.
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob; this contains all of the information needed to execute an operator.
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and set up the sketch for each supported op. Their data will be wrapped into a PreCompiled layer.
        switch (base.GetType())
        {
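            // Each supported case reads the layer's TensorInfos and descriptor and appends the op to the sketch.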
            case (LayerType::Activation):
            {
                auto desc = PolymorphicDowncast<const ActivationDescriptor*>(&base.GetParameters());
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                GpuFsaActivationCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Cast):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();
                GpuFsaCastCreateOp(preCompiledBlobPtr, input, output);
                break;
            }
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case (LayerType::BatchMatMul):
            {
                auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const BatchMatMulDescriptor*>(&base.GetParameters());
                GpuFsaBatchMatMulCreateOp(preCompiledBlobPtr, input0, input1, *desc);
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
            case (LayerType::ElementwiseBinary):
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();
                GpuFsaElementwiseBinaryCreateOp(preCompiledBlobPtr, input0, input1, *desc);
                break;
            }
            case (LayerType::Pooling2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const Pooling2dDescriptor*>(&base.GetParameters());
                GpuFsaPooling2dCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Resize):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const ResizeDescriptor*>(&base.GetParameters());
                GpuFsaResizeCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Softmax):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();

                auto desc = PolymorphicDowncast<const SoftmaxDescriptor*>(&base.GetParameters());
                GpuFsaSoftmaxCreateOp(preCompiledBlobPtr,
                                      input,
                                      output,
                                      *desc);
                break;
            }
            default:
                // Unsupported layer for the GpuFsa backend.
                continue;
        }

        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from the sub-graph.
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }
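
        // Describe the original layer as a single-layer subgraph and substitute the PreCompiled layer for it.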
        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }
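
    // If nothing was fused, hand back the whole subgraph untouched; otherwise report the layers that remain.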
    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn