//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaCast.hpp"
#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinaryAdd.hpp"
#include "layers/GpuFsaElementwiseBinarySub.hpp"
#include "layers/GpuFsaPooling2d.hpp"
#include "layers/GpuFsaResize.hpp"

namespace armnn
{

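// Deleter for the type-erased blob held in a PreCompiledObjectPtr: restores the concrete
// type T before deleting, so the correct destructor runs.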
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

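// Collects pointers to all of a layer's input slots, in slot order, for building a SubgraphView.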
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

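// Collects pointers to all of a layer's output slots, in slot order, for building a SubgraphView.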
inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

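// Wraps the given slots and layers in a new SubgraphView, transferring ownership of the containers.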
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

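// Returns the statically allocated BackendId identifying the GpuFsa backend.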
const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

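// Creates the memory manager, backed by the registered custom allocator if there is one,
// otherwise by a CLBufferAllocator.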
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

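// Creates a workload factory that shares an existing memory manager.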
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

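// Creates a workload factory, registering the backend's memory manager and tensor handle
// factory with the given registry.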
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

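// As above, but also normalises the import/export MemorySourceFlags: Undefined is treated
// as Malloc so that force-import can still be attempted.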
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

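// The GpuFsa backend currently exposes a single tensor handle factory preference.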
std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

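// Registers the backend's memory manager and tensor handle factory with the given registry.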
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

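// As above, but also normalises the import/export MemorySourceFlags: Undefined is treated
// as Malloc so that force-import can still be attempted.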
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

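// Creates the backend context, which manages the CL runtime state for the lifetime of the runtime.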
IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

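// No profiling context is provided for GpuFsa; an empty pointer is returned.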
IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

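// Returns the shared layer support object used to query which layers GpuFsa can handle.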
IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

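// Returns the default allocator used when no custom allocator has been registered.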
std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

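// Substitutes each supported layer in the subgraph with a PreCompiled layer wrapping a fused
// dynamic-fusion workload sketch. A minimal usage sketch (illustrative only, not part of this
// file): callers normally reach this indirectly by optimizing a network for this backend, e.g.
//
//     std::vector<BackendId> backends = { GpuFsaBackend::GetIdStatic() };
//     IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());
//
// Optimize() then invokes OptimizeSubgraphView() on every subgraph assigned to GpuFsa.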
OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

    // Record every layer in the subgraph as untouched; supported layers are erased as they are substituted
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext which will exist for the lifetime of the Graph. This contains the TensorInfos
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob, which contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and set up the sketch for each supported op. Their data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case (LayerType::Cast):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();
                GpuFsaCastCreateOp(preCompiledBlobPtr, input, output);
                break;
            }
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
            case LayerType::ElementwiseBinary:
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                if (desc->m_Operation == BinaryOperation::Add)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinaryAddCreateOp(preCompiledBlobPtr, input0, input1);
                }
                else if (desc->m_Operation == BinaryOperation::Sub)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinarySubCreateOp(preCompiledBlobPtr, input0, input1);
                }
                break;
            }
            case (LayerType::Pooling2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const Pooling2dDescriptor*>(&base.GetParameters());
                GpuFsaPooling2dCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Resize):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const ResizeDescriptor*>(&base.GetParameters());
                GpuFsaResizeCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            default:
                // Unsupported layer for the GpuFsa backend
                continue;
        }

        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn