//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinaryAdd.hpp"
#include "layers/GpuFsaElementwiseBinarySub.hpp"

namespace armnn
{

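// Deleter for the type-erased pre-compiled blob: casts the void* back to its
// concrete type so the correct destructor runs. Used as the custom deleter of
// the PreCompiledObjectPtr created in OptimizeSubgraphView below.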
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

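// Gathers all input slots of a layer so a single-layer SubgraphView can be built around it.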
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

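// Builds a SubgraphView that owns the given slots and layers; used below to
// describe the single layer being replaced by a PreCompiled layer.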
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

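// Returns a memory manager backed by the user-registered custom allocator when
// one exists, otherwise by the default Compute Library CLBufferAllocator.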
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import when inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import when inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

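    // Record every layer in the subgraph up front; layers that get fused are
    // erased from this map below, so whatever remains is reported as untouched.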
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext which will exist for the lifetime of the Graph. This contains the TensorInfos.
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
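
    // Walk the subgraph from back to front, creating a compiled sketch for each
    // layer the GpuFsa backend supports.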
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob; this contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and set up the sketch for each supported op. Their data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
            case LayerType::ElementwiseBinary:
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                if (desc->m_Operation == BinaryOperation::Add)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinaryAddCreateOp(preCompiledBlobPtr, input0, input1);
                }
                else if (desc->m_Operation == BinaryOperation::Sub)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinarySubCreateOp(preCompiledBlobPtr, input0, input1);
                }
                break;
            }
            default:
                // unsupported layer for GpuFsa backend
                continue;
        }

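        // Wrap the raw blob in a unique_ptr whose deleter restores the concrete
        // type, so ownership can be handed to the PreCompiled layer.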
        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

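        // Substitute the original layer with the PreCompiled layer that now
        // carries the compiled sketch.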
        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

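    // If no layer was substituted, hand the whole subgraph back untouched;
    // otherwise report only the layers that were not fused.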
    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn