//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaCast.hpp"
#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinaryAdd.hpp"
#include "layers/GpuFsaElementwiseBinarySub.hpp"
#include "layers/GpuFsaPooling2d.hpp"

namespace armnn
{

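// Deleter used with the type-erased PreCompiledObjectPtr blobs created in
// OptimizeSubgraphView: casts the blob back to its concrete type before deleting it.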
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

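// Helpers to gather a layer's input and output slots so a single-layer
// SubgraphView can be built around it for substitution.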
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

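// Wrap the collected slots and layers in a SubgraphView, ready to be passed to AddSubstitution.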
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

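// Prefer a user-registered custom allocator; otherwise fall back to the default
// arm_compute CLBufferAllocator.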
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

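// This overload also registers the memory manager and tensor handle factory with
// the TensorHandleFactoryRegistry so the runtime can resolve them by id later.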
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

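// Profiling is not currently implemented for the GpuFsa backend, so return an empty context.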
IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

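    // Remember every layer in the subgraph; each layer that is successfully replaced
    // is erased again below, and whatever remains is reported as untouched.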
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext, which will exist for the lifetime of the graph; it holds the TensorInfos
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);

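    // Walk the subgraph in reverse, lowering each supported layer to a dynamic fusion
    // sketch that is then wrapped in a PreCompiled layer.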
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob; this contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and set up the sketch for each supported op. Its data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case (LayerType::Cast):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();
                GpuFsaCastCreateOp(preCompiledBlobPtr, input, output);
                break;
            }
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
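            // Only Add and Sub have dynamic fusion support here; other binary operations
            // are expected to have been rejected earlier by GpuFsaLayerSupport.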
            case LayerType::ElementwiseBinary:
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                if (desc->m_Operation == BinaryOperation::Add)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinaryAddCreateOp(preCompiledBlobPtr, input0, input1);
                }
                else if (desc->m_Operation == BinaryOperation::Sub)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinarySubCreateOp(preCompiledBlobPtr, input0, input1);
                }
                break;
            }
            case (LayerType::Pooling2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const Pooling2dDescriptor*>(&base.GetParameters());
                GpuFsaPooling2dCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            default:
                // Unsupported layer for the GpuFsa backend: free the unused blob and leave the layer in place
                delete preCompiledBlobPtr;
                continue;
        }

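        // Hand the raw blob to a unique_ptr carrying the matching typed deleter;
        // the PreCompiled layer created below takes ownership of it.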
        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from the sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

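        // Build a single-layer subgraph around the original layer and substitute it
        // with the PreCompiled layer in the optimization views.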
        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

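    // If nothing was substituted, return the whole subgraph untouched; otherwise
    // report any layers that were not replaced.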
    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn