//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinaryAdd.hpp"

namespace armnn
{

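// Deleter used with PreCompiledObjectPtr: casts the type-erased blob back to T before deleting it.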
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

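// Collects pointers to all of a layer's input slots, for use when building a SubgraphView.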
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

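// Collects pointers to all of a layer's output slots, for use when building a SubgraphView.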
inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

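// Wraps the given slots and layers in a SubgraphView, ready to be substituted into the graph.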
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

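// Returns the statically allocated identifier for the GpuFsa backend.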
const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

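// Creates the backend memory manager, backed by either the user-supplied custom allocator
// or the default CL buffer allocator.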
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

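// Creates a workload factory that uses an externally supplied memory manager.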
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

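// Creates a memory manager and tensor handle factory, registers both with the registry,
// and returns a workload factory that shares the memory manager.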
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

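// As above, but also normalises the import/export MemorySourceFlags: Undefined is treated
// as Malloc so that force import can still be attempted.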
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

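// Lists the tensor handle factories this backend prefers, in priority order.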
std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

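// Registers this backend's memory manager and tensor handle factory with the registry.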
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

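// Registers the tensor handle factory as above, treating Undefined import/export flags as Malloc
// so that force import can still be attempted.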
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import if inputFlags/outputFlags are Undefined, set them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

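// Creates the GpuFsa backend context from the runtime creation options.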
IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

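// No backend profiling context is provided; an empty pointer is returned.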
IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

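// Returns the shared layer support object used to query which layers this backend can handle.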
IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

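// Returns the default custom allocator for this backend.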
std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

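// Replaces each supported layer in the subgraph (Convolution2d, DepthwiseConvolution2d and
// ElementwiseBinary Add) with a PreCompiled layer whose GpuFsaPreCompiledBlob holds the
// dynamic fusion workload sketch for that operator.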
OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

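    // Record every layer in the subgraph so any that are not substituted can be reported later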
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext, which will exist for the lifetime of the graph. This contains the TensorInfos
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));

        // Create a GpuFsaPreCompiledBlob; this contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and set up the sketch for each supported op. Their data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case LayerType::Convolution2d:
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case LayerType::DepthwiseConvolution2d:
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
            case LayerType::ElementwiseBinary:
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                if (desc->m_Operation == BinaryOperation::Add)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinaryAddCreateOp(preCompiledBlobPtr, input0, input1);
                }
                break;
            }
            default:
                // Unsupported layer for the GpuFsa backend
                continue;
        }

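        // Wrap the blob in a unique_ptr with a type-aware deleter so ownership passes to the PreCompiled layer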
        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

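        // Build a single-layer subgraph for this layer and substitute it with the PreCompiled layer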
        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

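    // If nothing was substituted, hand the whole subgraph back untouched; otherwise report the layers left behind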
    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn