//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinaryAdd.hpp"
#include "layers/GpuFsaElementwiseBinarySub.hpp"
#include "layers/GpuFsaPooling2d.hpp"

namespace armnn
{

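// Deleter used with the type-erased PreCompiledObjectPtr below: casts the opaque blob
// pointer back to its concrete type before deleting it.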
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

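// Collects every input slot of the given layer so it can be wrapped in a single-layer SubgraphView.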
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

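// Collects every output slot of the given layer, mirroring CreateInputsFrom.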
inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

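// Builds the SubgraphView describing the region of the graph that a PreCompiled layer will replace.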
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

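// Static identifier returned by GpuFsaBackendId(), used to register and select this backend.
// Typical selection at optimize time looks roughly like this (sketch, assuming an existing
// IRuntimePtr runtime and INetworkPtr network):
//     std::vector<armnn::BackendId> backends = { armnn::GpuFsaBackend::GetIdStatic() };
//     auto optNet = armnn::Optimize(*network, backends, runtime->GetDeviceSpec());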
const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

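// Memory manager creation: prefer a user-supplied custom allocator, otherwise fall back to
// Compute Library's CLBufferAllocator.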
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

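// Variant that also builds the tensor handle factory and registers both it and the memory
// manager with the supplied registry.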
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

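// Variant taking memory source flags for import/export; Undefined flags are treated as
// Malloc so that forced import remains possible.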
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow forced import when inputFlags/outputFlags are Undefined, treat them as Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

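// Registers the GpuFsa tensor handle factory and its memory manager with the registry,
// mirroring the setup performed in CreateWorkloadFactory.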
void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow forced import when inputFlags/outputFlags are Undefined, treat them as Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

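// Walks the sub-graph back to front and, for each layer this backend supports, records a
// dynamic fusion sketch in a GpuFsaPreCompiledBlob, then substitutes the layer with a
// PreCompiled layer carrying that blob.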
OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

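    // Track every layer in the sub-graph; entries are erased as layers are fused, so whatever
    // remains at the end is reported back as untouched.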
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Setup the GpuWorkloadContext which will exist for the lifetime of the Graph. This contains the TensorInfos
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
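    // Second pass over the sub-graph: build one sketch per supported layer.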
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob; this contains all of the information needed to execute an operator
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and setup the sketch for each supported op. Their data will be wrapped into a PreCompiled layer
        switch (base.GetType())
        {
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
            case LayerType::ElementwiseBinary:
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                if (desc->m_Operation == BinaryOperation::Add)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinaryAddCreateOp(preCompiledBlobPtr, input0, input1);
                }
                else if (desc->m_Operation == BinaryOperation::Sub)
                {
                    auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                    auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                    GpuFsaElementwiseBinarySubCreateOp(preCompiledBlobPtr, input0, input1);
                }
                break;
            }
            case (LayerType::Pooling2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const Pooling2dDescriptor*>(&base.GetParameters());
                GpuFsaPooling2dCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            default:
                // unsupported layer for GpuFsa backend
                continue;
        }

        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from the sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

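    // If nothing was fused, hand the whole sub-graph back untouched; otherwise report the
    // layers that were not substituted so the optimizer can handle them elsewhere.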
    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn