//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaConvolution2d.hpp"
namespace armnn
{

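// Deleter for the type-erased blob held by a PreCompiledObjectPtr: casts the void
// pointer back to its concrete type (here, GpuFsaPreCompiledBlob) before deleting it.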
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

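// The following helpers gather the input and output slots of a single layer so that it
// can be wrapped in a one-layer SubgraphView for substitution in OptimizeSubgraphView.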
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

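// Assembles the gathered slots and layers into a SubgraphView describing the region to replace.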
inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

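// The memory manager is backed either by the user-supplied custom allocator or, by
// default, by the Arm Compute Library's CLBufferAllocator.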
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import when inputFlags/outputFlags are Undefined, default them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import when inputFlags/outputFlags are Undefined, default them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

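// No backend profiling context is provided for GpuFsa; an empty pointer is returned.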
IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

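// OptimizeSubgraphView is invoked for the parts of the graph assigned to this backend
// during armnn::Optimize(). A minimal usage sketch (assuming the caller has already
// created an INetwork `network` and an IRuntime `runtime`):
//
//     std::vector<armnn::BackendId> backends = { armnn::GpuFsaBackend::GetIdStatic() };
//     armnn::IOptimizedNetworkPtr optNet =
//         armnn::Optimize(*network, backends, runtime->GetDeviceSpec());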
OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext which will exist for the lifetime of the graph. This contains the TensorInfos.
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob; this contains all of the information needed to execute an operator.
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

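        // The workload context is shared across every layer of the sub-graph, while each
        // layer receives its own sketch into which its operator is composed.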
        // Configure and set up the sketch for each supported op. Their data will be wrapped into a PreCompiled layer.
        switch (base.GetType())
        {
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            default:
                // Unsupported layer for the GpuFsa backend: free the blob and leave the layer untouched
                delete preCompiledBlobPtr;
                continue;
        }

        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from the sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

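        // Substitute the original layer, wrapped in a single-layer sub-graph view, with the
        // pre-compiled layer that now owns the sketch.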
        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn