//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "GpuFsaBackend.hpp"
#include "GpuFsaBackendContext.hpp"
#include "GpuFsaBackendDefaultAllocator.hpp"
#include "GpuFsaBackendId.hpp"
#include "GpuFsaLayerSupport.hpp"
#include "GpuFsaTensorHandleFactory.hpp"
#include "GpuFsaWorkloadFactory.hpp"

#include <armnn/backends/IBackendContext.hpp>
#include <armnn/backends/IMemoryManager.hpp>
#include <aclCommon/BaseMemoryManager.hpp>
#include <backendsCommon/SubgraphUtils.hpp>
#include <Optimizer.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include "layers/GpuFsaBatchMatMul.hpp"
#include "layers/GpuFsaCast.hpp"
#include "layers/GpuFsaConvolution2d.hpp"
#include "layers/GpuFsaDepthwiseConvolution2d.hpp"
#include "layers/GpuFsaElementwiseBinary.hpp"
#include "layers/GpuFsaPooling2d.hpp"
#include "layers/GpuFsaResize.hpp"
#include "layers/GpuFsaSoftmax.hpp"

namespace armnn
{

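// Deleter for the type-erased pre-compiled blob below: restores the concrete
// type before deleting, so the blob is destroyed correctly through a void*.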
template <typename T>
inline void DeleteAsType(const void* const blob)
{
    delete static_cast<const T*>(blob);
}

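// Helpers for substitution: gather a single layer's input and output slots so
// the layer can be wrapped in a SubgraphView and swapped out as one unit.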
inline SubgraphView::InputSlots CreateInputsFrom(Layer* layer)
{
    SubgraphView::InputSlots result;
    for (auto&& it = layer->BeginInputSlots(); it != layer->EndInputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

inline SubgraphView::OutputSlots CreateOutputsFrom(Layer* layer)
{
    SubgraphView::OutputSlots result;
    for (auto&& it = layer->BeginOutputSlots(); it != layer->EndOutputSlots(); ++it)
    {
        result.push_back(&(*it));
    }
    return result;
}

inline SubgraphView::SubgraphViewPtr CreateSubgraphViewFrom(SubgraphView::InputSlots&& inputs,
                                                            SubgraphView::OutputSlots&& outputs,
                                                            SubgraphView::Layers&& layers)
{
    return std::make_unique<SubgraphView>(std::move(inputs), std::move(outputs), std::move(layers));
}

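// GetIdStatic() returns the identifier ("GpuFsa") used to request this backend.
// A minimal usage sketch, assuming the standard Arm NN runtime API (not part of
// this file):
//
//     armnn::IRuntime::CreationOptions options;
//     armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);
//     armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(
//         *network, {GpuFsaBackend::GetIdStatic()}, runtime->GetDeviceSpec());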
const BackendId& GpuFsaBackend::GetIdStatic()
{
    static const BackendId s_Id{GpuFsaBackendId()};
    return s_Id;
}

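// The memory manager is backed either by a user-registered custom allocator
// (m_UsingCustomAllocator is set when one is supplied, e.g. through
// IRuntime::CreationOptions::m_CustomAllocatorMap) or, by default, by ACL's
// CLBufferAllocator.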
IBackendInternal::IMemoryManagerUniquePtr GpuFsaBackend::CreateMemoryManager() const
{
    if (m_UsingCustomAllocator)
    {
        return std::make_unique<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    return std::make_unique<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    const IBackendInternal::IMemoryManagerSharedPtr& memoryManager) const
{
    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

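// This overload also registers the memory manager and the GpuFsa tensor handle
// factory with the registry, so tensor handles created during optimization and
// workload construction share the same memory manager.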
IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry) const
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

IBackendInternal::IWorkloadFactoryPtr GpuFsaBackend::CreateWorkloadFactory(
    TensorHandleFactoryRegistry& registry,
    const ModelOptions&,
    MemorySourceFlags inputFlags,
    MemorySourceFlags outputFlags) const
{
    // To allow force import when inputFlags/outputFlags are Undefined, default them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);

    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));

    return std::make_unique<GpuFsaWorkloadFactory>(PolymorphicPointerDowncast<GpuFsaMemoryManager>(memoryManager));
}

std::vector<ITensorHandleFactory::FactoryId> GpuFsaBackend::GetHandleFactoryPreferences() const
{
    return std::vector<ITensorHandleFactory::FactoryId> { GpuFsaTensorHandleFactory::GetIdStatic() };
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
{
    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

void GpuFsaBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                                  MemorySourceFlags inputFlags,
                                                  MemorySourceFlags outputFlags)
{
    // To allow force import when inputFlags/outputFlags are Undefined, default them to Malloc
    if (inputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        inputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }
    if (outputFlags == static_cast<MemorySourceFlags>(MemorySource::Undefined))
    {
        outputFlags = static_cast<MemorySourceFlags>(MemorySource::Malloc);
    }

    std::shared_ptr<GpuFsaMemoryManager> memoryManager;
    if (m_UsingCustomAllocator)
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(m_CustomAllocator);
    }
    else
    {
        memoryManager = std::make_shared<GpuFsaMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
    }

    std::unique_ptr<ITensorHandleFactory> factory = std::make_unique<GpuFsaTensorHandleFactory>(memoryManager);
    registry.RegisterMemoryManager(memoryManager);
    registry.RegisterFactory(std::move(factory));
}

IBackendInternal::IBackendContextPtr GpuFsaBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
{
    return IBackendContextPtr{new GpuFsaBackendContext{options}};
}

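// No GpuFsa-specific profiling context is provided yet; returning an empty
// pointer leaves backend profiling disabled for this backend.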
IBackendInternal::IBackendProfilingContextPtr GpuFsaBackend::CreateBackendProfilingContext(
    const IRuntime::CreationOptions&, IBackendProfilingPtr&)
{
    return IBackendProfilingContextPtr{};
}

IBackendInternal::ILayerSupportSharedPtr GpuFsaBackend::GetLayerSupport() const
{
    static ILayerSupportSharedPtr layerSupport{new GpuFsaLayerSupport};
    return layerSupport;
}

std::unique_ptr<ICustomAllocator> GpuFsaBackend::GetDefaultAllocator() const
{
    return std::make_unique<GpuFsaBackendDefaultAllocator>();
}

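// OptimizeSubgraphView walks the subgraph and, for each layer the dynamic
// fusion interface supports, records the operator into a GpuWorkloadSketch,
// wraps the sketch in a PreCompiled layer and substitutes it for the original
// layer. Unsupported layers are left alone and reported as untouched.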
OptimizationViews GpuFsaBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
                                                      const ModelOptions& modelOptions) const
{
    OptimizationViews optimizationViews(modelOptions);

    using namespace arm_compute::experimental::dynamic_fusion;

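    // First pass: record every layer as untouched. Layers that are fused below
    // are erased from this map, so whatever remains can be reported back.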
    auto it = subgraph.end();
    std::map<LayerGuid, Layer*> untouched;
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        untouched.insert({base.GetGuid(), &base});
    }

    GpuFsaLayerSupport supportChecker;
    it = subgraph.end();
    arm_compute::CLCompileContext* compileCtx = &(arm_compute::CLKernelLibrary::get().get_compile_context());

    // Set up the GpuWorkloadContext, which will exist for the lifetime of the graph. This contains the TensorInfos.
    std::shared_ptr<GpuWorkloadContext> workloadContext = std::make_shared<GpuWorkloadContext>(compileCtx);
    while (it != subgraph.begin())
    {
        --it;
        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
        // Create a GpuFsaPreCompiledBlob; this contains all of the information needed to execute an operator.
        GpuFsaPreCompiledBlob* preCompiledBlobPtr = new GpuFsaPreCompiledBlob();
        preCompiledBlobPtr->workloadContext = workloadContext;
        preCompiledBlobPtr->sketch = std::make_unique<GpuWorkloadSketch>(workloadContext.get());

        // Configure and set up the sketch for each supported op. Their data will be wrapped into a PreCompiled layer.
        switch (base.GetType())
        {
            case (LayerType::Cast):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();
                GpuFsaCastCreateOp(preCompiledBlobPtr, input, output);
                break;
            }
            case (LayerType::Convolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const Convolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                bias);
                }
                else
                {
                    GpuFsaConvolution2dCreateOp(preCompiledBlobPtr,
                                                input,
                                                *desc,
                                                weights,
                                                EmptyOptional());
                }
                break;
            }
            case (LayerType::BatchMatMul):
            {
                auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const BatchMatMulDescriptor*>(&base.GetParameters());
                GpuFsaBatchMatMulCreateOp(preCompiledBlobPtr, input0, input1, *desc);
                break;
            }
            case (LayerType::DepthwiseConvolution2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto weights = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();

                auto desc = PolymorphicDowncast<const DepthwiseConvolution2dDescriptor*>(&base.GetParameters());
                if (desc->m_BiasEnabled)
                {
                    auto bias = base.GetInputSlot(2).GetConnectedOutputSlot()->GetTensorInfo();
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         bias);
                }
                else
                {
                    GpuFsaDepthwiseConvolution2dCreateOp(preCompiledBlobPtr,
                                                         input,
                                                         *desc,
                                                         weights,
                                                         EmptyOptional());
                }
                break;
            }
            case (LayerType::ElementwiseBinary):
            {
                auto desc = PolymorphicDowncast<const ElementwiseBinaryDescriptor*>(&base.GetParameters());
                auto input0 = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto input1 = base.GetInputSlot(1).GetConnectedOutputSlot()->GetTensorInfo();
                GpuFsaElementwiseBinaryCreateOp(preCompiledBlobPtr, input0, input1, *desc);
                break;
            }
            case (LayerType::Pooling2d):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const Pooling2dDescriptor*>(&base.GetParameters());
                GpuFsaPooling2dCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Resize):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto desc = PolymorphicDowncast<const ResizeDescriptor*>(&base.GetParameters());
                GpuFsaResizeCreateOp(preCompiledBlobPtr, input, *desc);
                break;
            }
            case (LayerType::Softmax):
            {
                auto input = base.GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
                auto output = base.GetOutputSlot(0).GetTensorInfo();

                auto desc = PolymorphicDowncast<const SoftmaxDescriptor*>(&base.GetParameters());
                GpuFsaSoftmaxCreateOp(preCompiledBlobPtr,
                                      input,
                                      output,
                                      *desc);
                break;
            }
            default:
                // Unsupported layer for the GpuFsa backend
                continue;
        }

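        // Hand ownership of the blob to a type-erased pointer; DeleteAsType restores
        // the concrete type so the blob is deleted correctly inside the PreCompiled layer.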
        auto compiledBlob =
            std::make_unique<PreCompiledObjectPtr>(preCompiledBlobPtr, DeleteAsType<GpuFsaPreCompiledBlob>);

        IConnectableLayer* preCompiledLayer = optimizationViews.GetINetwork()->AddPrecompiledLayer(
            PreCompiledDescriptor(base.GetNumInputSlots(), base.GetNumOutputSlots()),
            std::move(*compiledBlob),
            armnn::Optional<BackendId>(GetId()),
            "GpuFsa_Pre_Compiled_Layer");

        // Copy the output tensor infos from the sub-graph
        for (unsigned int i = 0; i < subgraph.GetNumOutputSlots(); i++)
        {
            preCompiledLayer->GetOutputSlot(i).SetTensorInfo(base.GetOutputSlot(i).GetTensorInfo());
        }

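        // Build a single-layer subgraph around the original layer and substitute
        // the PreCompiled layer in its place.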
        SubgraphView::SubgraphViewPtr substituteSubgraph =
            CreateSubgraphViewFrom(CreateInputsFrom(&base),
                                   CreateOutputsFrom(&base),
                                   {&base});

        optimizationViews.AddSubstitution({ std::move(*substituteSubgraph), SubgraphView(preCompiledLayer) });

        untouched.erase(base.GetGuid());
    }

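    // If nothing was fused, return the whole subgraph untouched; otherwise
    // report only the layers that were not substituted.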
    if (optimizationViews.GetSubstitutions().empty())
    {
        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
    }
    else
    {
        ReportUntouchedLayers(optimizationViews, untouched);
    }

    return optimizationViews;
}

} // namespace armnn