//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once

#include <armnn/backends/IBackendInternal.hpp>
#include <armnn/Exception.hpp>
#include <armnn/Logging.hpp>
#include <armnn/utility/IgnoreUnused.hpp>
#include <aclCommon/BaseMemoryManager.hpp>

#include <arm_compute/runtime/CL/CLBufferAllocator.h>
#include <arm_compute/runtime/CL/CLMemoryRegion.h>
#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <CL/cl_ext.h>
#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>

#include <map>
#include <memory>
#include <vector>

// System includes for mapping and unmapping memory
#include <sys/mman.h>

namespace armnn
{

/**
 * A structure containing all the elements needed to execute a fused workload in the GpuFsa backend.
 *
 * @param[in, out] sketch            A unique pointer to the sketch containing the operators which have been fused.
 * @param[in, out] workloadContext   A shared pointer to the GpuWorkloadContext which creates and stores the
 *                                   TensorInfos.
 * @param[in, out] inputTensorInfos  A unique pointer to a vector of input TensorInfos used by the sketch.
 * @param[in, out] outputTensorInfos A unique pointer to a vector of output TensorInfos used by the sketch.
 */
struct GpuFsaPreCompiledBlob
{
    std::unique_ptr<arm_compute::experimental::dynamic_fusion::GpuWorkloadSketch> sketch = nullptr;
    std::shared_ptr<arm_compute::experimental::dynamic_fusion::GpuWorkloadContext> workloadContext = nullptr;

    std::unique_ptr<std::vector<arm_compute::ITensorInfo*>> inputTensorInfos = nullptr;
    std::unique_ptr<std::vector<arm_compute::ITensorInfo*>> outputTensorInfos = nullptr;
};
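
// Illustrative sketch of how a GpuFsaPreCompiledBlob is typically populated (e.g. during
// OptimizeSubgraphView); the exact construction lives in the backend implementation:
//
//     auto& compileContext = arm_compute::CLKernelLibrary::get().get_compile_context();
//     GpuFsaPreCompiledBlob blob;
//     blob.workloadContext = std::make_shared<
//         arm_compute::experimental::dynamic_fusion::GpuWorkloadContext>(&compileContext);
//     blob.sketch = std::make_unique<
//         arm_compute::experimental::dynamic_fusion::GpuWorkloadSketch>(blob.workloadContext.get());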

// Add new capabilities here...
const BackendCapabilities gpuFsaCapabilities("GpuFsa",
                                             {
                                                 {"NonConstWeights", false},
                                                 {"AsyncExecution", false},
                                                 {"ProtectedContentAllocation", false},
                                                 {"ConstantTensorsAsInputs", true},
                                                 {"PreImportIOTensors", false},
                                                 {"ExternallyManagedMemory", false},
                                                 {"MultiAxisPacking", false},
                                                 {"SingleAxisPacking", false}
                                             });
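
// A capability can be queried at runtime; HasCapability is declared in armnn/BackendHelper.hpp:
//
//     bool constTensorsAsInputs = armnn::HasCapability("ConstantTensorsAsInputs", gpuFsaCapabilities);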

class GpuFsaBackend : public IBackendInternal
{
public:
    GpuFsaBackend() : m_CustomAllocator(nullptr) {}
    GpuFsaBackend(std::shared_ptr<ICustomAllocator> allocator)
    {
        UseCustomMemoryAllocator(allocator, armnn::EmptyOptional());
    }
    ~GpuFsaBackend() = default;

    static const BackendId& GetIdStatic();
    const BackendId& GetId() const override { return GetIdStatic(); }

    IBackendInternal::IMemoryManagerUniquePtr CreateMemoryManager() const override;

    IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(
        const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override;

    IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(TensorHandleFactoryRegistry& registry) const override;

    IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry,
                                              const ModelOptions& modelOptions,
                                              MemorySourceFlags inputFlags,
                                              MemorySourceFlags outputFlags) const override;

    std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;

    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override;

    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                       MemorySourceFlags inputFlags,
                                       MemorySourceFlags outputFlags) override;

    IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override;
    IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext(
        const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override;

    IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override;

    OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph,
                                           const ModelOptions& modelOptions) const override;

    std::unique_ptr<ICustomAllocator> GetDefaultAllocator() const override;

    BackendCapabilities GetCapabilities() const override
    {
        return gpuFsaCapabilities;
    }

    virtual bool UseCustomMemoryAllocator(std::shared_ptr<ICustomAllocator> allocator,
                                          armnn::Optional<std::string&>) override
    {
        ARMNN_LOG(info) << "Using Custom Allocator for GpuFsaBackend";

        // Set flag to signal the backend to use a custom memory allocator
        m_CustomAllocator = std::make_shared<GpuFsaBackendCustomAllocatorWrapper>(std::move(allocator));
        m_UsingCustomAllocator = true;
        return m_UsingCustomAllocator;
    }
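
    // Illustrative sketch of how a custom allocator reaches this backend. MyDmaBufAllocator is a
    // hypothetical ICustomAllocator implementation; registration happens through the runtime options:
    //
    //     armnn::IRuntime::CreationOptions options;
    //     options.m_CustomAllocatorMap["GpuFsa"] = std::make_shared<MyDmaBufAllocator>();
    //     armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);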

    // CL requires an arm_compute::IAllocator, so we wrap the Arm NN ICustomAllocator to achieve this
    class GpuFsaBackendCustomAllocatorWrapper : public arm_compute::IAllocator
    {
    public:
        GpuFsaBackendCustomAllocatorWrapper(std::shared_ptr<ICustomAllocator> alloc) : m_CustomAllocator(alloc)
        {}

        // Inherited methods overridden:
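        // Note: allocate() returns the imported cl_mem handle cast to void*, not a host pointer.
        // The host pointer is recorded in m_AllocatedBufferMappings so that free() can release
        // both the CL memory object and the underlying custom allocation.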
        void* allocate(size_t size, size_t alignment) override
        {
            auto alloc = m_CustomAllocator->allocate(size, alignment);
            return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType());
        }
        void free(void* ptr) override
        {
            auto hostMemPtr = m_AllocatedBufferMappings[ptr];
            clReleaseMemObject(static_cast<cl_mem>(ptr));
            m_CustomAllocator->free(hostMemPtr);
            // Drop the stale entry so the mapping table does not grow unboundedly
            m_AllocatedBufferMappings.erase(ptr);
        }
        std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override
        {
            auto hostMemPtr = m_CustomAllocator->allocate(size, alignment);
            cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType());

            return std::make_unique<ClBackendCustomAllocatorMemoryRegion>(cl::Buffer(buffer),
                                                                          hostMemPtr,
                                                                          m_CustomAllocator->GetMemorySourceType());
        }
    private:
        cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source)
        {
            // Round the size of the buffer up to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE
            // (e.g. with a 64-byte cacheline, a 100-byte request is rounded to 128 bytes)
            auto cachelineAlignment =
                    arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
            auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment);

            // Only the import properties differ between the supported MemorySource types
            std::vector<cl_import_properties_arm> importProperties;
            switch (source)
            {
                case MemorySource::Malloc:
                    importProperties = { CL_IMPORT_TYPE_ARM,
                                         CL_IMPORT_TYPE_HOST_ARM,
                                         0 };
                    break;
                case MemorySource::DmaBuf:
                    importProperties = { CL_IMPORT_TYPE_ARM,
                                         CL_IMPORT_TYPE_DMA_BUF_ARM,
                                         CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM,
                                         CL_TRUE,
                                         0 };
                    break;
                case MemorySource::DmaBufProtected:
                    importProperties = { CL_IMPORT_TYPE_ARM,
                                         CL_IMPORT_TYPE_DMA_BUF_ARM,
                                         CL_IMPORT_TYPE_PROTECTED_ARM,
                                         CL_TRUE,
                                         0 };
                    break;
                default:
                    throw armnn::Exception(
                        "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator");
            }

            cl_int error = CL_SUCCESS;
            cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
                                              CL_MEM_READ_WRITE,
                                              importProperties.data(),
                                              memory,
                                              roundedSize,
                                              &error);
            if (error != CL_SUCCESS)
            {
                throw armnn::Exception(
                    "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error));
            }

            m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void*>(buffer), memory));
            return buffer;
        }

        std::shared_ptr<ICustomAllocator> m_CustomAllocator;
        std::map<void*, void*> m_AllocatedBufferMappings;
    };

    // We need a new version of ICLMemoryRegion which holds a host memory pointer, to allow for CPU copy access
    class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion
    {
    public:
        ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr, armnn::MemorySource source)
            : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
        {
            _mem = buffer;
            m_HostMemPtr = hostMemPtr;
            m_MemorySource = source;
        }

        // Inherited methods overridden:
        void* ptr() override
        {
            return nullptr;
        }

        void* map(cl::CommandQueue &q, bool blocking) override
        {
            armnn::IgnoreUnused(q, blocking);
            if (m_HostMemPtr == nullptr)
            {
                throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr");
            }
            if (_mapping != nullptr)
            {
                throw armnn::Exception("ClBackend: Attempting to map memory which has not yet been unmapped");
            }
            switch (m_MemorySource)
            {
                case armnn::MemorySource::Malloc:
                    _mapping = m_HostMemPtr;
                    return _mapping;
                case armnn::MemorySource::DmaBuf:
                case armnn::MemorySource::DmaBufProtected:
                    // If the source is a dma_buf then the host memory ptr is expected to point to an
                    // integer value holding the file descriptor
                    _mapping = mmap(NULL, _size, PROT_WRITE, MAP_SHARED, *(reinterpret_cast<int*>(m_HostMemPtr)), 0);
                    return _mapping;
                default:
                    throw armnn::Exception("ClBackend: Attempting to map imported memory without a valid source");
            }
        }

        void unmap(cl::CommandQueue &q) override
        {
            armnn::IgnoreUnused(q);
            switch (m_MemorySource)
            {
                case armnn::MemorySource::Malloc:
                    _mapping = nullptr;
                    break;
                case armnn::MemorySource::DmaBuf:
                case armnn::MemorySource::DmaBufProtected:
                    munmap(_mapping, _size);
                    _mapping = nullptr;
                    break;
                default:
                    throw armnn::Exception("ClBackend: Attempting to unmap imported memory without a valid source");
            }
        }

    private:
        void* m_HostMemPtr = nullptr;
        armnn::MemorySource m_MemorySource;
    };
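
    // Illustrative only: for the DmaBuf sources above, the custom allocator is expected to return
    // a pointer to the stored file descriptor rather than to the buffer itself, e.g. (hypothetical):
    //
    //     int m_Fd;                                         // fd obtained from a dma_buf exporter
    //     void* MyDmaBufAllocator::allocate(size_t, size_t) { return &m_Fd; }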

    std::shared_ptr<GpuFsaBackendCustomAllocatorWrapper> m_CustomAllocator;
    bool m_UsingCustomAllocator = false;
};

} // namespace armnn