//
// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once

#include <armnn/backends/IBackendInternal.hpp>
#include <aclCommon/BaseMemoryManager.hpp>

#include <arm_compute/runtime/CL/CLBufferAllocator.h>
#include <arm_compute/runtime/CL/CLMemoryRegion.h>
#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <CL/cl_ext.h>
#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>

// System includes for mapping and unmapping memory
#include <sys/mman.h>

namespace armnn
{

/**
 * A structure containing all the elements needed to execute a fused workload in the GpuFsa backend.
 *
 * @param[in, out] sketch            A unique pointer to the sketch containing the operators which have been fused.
 * @param[in, out] workloadContext   A shared pointer to the GpuWorkloadContext which creates and stores the
 *                                   TensorInfos used by the sketch.
 * @param[in, out] inputTensorInfos  A unique pointer to a vector of input TensorInfos used by the sketch.
 * @param[in, out] outputTensorInfos A unique pointer to a vector of output TensorInfos used by the sketch.
 */
struct GpuFsaPreCompiledBlob
{
    std::unique_ptr<arm_compute::experimental::dynamic_fusion::GpuWorkloadSketch> sketch = nullptr;
    std::shared_ptr<arm_compute::experimental::dynamic_fusion::GpuWorkloadContext> workloadContext = nullptr;

    std::unique_ptr<std::vector<arm_compute::ITensorInfo*>> inputTensorInfos = nullptr;
    std::unique_ptr<std::vector<arm_compute::ITensorInfo*>> outputTensorInfos = nullptr;
};

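// A minimal sketch of how a GpuFsaPreCompiledBlob might be populated. This is an illustration
// only: the dynamic fusion calls are assumptions based on the Compute Library API included
// above, not something this header mandates.
//
//     using namespace arm_compute::experimental::dynamic_fusion;
//     auto& compileContext = arm_compute::CLKernelLibrary::get().get_compile_context();
//     GpuFsaPreCompiledBlob blob;
//     blob.workloadContext   = std::make_shared<GpuWorkloadContext>(&compileContext);
//     blob.sketch            = std::make_unique<GpuWorkloadSketch>(blob.workloadContext.get());
//     blob.inputTensorInfos  = std::make_unique<std::vector<arm_compute::ITensorInfo*>>();
//     blob.outputTensorInfos = std::make_unique<std::vector<arm_compute::ITensorInfo*>>();
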
// Add new capabilities here.
const BackendCapabilities gpuFsaCapabilities("GpuFsa",
                                             {
                                                     {"NonConstWeights", false},
                                                     {"AsyncExecution", false},
                                                     {"ProtectedContentAllocation", false},
                                                     {"ConstantTensorsAsInputs", true},
                                                     {"PreImportIOTensors", false},
                                                     {"ExternallyManagedMemory", false},
                                                     {"MultiAxisPacking", false},
                                                     {"SingleAxisPacking", false}
                                             });

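// Individual capabilities can be queried by name. A minimal example, assuming the
// armnn::HasCapability helper from armnn/BackendHelper.hpp (illustration only):
//
//     bool constAsInputs = armnn::HasCapability("ConstantTensorsAsInputs", gpuFsaCapabilities);
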
class GpuFsaBackend : public IBackendInternal
{
public:
    GpuFsaBackend() : m_CustomAllocator(nullptr) {}
    GpuFsaBackend(std::shared_ptr<ICustomAllocator> allocator)
    {
        UseCustomMemoryAllocator(allocator, armnn::EmptyOptional());
    }
    ~GpuFsaBackend() = default;

    static const BackendId& GetIdStatic();
    const BackendId& GetId() const override { return GetIdStatic(); }

    IBackendInternal::IMemoryManagerUniquePtr CreateMemoryManager() const override;

    IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(
        const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override;

    IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(TensorHandleFactoryRegistry& registry) const override;

    IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry,
                                              const ModelOptions& modelOptions,
                                              MemorySourceFlags inputFlags,
                                              MemorySourceFlags outputFlags) const override;

    std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;

    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override;

    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
                                       MemorySourceFlags inputFlags,
                                       MemorySourceFlags outputFlags) override;

    IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override;
    IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext(
        const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override;

    IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override;

    OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph,
                                           const ModelOptions& modelOptions) const override;

    std::unique_ptr<ICustomAllocator> GetDefaultAllocator() const override;

    BackendCapabilities GetCapabilities() const override
    {
        return gpuFsaCapabilities;
    }

    virtual bool UseCustomMemoryAllocator(std::shared_ptr<ICustomAllocator> allocator,
                                          armnn::Optional<std::string&>) override
    {
        ARMNN_LOG(info) << "Using Custom Allocator for GpuFsaBackend";

        // Set flag to signal the backend to use a custom memory allocator
        m_CustomAllocator = std::make_shared<GpuFsaBackendCustomAllocatorWrapper>(std::move(allocator));
        m_UsingCustomAllocator = true;
        return m_UsingCustomAllocator;
    }

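    // In practice a custom allocator is usually supplied through the runtime's creation options
    // rather than by calling UseCustomMemoryAllocator() directly. A minimal sketch, assuming a
    // user-written MyCustomAllocator that implements ICustomAllocator (illustration only):
    //
    //     armnn::IRuntime::CreationOptions options;
    //     options.m_CustomAllocatorMap["GpuFsa"] = std::make_shared<MyCustomAllocator>();
    //     armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);
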
    // CL requires an arm_compute::IAllocator, so we wrap the Arm NN ICustomAllocator to achieve this
    class GpuFsaBackendCustomAllocatorWrapper : public arm_compute::IAllocator
    {
    public:
        GpuFsaBackendCustomAllocatorWrapper(std::shared_ptr<ICustomAllocator> alloc) : m_CustomAllocator(alloc)
        {}
        // Inherited methods overridden:
        void* allocate(size_t size, size_t alignment) override
        {
            auto alloc = m_CustomAllocator->allocate(size, alignment);
            return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType());
        }
        void free(void* ptr) override
        {
            auto hostMemPtr = m_AllocatedBufferMappings[ptr];
            clReleaseMemObject(static_cast<cl_mem>(ptr));
            m_CustomAllocator->free(hostMemPtr);
            // Erase the stale entry so a future cl_mem allocated at the same address cannot alias it
            m_AllocatedBufferMappings.erase(ptr);
        }
        std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override
        {
            auto hostMemPtr = m_CustomAllocator->allocate(size, alignment);
            cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType());

            return std::make_unique<ClBackendCustomAllocatorMemoryRegion>(cl::Buffer(buffer),
                                                                          hostMemPtr,
                                                                          m_CustomAllocator->GetMemorySourceType());
        }
    private:
        cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source)
        {
            // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE
            auto cachelineAlignment =
                arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
            auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment);
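            // Note: this always yields a cacheline multiple of at least size; when size is already
            // aligned it adds one full extra cacheline of padding.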

            if (source == MemorySource::Malloc)
            {
                const cl_import_properties_arm importProperties[] =
                {
                    CL_IMPORT_TYPE_ARM,
                    CL_IMPORT_TYPE_HOST_ARM,
                    0
                };
                cl_int error = CL_SUCCESS;
                cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
                                                  CL_MEM_READ_WRITE,
                                                  importProperties,
                                                  memory,
                                                  roundedSize,
                                                  &error);
                if (error == CL_SUCCESS)
                {
                    m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));
                    return buffer;
                }
                throw armnn::Exception(
                    "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error));
            }
            else if (source == MemorySource::DmaBuf)
            {
                const cl_import_properties_arm importProperties[] =
                {
                    CL_IMPORT_TYPE_ARM,
                    CL_IMPORT_TYPE_DMA_BUF_ARM,
                    CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM,
                    CL_TRUE,
                    0
                };
                cl_int error = CL_SUCCESS;
                cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
                                                  CL_MEM_READ_WRITE,
                                                  importProperties,
                                                  memory,
                                                  roundedSize,
                                                  &error);
                if (error == CL_SUCCESS)
                {
                    m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));
                    return buffer;
                }
                throw armnn::Exception(
                    "Mapping allocated memory from CustomMemoryAllocator failed, errcode: "
                    + std::to_string(error));
            }
            else if (source == MemorySource::DmaBufProtected)
            {
                const cl_import_properties_arm importProperties[] =
                {
                    CL_IMPORT_TYPE_ARM,
                    CL_IMPORT_TYPE_DMA_BUF_ARM,
                    CL_IMPORT_TYPE_PROTECTED_ARM,
                    CL_TRUE,
                    0
                };
                cl_int error = CL_SUCCESS;
                cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
                                                  CL_MEM_READ_WRITE,
                                                  importProperties,
                                                  memory,
                                                  roundedSize,
                                                  &error);
                if (error == CL_SUCCESS)
                {
                    m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));
                    return buffer;
                }
                throw armnn::Exception(
                    "Mapping allocated memory from CustomMemoryAllocator failed, errcode: "
                    + std::to_string(error));
            }
            throw armnn::Exception(
                "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator");
        }
        std::shared_ptr<ICustomAllocator> m_CustomAllocator;
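        // Maps each imported cl_mem object back to the host allocation it wraps, so free() can
        // release both the CL buffer and the underlying custom allocation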
        std::map<void*, void*> m_AllocatedBufferMappings;
    };

    // A specialisation of ICLMemoryRegion which also holds the host memory pointer, to allow for CPU copy access
    class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion
    {
    public:
        ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr, armnn::MemorySource source)
            : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
        {
            _mem = buffer;
            m_HostMemPtr = hostMemPtr;
            m_MemorySource = source;
        }

        // Inherited methods overridden:
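        // ptr() deliberately returns nullptr: this region wraps imported memory, so host-side
        // access has to go through map()/unmap() below rather than through a raw pointer.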
        void* ptr() override
        {
            return nullptr;
        }

        void* map(cl::CommandQueue &q, bool blocking) override
        {
            armnn::IgnoreUnused(q, blocking);
            if (m_HostMemPtr == nullptr)
            {
                throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr");
            }
            if (_mapping != nullptr)
            {
                throw armnn::Exception("ClBackend: Attempting to map memory which has not yet been unmapped");
            }
            switch (m_MemorySource)
            {
                case armnn::MemorySource::Malloc:
                    _mapping = m_HostMemPtr;
                    return _mapping;
                case armnn::MemorySource::DmaBuf:
                case armnn::MemorySource::DmaBufProtected:
                    // For a DmaBuf source the host pointer points to an integer holding the file descriptor
                    _mapping = mmap(NULL, _size, PROT_WRITE, MAP_SHARED, *(reinterpret_cast<int*>(m_HostMemPtr)), 0);
                    if (_mapping == MAP_FAILED)
                    {
                        throw armnn::Exception("ClBackend: mmap of DmaBuf file descriptor failed");
                    }
                    return _mapping;
                default:
                    throw armnn::Exception("ClBackend: Attempting to map imported memory without a valid source");
            }
        }

        void unmap(cl::CommandQueue &q) override
        {
            armnn::IgnoreUnused(q);
            switch (m_MemorySource)
            {
                case armnn::MemorySource::Malloc:
                    _mapping = nullptr;
                    break;
                case armnn::MemorySource::DmaBuf:
                case armnn::MemorySource::DmaBufProtected:
                    munmap(_mapping, _size);
                    _mapping = nullptr;
                    break;
                default:
                    throw armnn::Exception("ClBackend: Attempting to unmap imported memory without a valid source");
            }
        }
    private:
        void* m_HostMemPtr = nullptr;
        armnn::MemorySource m_MemorySource;
    };

    std::shared_ptr<GpuFsaBackendCustomAllocatorWrapper> m_CustomAllocator;
    bool m_UsingCustomAllocator = false;
};

} // namespace armnn