IVGCVSW-8157 - Rebase existing GpuFsa patches to 23.11
Squashed commit of the following:

IVGCVSW-7159 Add GpuFsa backend skeleton
IVGCVSW-7380 Update the GpuFsa Skeleton to build and load ACL
IVGCVSW-7381 Add IsLayerSupported implementation to GpuFsa backend
IVGCVSW-7382 Implementation of Conv2d within GpuFsa

Signed-off-by: James Conroy <james.conroy@arm.com>
Signed-off-by: Matthew Sloyan <matthew.sloyan@arm.com>
Signed-off-by: David Monahan <david.monahan@arm.com>
Change-Id: Id23d9ee598535de7b38a99ca223cdf0ad2102cef
diff --git a/src/backends/gpuFsa/GpuFsaBackend.hpp b/src/backends/gpuFsa/GpuFsaBackend.hpp
new file mode 100644
index 0000000..2696006
--- /dev/null
+++ b/src/backends/gpuFsa/GpuFsaBackend.hpp
@@ -0,0 +1,285 @@
+//
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <aclCommon/BaseMemoryManager.hpp>
+
+#include <arm_compute/runtime/CL/CLBufferAllocator.h>
+#include <arm_compute/runtime/CL/CLMemoryRegion.h>
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <CL/cl_ext.h>
+
+// System includes for mapping and unmapping memory
+#include <sys/mman.h>
+
+namespace armnn
+{
+
+// Capabilities returned by GpuFsaBackend::GetCapabilities(); all are currently false. Add new capabilities here.
+const BackendCapabilities gpuFsaCapabilities("GpuFsa",
+                                             {
+                                                     {"NonConstWeights", false},
+                                                     {"AsyncExecution", false},
+                                                     {"ProtectedContentAllocation", false},
+                                                     {"ConstantTensorsAsInputs", false},
+                                                     {"PreImportIOTensors", false},
+                                                     {"ExternallyManagedMemory", false},
+                                                     {"MultiAxisPacking", false},
+                                                     {"SingleAxisPacking", false}
+                                             });
+
+class GpuFsaBackend : public IBackendInternal
+{
+public:
+    GpuFsaBackend() = default; // members default to "no custom allocator installed"
+    GpuFsaBackend(std::shared_ptr<ICustomAllocator> allocator) // installs 'allocator' straight away
+    {
+        UseCustomMemoryAllocator(std::move(allocator), armnn::EmptyOptional());
+    }
+    ~GpuFsaBackend() = default;
+
+    static const BackendId& GetIdStatic();
+    const BackendId& GetId() const override { return GetIdStatic(); } // same id for every instance
+    // The overrides below are only declared here; their definitions are not part of this header.
+    IBackendInternal::IMemoryManagerUniquePtr CreateMemoryManager() const override;
+
+    IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(
+        const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override;
+
+    IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(TensorHandleFactoryRegistry& registry) const override;
+
+    IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry,
+                                              const ModelOptions& modelOptions,
+                                              MemorySourceFlags inputFlags,
+                                              MemorySourceFlags outputFlags) const override;
+
+    std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;
+
+    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override;
+
+    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
+                                       MemorySourceFlags inputFlags,
+                                       MemorySourceFlags outputFlags) override;
+
+    IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override;
+    IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext(
+        const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override;
+
+    IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override;
+
+    OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph,
+                                           const ModelOptions& modelOptions) const override;
+
+    std::unique_ptr<ICustomAllocator> GetDefaultAllocator() const override;
+    // Returns a copy of the namespace-scope gpuFsaCapabilities object.
+    BackendCapabilities GetCapabilities() const override
+    {
+        return gpuFsaCapabilities;
+    };
+
+    // Wraps 'allocator' in a GpuFsaBackendCustomAllocatorWrapper and flags the backend as using a custom
+    // allocator. The optional error-message argument is left untouched; the call always returns true.
+    bool UseCustomMemoryAllocator(std::shared_ptr<ICustomAllocator> allocator,
+                                  armnn::Optional<std::string&>) override
+    {
+        ARMNN_LOG(info) << "Using Custom Allocator for GpuFsaBackend";
+        m_CustomAllocator = std::make_shared<GpuFsaBackendCustomAllocatorWrapper>(std::move(allocator));
+        m_UsingCustomAllocator = true;
+        return true;
+    }
+
+    // Cl requires an arm_compute::IAllocator, so the Arm NN ICustomAllocator is wrapped to provide one.
+    // Each imported cl_mem is recorded against its host allocation so that free() can release both.
+    class GpuFsaBackendCustomAllocatorWrapper : public arm_compute::IAllocator
+    {
+    public:
+        GpuFsaBackendCustomAllocatorWrapper(std::shared_ptr<ICustomAllocator> alloc)
+            : m_CustomAllocator(std::move(alloc))
+        {}
+        // Inherited methods overridden:
+        // Allocates through the custom allocator and returns the imported cl_mem handle as a void*.
+        // Throws armnn::Exception if the import fails or the allocator's MemorySource is unsupported.
+        void* allocate(size_t size, size_t alignment) override
+        {
+            auto alloc = m_CustomAllocator->allocate(size, alignment);
+            return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType());
+        }
+
+        // Releases a cl_mem handle returned by allocate() and frees the host allocation recorded for it.
+        void free(void* ptr) override
+        {
+            // Look the handle up without inserting and drop its record: a released cl_mem's handle value may be
+            // reused, and a stale record would then pair it with the wrong (already freed) host allocation.
+            void* hostMemPtr = nullptr;
+            const auto mapping = m_AllocatedBufferMappings.find(ptr);
+            if (mapping != m_AllocatedBufferMappings.end())
+            {
+                hostMemPtr = mapping->second;
+                m_AllocatedBufferMappings.erase(mapping);
+            }
+            clReleaseMemObject(static_cast<cl_mem>(ptr));
+            // As before, a handle with no record still results in free(nullptr) on the custom allocator.
+            m_CustomAllocator->free(hostMemPtr);
+        }
+
+        // As allocate(), but wraps the imported buffer in a region that also carries the host pointer.
+        std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override
+        {
+            auto hostMemPtr = m_CustomAllocator->allocate(size, alignment);
+            cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType());
+            return std::make_unique<ClBackendCustomAllocatorMemoryRegion>(cl::Buffer(buffer),
+                                                                          hostMemPtr,
+                                                                          m_CustomAllocator->GetMemorySourceType());
+        }
+
+    private:
+        // Imports 'memory' into the CL context using the import properties that match 'source'.
+        // Throws armnn::Exception if the import fails or 'source' is not Malloc, DmaBuf or DmaBufProtected.
+        cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source)
+        {
+            // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE.
+            // NOTE(review): a size that is already a multiple still grows by one full cache line, and the
+            // memory is imported with this larger size - confirm the custom allocator provides for that.
+            auto cachelineAlignment =
+                    arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+            auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment);
+            switch (source)
+            {
+                case MemorySource::Malloc:
+                {
+                    const cl_import_properties_arm importProperties[] =
+                    {
+                        CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_HOST_ARM,
+                        0
+                    };
+                    return ImportMemory(importProperties, memory, roundedSize);
+                }
+                case MemorySource::DmaBuf:
+                {
+                    const cl_import_properties_arm importProperties[] =
+                    {
+                        CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_DMA_BUF_ARM,
+                        CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM, CL_TRUE,
+                        0
+                    };
+                    return ImportMemory(importProperties, memory, roundedSize);
+                }
+                case MemorySource::DmaBufProtected:
+                {
+                    const cl_import_properties_arm importProperties[] =
+                    {
+                        CL_IMPORT_TYPE_ARM, CL_IMPORT_TYPE_DMA_BUF_ARM,
+                        CL_IMPORT_TYPE_PROTECTED_ARM, CL_TRUE,
+                        0
+                    };
+                    return ImportMemory(importProperties, memory, roundedSize);
+                }
+                default:
+                    throw armnn::Exception(
+                        "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator");
+            }
+        }
+
+        // Issues the clImportMemoryARM call shared by every memory source and records the resulting
+        // cl_mem -> host pointer pair. Throws armnn::Exception carrying the CL error code on failure.
+        cl_mem ImportMemory(const cl_import_properties_arm* importProperties, void* memory, size_t roundedSize)
+        {
+            cl_int error = CL_SUCCESS;
+            cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
+                                              CL_MEM_READ_WRITE,
+                                              importProperties,
+                                              memory,
+                                              roundedSize,
+                                              &error);
+            if (error != CL_SUCCESS)
+            {
+                throw armnn::Exception(
+                    "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error));
+            }
+            // Assign rather than insert so that a record left over for a reused handle cannot shadow this one.
+            m_AllocatedBufferMappings[static_cast<void*>(buffer)] = memory;
+            return buffer;
+        }
+
+        std::shared_ptr<ICustomAllocator> m_CustomAllocator;
+        std::map<void*, void*> m_AllocatedBufferMappings;
+    };
+
+    // ICLMemoryRegion that also keeps the host pointer of the imported memory, to allow for cpu copy access.
+    class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion
+    {
+    public:
+        // The region size is the CL_MEM_SIZE of 'buffer'; 'source' selects how map() exposes 'hostMemPtr'.
+        ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr, armnn::MemorySource source)
+            : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>()), m_HostMemPtr(hostMemPtr), m_MemorySource(source)
+        {
+            _mem = buffer;
+        }
+        // Inherited methods overridden :
+        void* ptr() override { return nullptr; }
+        // Maps the region for CPU access. Throws armnn::Exception on bad state, unsupported source or mmap failure.
+        void* map(cl::CommandQueue &q, bool blocking) override
+        {
+            armnn::IgnoreUnused(q, blocking);
+            if (m_HostMemPtr == nullptr)
+            {
+                throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr");
+            }
+            if (_mapping != nullptr)
+            {
+                throw armnn::Exception("ClBackend: Attempting to map memory which has not yet been unmapped");
+            }
+            switch (m_MemorySource)
+            {
+                case armnn::MemorySource::Malloc:
+                    _mapping = m_HostMemPtr;
+                    return _mapping;
+                case armnn::MemorySource::DmaBuf:
+                case armnn::MemorySource::DmaBufProtected:
+                {
+                    // If the source is a Dmabuf then the memory ptr should be pointing to an integer value for the fd
+                    const int fd = *(reinterpret_cast<int*>(m_HostMemPtr));
+                    void* mapped = mmap(nullptr, _size, PROT_WRITE, MAP_SHARED, fd, 0);
+                    // mmap signals failure with MAP_FAILED rather than nullptr; _mapping stays unset in that case.
+                    if (mapped == MAP_FAILED)
+                    {
+                        throw armnn::Exception("ClBackend: Failed to mmap imported Dmabuf memory");
+                    }
+                    _mapping = mapped;
+                    return _mapping;
+                }
+                default:
+                    throw armnn::Exception("ClBackend: Attempting to map imported memory without a valid source");
+            }
+        }
+
+        // Undoes map(): Dmabuf mappings are munmap'ed first. Throws armnn::Exception on an unsupported source.
+        void unmap(cl::CommandQueue &q) override
+        {
+            armnn::IgnoreUnused(q);
+            switch (m_MemorySource)
+            {
+                case armnn::MemorySource::DmaBuf:
+                case armnn::MemorySource::DmaBufProtected:
+                    munmap(_mapping, _size);
+                    break;
+                case armnn::MemorySource::Malloc: // the host pointer was handed out directly, nothing to release
+                    break;
+                default:
+                    throw armnn::Exception("ClBackend: Attempting to unmap imported memory without a valid source");
+            }
+            _mapping = nullptr;
+        }
+    private:
+        void* m_HostMemPtr = nullptr;
+        armnn::MemorySource m_MemorySource;
+    };
+
+    std::shared_ptr<GpuFsaBackendCustomAllocatorWrapper> m_CustomAllocator; // set by UseCustomMemoryAllocator()
+    bool m_UsingCustomAllocator = false; // becomes true in UseCustomMemoryAllocator()
+};
+
+} // namespace armnn