Adds CustomAllocator interface and Sample App

 * Updates the runtime options with a CustomAllocatorMap which allows the user to define a CustomAllocator for specific backends
 * Change IBackendInternal interface to use a shared pointer to a custom allocator
 * Update ClBackend.hpp/cpp to use the CustomAllocator
 * Adds an example application and unit test which uses a CustomAllocator for GpuAcc
 * Refactor of the interface to use MemorySource instead of the user mapping cl_mem directly
 * Modify the BackendRegistry to also hold a registry of CustomAllocators
 * BackendRegistry Deregister will also deregister any allocators associated with that backend id
 * Calls set_global_allocator within the BaseMemoryManager so that it always matches the currently used allocator

Signed-off-by: Jan Eilers <jan.eilers@arm.com>
Change-Id: I156d819686021865f4375e6cb7a5c5dec8fee9e8
Signed-off-by: David Monahan <david.monahan@arm.com>
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index f1e52c1..b85232e 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -49,6 +49,10 @@
 
 IBackendInternal::IMemoryManagerUniquePtr ClBackend::CreateMemoryManager() const
 {
+    if (m_UsingCustomAllocator)
+    {
+        return std::make_unique<ClMemoryManager>(m_CustomAllocator);
+    }
     return std::make_unique<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
 }
 
@@ -69,7 +73,15 @@
 IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
     TensorHandleFactoryRegistry& registry) const
 {
-    auto memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+    std::shared_ptr<ClMemoryManager> memoryManager;
+    if (m_UsingCustomAllocator)
+    {
+        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
+    }
+    else
+    {
+        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+    }
 
     registry.RegisterMemoryManager(memoryManager);
     registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager));
@@ -83,7 +95,15 @@
 IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
     TensorHandleFactoryRegistry& registry, const ModelOptions& modelOptions) const
 {
-    auto memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+    std::shared_ptr<ClMemoryManager> memoryManager;
+    if (m_UsingCustomAllocator)
+    {
+        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
+    }
+    else
+    {
+        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+    }
 
     registry.RegisterMemoryManager(memoryManager);
     registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager));
@@ -100,7 +120,15 @@
     MemorySourceFlags inputFlags,
     MemorySourceFlags outputFlags) const
 {
-    auto memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+    std::shared_ptr<ClMemoryManager> memoryManager;
+    if (m_UsingCustomAllocator)
+    {
+        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
+    }
+    else
+    {
+        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+    }
 
     registry.RegisterMemoryManager(memoryManager);
     registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager));
@@ -118,10 +146,18 @@
 
 void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
 {
-    auto mgr = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+    std::shared_ptr<ClMemoryManager> memoryManager;
+    if (m_UsingCustomAllocator)
+    {
+        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
+    }
+    else
+    {
+        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+    }
 
-    registry.RegisterMemoryManager(mgr);
-    registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(mgr));
+    registry.RegisterMemoryManager(memoryManager);
+    registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager));
     registry.RegisterFactory(std::make_unique<ClImportTensorHandleFactory>(
         static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc)));
 }
@@ -130,10 +166,18 @@
                                               MemorySourceFlags inputFlags,
                                               MemorySourceFlags outputFlags)
 {
-    auto mgr = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+    std::shared_ptr<ClMemoryManager> memoryManager;
+    if (m_UsingCustomAllocator)
+    {
+        memoryManager = std::make_shared<ClMemoryManager>(m_CustomAllocator);
+    }
+    else
+    {
+        memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+    }
 
-    registry.RegisterMemoryManager(mgr);
-    registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(mgr));
+    registry.RegisterMemoryManager(memoryManager);
+    registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager));
     registry.RegisterFactory(std::make_unique<ClImportTensorHandleFactory>(inputFlags, outputFlags));
 }
 
diff --git a/src/backends/cl/ClBackend.hpp b/src/backends/cl/ClBackend.hpp
index c742c0b..c63bd25 100644
--- a/src/backends/cl/ClBackend.hpp
+++ b/src/backends/cl/ClBackend.hpp
@@ -6,6 +6,15 @@
 
 #include <armnn/backends/IBackendInternal.hpp>
 
+#include <arm_compute/core/Types.h>
+#include <arm_compute/runtime/CL/CLBufferAllocator.h>
+
+#include <aclCommon/BaseMemoryManager.hpp>
+#include <arm_compute/runtime/CL/CLMemoryRegion.h>
+
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <CL/cl_ext.h>
+
 namespace armnn
 {
 
@@ -20,7 +29,12 @@
 class ClBackend : public IBackendInternal
 {
 public:
-    ClBackend() : m_EnableCustomAllocator(false) {};
+    ClBackend() : m_CustomAllocator(nullptr) {};
+    ClBackend(std::shared_ptr<ICustomAllocator> allocator)
+    {
+        std::string err;
+        UseCustomMemoryAllocator(allocator, err);
+    }
     ~ClBackend() = default;
 
     static const BackendId& GetIdStatic();
@@ -72,17 +86,119 @@
         return gpuAccCapabilities;
     };
 
-    virtual bool UseCustomMemoryAllocator(armnn::Optional<std::string&> errMsg) override
+    virtual bool UseCustomMemoryAllocator(std::shared_ptr<ICustomAllocator> allocator,
+                                          armnn::Optional<std::string&> errMsg) override
     {
         IgnoreUnused(errMsg);
+        ARMNN_LOG(info) << "Using Custom Allocator for ClBackend";
 
         // Set flag to signal the backend to use a custom memory allocator
-        m_EnableCustomAllocator = true;
-
-        return m_EnableCustomAllocator;
+        m_CustomAllocator = std::make_shared<ClBackendCustomAllocatorWrapper>(std::move(allocator));
+        m_UsingCustomAllocator = true;
+        return m_UsingCustomAllocator;
     }
 
-    bool m_EnableCustomAllocator;
+    // Cl requires a arm_compute::IAllocator we wrap the Arm NN ICustomAllocator to achieve this
+    class ClBackendCustomAllocatorWrapper : public arm_compute::IAllocator
+    {
+    public:
+        ClBackendCustomAllocatorWrapper(std::shared_ptr<ICustomAllocator> alloc) : m_CustomAllocator(alloc)
+        {}
+        // Inherited methods overridden:
+        void* allocate(size_t size, size_t alignment) override
+        {
+            auto alloc = m_CustomAllocator->allocate(size, alignment);
+            return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType());
+        }
+        void free(void* ptr) override
+        {
+            auto hostMemPtr = m_AllocatedBufferMappings[ptr];
+            clReleaseMemObject(static_cast<cl_mem>(ptr));
+            m_CustomAllocator->free(hostMemPtr);
+        }
+        std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override
+        {
+            auto hostMemPtr = m_CustomAllocator->allocate(size, alignment);
+            cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType());
+
+            return std::make_unique<ClBackendCustomAllocatorMemoryRegion>(cl::Buffer(buffer), hostMemPtr);
+        }
+    private:
+        cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source)
+        {
+            // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE
+            auto cachelineAlignment =
+                    arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+            auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment);
+
+            if (source == MemorySource::Malloc)
+            {
+                const cl_import_properties_arm importProperties[] =
+                        {
+                        CL_IMPORT_TYPE_ARM,
+                        CL_IMPORT_TYPE_HOST_ARM,
+                        0
+                        };
+                cl_int error = CL_SUCCESS;
+                cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
+                                                  CL_MEM_READ_WRITE,
+                                                  importProperties,
+                                                  memory,
+                                                  roundedSize,
+                                                  &error);
+                if (error == CL_SUCCESS)
+                {
+                    m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));
+                    return buffer;
+                }
+                throw armnn::Exception(
+                    "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error));
+            }
+            throw armnn::Exception(
+                    "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator");
+        }
+        std::shared_ptr<ICustomAllocator> m_CustomAllocator;
+        std::map<void*, void*> m_AllocatedBufferMappings;
+    };
+
+    class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion
+    {
+    public:
+        // We need to have a new version of ICLMemoryRegion which holds a hostMemPtr to allow for cpu copy access
+        ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr)
+            : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
+        {
+            _mem = buffer;
+            m_HostMemPtr = hostMemPtr;
+        }
+
+        // Inherited methods overridden :
+        void* ptr() override
+        {
+            return nullptr;
+        }
+
+        void* map(cl::CommandQueue &q, bool blocking) override
+        {
+            armnn::IgnoreUnused(q, blocking);
+            if (m_HostMemPtr == nullptr)
+            {
+                throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr");
+            }
+            _mapping = m_HostMemPtr;
+            return _mapping;
+        }
+
+        void unmap(cl::CommandQueue &q) override
+        {
+            armnn::IgnoreUnused(q);
+            _mapping = nullptr;
+        }
+        void* m_HostMemPtr = nullptr;
+    };
+
+    std::shared_ptr<ClBackendCustomAllocatorWrapper> m_CustomAllocator;
+    bool m_UsingCustomAllocator = false;
 };
 
 } // namespace armnn
diff --git a/src/backends/cl/ClImportTensorHandle.hpp b/src/backends/cl/ClImportTensorHandle.hpp
index 3fca7cb..69cd4a6 100644
--- a/src/backends/cl/ClImportTensorHandle.hpp
+++ b/src/backends/cl/ClImportTensorHandle.hpp
@@ -140,10 +140,16 @@
 private:
     bool ClImport(const cl_import_properties_arm* importProperties, void* memory)
     {
-        const size_t totalBytes = m_Tensor.info()->total_size();
+        size_t totalBytes = m_Tensor.info()->total_size();
+
+        // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE
+        auto cachelineAlignment =
+                arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+        auto roundedSize = cachelineAlignment + totalBytes - (totalBytes % cachelineAlignment);
+
         cl_int error = CL_SUCCESS;
         cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),
-                                          CL_MEM_READ_WRITE, importProperties, memory, totalBytes, &error);
+                                          CL_MEM_READ_WRITE, importProperties, memory, roundedSize, &error);
         if (error != CL_SUCCESS)
         {
             throw MemoryImportException("ClImportTensorHandle::Invalid imported memory" + std::to_string(error));
diff --git a/src/backends/cl/ClRegistryInitializer.cpp b/src/backends/cl/ClRegistryInitializer.cpp
index 8decd6f..aadc14b 100644
--- a/src/backends/cl/ClRegistryInitializer.cpp
+++ b/src/backends/cl/ClRegistryInitializer.cpp
@@ -18,6 +18,14 @@
     ClBackend::GetIdStatic(),
     []()
     {
+        // Check if we have a CustomMemoryAllocator associated with the backend
+        // and if so register it with the backend.
+        auto customAllocators = BackendRegistryInstance().GetAllocators();
+        auto allocatorIterator = customAllocators.find(ClBackend::GetIdStatic());
+        if (allocatorIterator != customAllocators.end())
+        {
+            return IBackendInternalUniquePtr(new ClBackend(allocatorIterator->second));
+        }
         return IBackendInternalUniquePtr(new ClBackend);
     }
 };
diff --git a/src/backends/cl/test/CMakeLists.txt b/src/backends/cl/test/CMakeLists.txt
index 6662a1e..41cbe24 100644
--- a/src/backends/cl/test/CMakeLists.txt
+++ b/src/backends/cl/test/CMakeLists.txt
@@ -6,6 +6,7 @@
 list(APPEND armnnClBackendUnitTests_sources
     ClContextControlFixture.hpp
     ClContextSerializerTests.cpp
+    ClCustomAllocatorTests.cpp
     ClCreateWorkloadTests.cpp
     ClEndToEndTests.cpp
     ClImportTensorHandleFactoryTests.cpp
diff --git a/src/backends/cl/test/ClCustomAllocatorTests.cpp b/src/backends/cl/test/ClCustomAllocatorTests.cpp
new file mode 100644
index 0000000..4d1a0e1
--- /dev/null
+++ b/src/backends/cl/test/ClCustomAllocatorTests.cpp
@@ -0,0 +1,160 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <armnn/backends/ICustomAllocator.hpp>
+#include <armnn/Descriptors.hpp>
+#include <armnn/Exceptions.hpp>
+#include <armnn/INetwork.hpp>
+#include <armnn/IRuntime.hpp>
+#include <armnn/Utils.hpp>
+#include <armnn/BackendRegistry.hpp>
+#include <cl/ClBackend.hpp>
+
+#include <doctest/doctest.h>
+
+// Contains the OpenCl interfaces for mapping memory in the Gpu Page Tables
+// Requires the OpenCl backend to be included (GpuAcc)
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <CL/cl_ext.h>
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+
+/** Sample implementation of ICustomAllocator for use with the ClBackend.
+ *  Note: any memory allocated must be host accessible with write access to allow for weights and biases
+ *  to be passed in. Read access is not required. */
+class SampleClBackendCustomAllocator : public armnn::ICustomAllocator
+{
+public:
+    SampleClBackendCustomAllocator() = default;
+
+    void* allocate(size_t size, size_t alignment)
+    {
+        // If alignment is 0 just use the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE for alignment
+        if (alignment == 0)
+        {
+            alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+        }
+        size_t space = size + alignment + alignment;
+        auto allocatedMemPtr = std::malloc(space * sizeof(size_t));
+
+        if (std::align(alignment, size, allocatedMemPtr, space) == nullptr)
+        {
+            throw armnn::Exception("SampleClBackendCustomAllocator::Alignment failed");
+        }
+        return allocatedMemPtr;
+    }
+
+    /** Interface to be implemented by the child class to free the allocated tensor */
+    void free(void* ptr)
+    {
+        std::free(ptr);
+    }
+
+    armnn::MemorySource GetMemorySourceType()
+    {
+        return armnn::MemorySource::Malloc;
+    }
+};
+
+TEST_SUITE("ClCustomAllocatorTests")
+{
+
+// This is a copy of the SimpleSample app modified to use a custom
+// allocator for the clbackend. It creates a FullyConnected network with a single layer
+// taking a single number as an input
+TEST_CASE("ClCustomAllocatorTest")
+{
+    using namespace armnn;
+
+    float number = 3;
+
+    // Construct ArmNN network
+    armnn::NetworkId networkIdentifier;
+    INetworkPtr myNetwork = INetwork::Create();
+
+    armnn::FullyConnectedDescriptor fullyConnectedDesc;
+    float weightsData[] = {1.0f}; // Identity
+    TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32);
+    weightsInfo.SetConstant(true);
+    armnn::ConstTensor weights(weightsInfo, weightsData);
+
+    ARMNN_NO_DEPRECATE_WARN_BEGIN
+    IConnectableLayer* fullyConnected = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc,
+                                                                          weights,
+                                                                          EmptyOptional(),
+                                                                          "fully connected");
+    ARMNN_NO_DEPRECATE_WARN_END
+    IConnectableLayer* InputLayer = myNetwork->AddInputLayer(0);
+    IConnectableLayer* OutputLayer = myNetwork->AddOutputLayer(0);
+    InputLayer->GetOutputSlot(0).Connect(fullyConnected->GetInputSlot(0));
+    fullyConnected->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0));
+
+
+    // Create ArmNN runtime
+    IRuntime::CreationOptions options; // default options
+    auto customAllocator = std::make_shared<SampleClBackendCustomAllocator>();
+    options.m_CustomAllocatorMap = {{"GpuAcc", std::move(customAllocator)}};
+    IRuntimePtr run = IRuntime::Create(options);
+
+    //Set the tensors in the network.
+    TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
+    InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);
+
+    TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32);
+    fullyConnected->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
+
+    // Optimise ArmNN network
+    OptimizerOptions optOptions;
+    optOptions.m_ImportEnabled = true;
+    armnn::IOptimizedNetworkPtr optNet = Optimize(*myNetwork, {"GpuAcc"}, run->GetDeviceSpec(), optOptions);
+    CHECK(optNet);
+
+    // Load graph into runtime
+    std::string ignoredErrorMessage;
+    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
+    run->LoadNetwork(networkIdentifier, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+    // Creates structures for input & output
+    unsigned int numElements = inputTensorInfo.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+
+    const size_t alignment =
+            arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+
+    void* alignedInputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment);
+
+    // Fill the input buffer with the test value
+    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+    std::fill_n(inputPtr, numElements, number);
+
+    void* alignedOutputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment);
+    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
+    std::fill_n(outputPtr, numElements, -10.0f);
+
+    armnn::InputTensors inputTensors
+    {
+        {0, armnn::ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), alignedInputPtr)},
+    };
+    armnn::OutputTensors outputTensors
+    {
+        {0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), alignedOutputPtr)}
+    };
+
+    // Execute network
+    run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors);
+    run->UnloadNetwork(networkIdentifier);
+
+
+    // Tell the CLBackend to sync memory so we can read the output.
+    arm_compute::CLScheduler::get().sync();
+    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
+
+    run->UnloadNetwork(networkIdentifier);
+    CHECK(outputResult[0] == number);
+    auto& backendRegistry = armnn::BackendRegistryInstance();
+    backendRegistry.DeregisterAllocator(ClBackend::GetIdStatic());
+}
+
+} // test suite ClCustomAllocatorTests
\ No newline at end of file
diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp
index 931729a..6b1d352 100644
--- a/src/backends/cl/test/ClImportTensorHandleTests.cpp
+++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp
@@ -61,7 +61,7 @@
     // Validate result by checking that the output has no negative values
     for(unsigned int i = 0; i < numElements; ++i)
     {
-        CHECK(typedPtr[i] >= 0);
+        CHECK(typedPtr[i] == 0);
     }
 }