Adds CustomAllocator interface and sample app

 * Updates the runtime options with a CustomAllocatorMap which allows the user to define a CustomAllocator for specific backends (see the usage sketch below)
 * Changes the IBackendInternal interface to use a shared pointer to a custom allocator
 * Updates ClBackend.hpp/cpp to use the CustomAllocator
 * Adds an example application and a unit test which use a CustomAllocator for GpuAcc
 * Refactors the interface to use MemorySource instead of requiring the user to map cl_mem directly
 * Modifies the BackendRegistry to also hold a registry of CustomAllocators
 * BackendRegistry::Deregister also deregisters any allocators associated with that backend id
 * Calls set_global_allocator within the BaseMemoryManager so that it always matches the currently used allocator
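
A minimal usage sketch (illustrative only; MyAllocator stands in for any
user-defined implementation of armnn::ICustomAllocator, such as the one in
the sample app below):

    // Register the allocator for the GpuAcc backend when creating the runtime
    armnn::IRuntime::CreationOptions options;
    options.m_CustomAllocatorMap = {{"GpuAcc", std::make_shared<MyAllocator>()}};
    armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);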

Signed-off-by: Jan Eilers <jan.eilers@arm.com>
Change-Id: I156d819686021865f4375e6cb7a5c5dec8fee9e8
Signed-off-by: David Monahan <david.monahan@arm.com>
diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt
index ff45eec..7be6a69 100644
--- a/samples/CMakeLists.txt
+++ b/samples/CMakeLists.txt
@@ -8,3 +8,8 @@
     target_link_libraries(DynamicSample armnn ${CMAKE_THREAD_LIBS_INIT})
 endif()
 
+if(BUILD_SAMPLE_APP AND ARMCOMPUTECL)
+    add_executable(CustomMemoryAllocatorSample CustomMemoryAllocatorSample.cpp)
+    target_link_libraries(CustomMemoryAllocatorSample armnn ${CMAKE_THREAD_LIBS_INIT})
+endif()
+
diff --git a/samples/CustomMemoryAllocatorSample.cpp b/samples/CustomMemoryAllocatorSample.cpp
new file mode 100644
index 0000000..51b3c81
--- /dev/null
+++ b/samples/CustomMemoryAllocatorSample.cpp
@@ -0,0 +1,186 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <armnn/ArmNN.hpp>
+#include <armnn/backends/ICustomAllocator.hpp>
+
+#include <arm_compute/core/CL/CLKernelLibrary.h>
+#include <arm_compute/runtime/CL/CLScheduler.h>
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+
+/** Sample implementation of ICustomAllocator for use with the ClBackend.
+ *  Note: any memory allocated must be host addressable with write access
+ *  in order for ArmNN to be able to properly use it. */
+class SampleClBackendCustomAllocator : public armnn::ICustomAllocator
+{
+public:
+    SampleClBackendCustomAllocator() = default;
+
+    void* allocate(size_t size, size_t alignment) override
+    {
+        // If no alignment is requested, fall back to the device's cache line size
+        if (alignment == 0)
+        {
+            alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+        }
+        // std::aligned_alloc (C++17) requires the size to be a multiple of the alignment
+        size_t roundedSize = ((size + alignment - 1) / alignment) * alignment;
+        void* allocatedMemPtr = std::aligned_alloc(alignment, roundedSize);
+        if (allocatedMemPtr == nullptr)
+        {
+            throw armnn::Exception("SampleClBackendCustomAllocator::allocate() failed");
+        }
+        return allocatedMemPtr;
+    }
+
+    void free(void* ptr) override
+    {
+        // Memory obtained from std::aligned_alloc is released with std::free
+        std::free(ptr);
+    }
+
+    /** Tells ArmNN how memory from this allocator can be imported (plain host-addressable malloc'd memory). */
+    armnn::MemorySource GetMemorySourceType() override
+    {
+        return armnn::MemorySource::Malloc;
+    }
+};
+
+
+// A simple example application to show the usage of a custom memory allocator. In this sample, the user's single
+// input number is multiplied by 1.0f using a fully connected layer with a single neuron to produce an output
+// number that is the same as the input. All memory required to execute this mini network is allocated with
+// the provided custom allocator.
+//
+// Using a Custom Allocator is required for use with Protected Mode and Protected Memory.
+// This example is provided using only unprotected malloc as Protected Memory is platform
+// and implementation specific.
+//
+// Note: This example is similar to the SimpleSample application that can also be found in armnn/samples.
+//       The differences are in the use of a custom allocator, the backend is GpuAcc, and the inputs/outputs
+//       are being imported instead of copied. (Import must be enabled when using a Custom Allocator)
+//       You might find this useful for comparison.
+int main()
+{
+    using namespace armnn;
+
+    float number = 0.0f;
+    std::cout << "Please enter a number: " << std::endl;
+    std::cin >> number;
+
+    // Turn on logging to standard output
+    // This is useful in this sample so that users can learn more about what is going on
+    armnn::ConfigureLogging(true, false, LogSeverity::Info);
+
+    // Construct ArmNN network
+    armnn::NetworkId networkIdentifier;
+    INetworkPtr myNetwork = INetwork::Create();
+    armnn::FullyConnectedDescriptor fullyConnectedDesc;
+    float weightsData[] = {1.0f}; // Identity
+    TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32);
+    weightsInfo.SetConstant(true);
+    armnn::ConstTensor weights(weightsInfo, weightsData);
+    ARMNN_NO_DEPRECATE_WARN_BEGIN
+    IConnectableLayer *fullyConnected = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc,
+                                                                          weights,
+                                                                          EmptyOptional(),
+                                                                          "fully connected");
+    ARMNN_NO_DEPRECATE_WARN_END
+    IConnectableLayer *InputLayer = myNetwork->AddInputLayer(0);
+    IConnectableLayer *OutputLayer = myNetwork->AddOutputLayer(0);
+    InputLayer->GetOutputSlot(0).Connect(fullyConnected->GetInputSlot(0));
+    fullyConnected->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0));
+
+    // Create ArmNN runtime:
+    //
+    // This is the interesting bit when executing a model with a custom allocator.
+    // You can have different allocators for different backends. To support this
+    // the runtime creation option has a map that takes a BackendId and the corresponding
+    // allocator that should be used for that backend.
+    // Only GpuAcc supports a Custom Allocator for now
+    //
+    // Note: This is not covered in this example but if you want to run a model on
+    //       protected memory a custom allocator needs to be provided that supports
+    //       protected memory allocations and the MemorySource of that allocator is
+    //       set to MemorySource::DmaBufProtected
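+    //       e.g. a hypothetical protected allocator would implement:
+    //           armnn::MemorySource GetMemorySourceType() override { return armnn::MemorySource::DmaBufProtected; }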
+    IRuntime::CreationOptions options;
+    auto customAllocator = std::make_shared<SampleClBackendCustomAllocator>();
+    options.m_CustomAllocatorMap = {{"GpuAcc", std::move(customAllocator)}};
+    IRuntimePtr runtime = IRuntime::Create(options);
+
+    // Set the tensors in the network.
+    TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
+    InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);
+
+    unsigned int numElements = inputTensorInfo.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+
+    TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32);
+    fullyConnected->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
+
+    // Optimise ArmNN network
+    OptimizerOptions optOptions;
+    optOptions.m_ImportEnabled = true;
+    armnn::IOptimizedNetworkPtr optNet =
+                Optimize(*myNetwork, {"GpuAcc"}, runtime->GetDeviceSpec(), optOptions);
+    if (!optNet)
+    {
+        // This shouldn't happen for this simple sample, with the GpuAcc backend.
+        // But in general usage Optimize could fail if the backend at runtime cannot
+        // support the model that has been provided.
+        std::cerr << "Error: Failed to optimise the input network." << std::endl;
+        return 1;
+    }
+
+    // Load graph into runtime
+    std::string ignoredErrorMessage;
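+    // asyncEnabled = false; import the input and output buffers as MemorySource::Malloc instead of copying them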
+    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
+    runtime->LoadNetwork(networkIdentifier, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+    // Create structures for input & output
+    const size_t alignment =
+            arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+
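+    // Allocate the buffers through the same custom allocator the runtime uses, so they can be imported directly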
+    void* alignedInputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment);
+
+    // Fill the input buffer with the user's number
+    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+    std::fill_n(inputPtr, numElements, number);
+
+    void* alignedOutputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment);
+    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
+    // Pre-fill the output buffer with a sentinel value so a successful write is visible
+    std::fill_n(outputPtr, numElements, -10.0f);
+
+    armnn::InputTensors inputTensors
+    {
+        {0, armnn::ConstTensor(runtime->GetInputTensorInfo(networkIdentifier, 0), alignedInputPtr)},
+    };
+    armnn::OutputTensors outputTensors
+    {
+        {0, armnn::Tensor(runtime->GetOutputTensorInfo(networkIdentifier, 0), alignedOutputPtr)}
+    };
+
+    // Execute network
+    runtime->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors);
+
+    // Tell the CLBackend to sync memory so we can read the output.
+    arm_compute::CLScheduler::get().sync();
+    std::cout << "Your number was " << outputPtr[0] << std::endl;
+    runtime->UnloadNetwork(networkIdentifier);
+
+    // Release the imported buffers through the same custom allocator
+    options.m_CustomAllocatorMap["GpuAcc"]->free(alignedInputPtr);
+    options.m_CustomAllocatorMap["GpuAcc"]->free(alignedOutputPtr);
+
+    return 0;
+}