//
// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include <armnn/backends/ICustomAllocator.hpp>
#include <armnn/Descriptors.hpp>
#include <armnn/Exceptions.hpp>
#include <armnn/INetwork.hpp>
#include <armnn/IRuntime.hpp>
#include <armnn/Utils.hpp>
#include <armnn/BackendRegistry.hpp>
#include <cl/ClBackend.hpp>
#if defined(ARMCOMPUTENEON_ENABLED)
#include <neon/NeonBackend.hpp>
#endif
#include <doctest/doctest.h>
#include <armnn/utility/IgnoreUnused.hpp>

#include <algorithm> // std::fill_n
#include <cstdlib>   // std::malloc, std::free
#include <memory>    // std::align, std::make_shared
// Contains the OpenCL interfaces for mapping memory in the GPU page tables
// Requires the OpenCL backend to be included (GpuAcc)
#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <CL/cl_ext.h>
#include <arm_compute/runtime/CL/CLScheduler.h>
/** Sample implementation of ICustomAllocator for use with the ClBackend.
 * Note: any memory allocated must be host accessible with write access to allow for weights and biases
 * to be passed in. Read access is not required. */
class SampleClBackendCustomAllocator : public armnn::ICustomAllocator
{
public:
    SampleClBackendCustomAllocator() = default;

    void* allocate(size_t size, size_t alignment) override
    {
        // If alignment is 0, fall back to CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE for the alignment
        if (alignment == 0)
        {
            alignment =
                arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
        }
        // Over-allocate so that an aligned pointer, plus room to stash the base pointer
        // in front of it, is guaranteed to fit inside the block
        size_t space = size + alignment + sizeof(void*);
        void* basePtr = std::malloc(space);
        if (basePtr == nullptr)
        {
            throw armnn::Exception("SampleClBackendCustomAllocator::Allocation failed");
        }
        void* alignedPtr = static_cast<char*>(basePtr) + sizeof(void*);
        space -= sizeof(void*);
        if (std::align(alignment, size, alignedPtr, space) == nullptr)
        {
            std::free(basePtr);
            throw armnn::Exception("SampleClBackendCustomAllocator::Alignment failed");
        }
        // Stash the base pointer just before the aligned region so free() can recover it;
        // freeing the aligned pointer directly would be undefined behaviour
        static_cast<void**>(alignedPtr)[-1] = basePtr;
        return alignedPtr;
    }

    /** Frees the allocation by releasing the base pointer stashed by allocate() */
    void free(void* ptr) override
    {
        if (ptr != nullptr)
        {
            std::free(static_cast<void**>(ptr)[-1]);
        }
    }

    armnn::MemorySource GetMemorySourceType() override
    {
        return armnn::MemorySource::Malloc;
    }
};
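/** Builds the small test network used by the cases below: a single FullyConnected
 *  layer with a constant 1x1 identity weight, so the network passes its input
 *  value straight through to its output.
 *
 *      Input(0) ----> FullyConnected ----> Output(0)
 *      Weights ------^
 */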
armnn::INetworkPtr CreateTestNetwork(armnn::TensorInfo& inputTensorInfo)
{
    using namespace armnn;

    FullyConnectedDescriptor fullyConnectedDesc;
    float weightsData[] = {1.0f}; // Identity
    TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32, 0.0f, 0, true);
    weightsInfo.SetConstant(true);
    ConstTensor weights(weightsInfo, weightsData);

    INetworkPtr network = INetwork::Create();
    IConnectableLayer* const inputLayer = network->AddInputLayer(0);
    IConnectableLayer* const weightsLayer = network->AddConstantLayer(weights, "Weights");
    IConnectableLayer* const fullyConnectedLayer =
        network->AddFullyConnectedLayer(fullyConnectedDesc, "fully connected");
    IConnectableLayer* const outputLayer = network->AddOutputLayer(0);

    inputLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0));
    weightsLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(1));
    fullyConnectedLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));

    // Set the tensor infos in the network
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);
    weightsLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo);
    TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32);
    fullyConnectedLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);

    return network;
}
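// For reference, the allocator registration pattern exercised by the tests below:
// the allocator is handed to the runtime at creation time, keyed by the id of the
// backend it should serve.
//
//     auto allocator = std::make_shared<SampleClBackendCustomAllocator>();
//     armnn::IRuntime::CreationOptions options;
//     options.m_CustomAllocatorMap = {{"GpuAcc", std::move(allocator)}};
//     armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);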
TEST_SUITE("ClCustomAllocatorTests")
{
// This is a copy of the SimpleSample app, modified to use a custom allocator for the
// ClBackend. It creates a FullyConnected network with a single layer that takes a
// single number as its input.
TEST_CASE("ClCustomAllocatorTest")
{
    using namespace armnn;

    float number = 3; // value fed through the identity network

    // Construct ArmNN network
    armnn::NetworkId networkIdentifier;
    TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
    INetworkPtr myNetwork = CreateTestNetwork(inputTensorInfo);

    // Create ArmNN runtime, registering the custom allocator against the GpuAcc backend
    IRuntime::CreationOptions options; // default options
    auto customAllocator = std::make_shared<SampleClBackendCustomAllocator>();
    options.m_CustomAllocatorMap = {{"GpuAcc", std::move(customAllocator)}};
    IRuntimePtr run = IRuntime::Create(options);

    // Optimise ArmNN network, enabling import and export so the runtime can use
    // the buffers allocated below directly
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = true;
    optOptions.m_ExportEnabled = true;
    armnn::IOptimizedNetworkPtr optNet = Optimize(*myNetwork, {"GpuAcc"}, run->GetDeviceSpec(), optOptions);
    CHECK(optNet);

    // Load graph into runtime
    std::string ignoredErrorMessage;
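    // Import inputs and export outputs as MemorySource::Malloc, matching the memory
    // source reported by SampleClBackendCustomAllocator::GetMemorySourceType()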
    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
    run->LoadNetwork(networkIdentifier, std::move(optNet), ignoredErrorMessage, networkProperties);

    // Create structures for input & output
    unsigned int numElements = inputTensorInfo.GetNumElements();
    size_t totalBytes = numElements * sizeof(float);
    const size_t alignment =
        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();

    // Fill the input buffer with the test value
    void* alignedInputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment);
    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
    std::fill_n(inputPtr, numElements, number);

    // Fill the output buffer with a sentinel value so we can tell it was overwritten
    void* alignedOutputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment);
    auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
    std::fill_n(outputPtr, numElements, -10.0f);

    armnn::TensorInfo inputTensorInfo2 = run->GetInputTensorInfo(networkIdentifier, 0);
    inputTensorInfo2.SetConstant(true);
    armnn::InputTensors inputTensors
    {
        {0, armnn::ConstTensor(inputTensorInfo2, alignedInputPtr)},
    };
    armnn::OutputTensors outputTensors
    {
        {0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), alignedOutputPtr)}
    };

    // Execute network
    run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors);

    // Tell the CLBackend to sync memory so we can read the output
    arm_compute::CLScheduler::get().sync();
    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
    CHECK(outputResult[0] == number);

    run->UnloadNetwork(networkIdentifier);

    // Release the buffers through the allocator that created them
    options.m_CustomAllocatorMap["GpuAcc"]->free(alignedInputPtr);
    options.m_CustomAllocatorMap["GpuAcc"]->free(alignedOutputPtr);

    auto& backendRegistry = armnn::BackendRegistryInstance();
    backendRegistry.DeregisterAllocator(ClBackend::GetIdStatic());
}
// Only run this test if NEON is enabled
#if defined(ARMCOMPUTENEON_ENABLED)
TEST_CASE("ClCustomAllocatorCpuAccNegativeTest")
{
    using namespace armnn;

    // Create ArmNN runtime, mapping the custom allocator to CpuAcc. The CL-specific
    // allocator is not valid for CpuAcc, so the backend is expected to be excluded
    // and Optimize() to throw because no preferred backend remains available.
    IRuntime::CreationOptions options; // default options
    auto customAllocator = std::make_shared<SampleClBackendCustomAllocator>();
    options.m_CustomAllocatorMap = {{"CpuAcc", std::move(customAllocator)}};
    IRuntimePtr run = IRuntime::Create(options);
    TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32);
    INetworkPtr myNetwork = CreateTestNetwork(inputTensorInfo);

    // Optimise ArmNN network
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = true;
    IOptimizedNetworkPtr optNet(nullptr, nullptr);
    std::vector<std::string> errMessages;

    CHECK_THROWS_AS_MESSAGE(Optimize(*myNetwork, {"CpuAcc"}, run->GetDeviceSpec(), optOptions, errMessages),
                            armnn::InvalidArgumentException,
                            "Expected an exception as GetAvailablePreferredBackends() should be empty in Optimize().");

    auto& backendRegistry = armnn::BackendRegistryInstance();
    backendRegistry.DeregisterAllocator(NeonBackend::GetIdStatic());
}
#endif
TEST_CASE("ClCustomAllocatorGpuAccNullptrTest")
{
    using namespace armnn;

    // Map a null allocator to GpuAcc; runtime creation should throw
    IRuntime::CreationOptions options; // default options
    options.m_CustomAllocatorMap = {{"GpuAcc", nullptr}};

    CHECK_THROWS_AS_MESSAGE(IRuntimePtr run = IRuntime::Create(options),
                            armnn::Exception,
                            "Expected exception in RuntimeImpl::RuntimeImpl() as allocator was nullptr.");
}
} // test suite ClCustomAllocatorTests