//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//
#include "PreCompiledTestImpl.hpp"
#include "TensorCopyUtils.hpp"
#include <Graph.hpp>
#include <Network.hpp>
#include <Runtime.hpp>
#include <armnn/Exceptions.hpp>
#include <armnn/INetwork.hpp>
#include <test/TensorHelpers.hpp>
#include <backendsCommon/WorkloadFactory.hpp>
#include <boost/polymorphic_pointer_cast.hpp>
using namespace armnn;
namespace
{
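
// Helper for adding either a regular or a depthwise convolution layer to a network.
// The primary template is deliberately empty; only the specialisations below provide
// AddConvolutionLayerToNetwork, so attempting to use an unsupported descriptor type
// fails to compile.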
template<typename ConvolutionDescriptor>
struct PreCompiledConvolutionHelper
{
};

template<>
struct PreCompiledConvolutionHelper<Convolution2dDescriptor>
{
    static IConnectableLayer* AddConvolutionLayerToNetwork(
        Network& network,
        Convolution2dDescriptor descriptor,
        const ConstTensor& weights,
        const ConstTensor& biases)
    {
        return network.AddConvolution2dLayer(descriptor, weights, biases, "convolution");
    }
};

template<>
struct PreCompiledConvolutionHelper<DepthwiseConvolution2dDescriptor>
{
    static IConnectableLayer* AddConvolutionLayerToNetwork(
        Network& network,
        DepthwiseConvolution2dDescriptor descriptor,
        const ConstTensor& weights,
        const ConstTensor& biases)
    {
        return network.AddDepthwiseConvolution2dLayer(descriptor, weights, biases, "depthwiseConvolution");
    }
};

template<typename ConvolutionDescriptor>
ConvolutionDescriptor CreateConvolutionDescriptor(unsigned int stride, unsigned int padding)
{
    ConvolutionDescriptor descriptor;
    descriptor.m_StrideX = stride;
    descriptor.m_StrideY = stride;
    descriptor.m_PadLeft = padding;
    descriptor.m_PadRight = padding;
    descriptor.m_PadTop = padding;
    descriptor.m_PadBottom = padding;
    descriptor.m_BiasEnabled = true;
    descriptor.m_DataLayout = DataLayout::NHWC;
    return descriptor;
}
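
// Builds a kernel that passes its input through unchanged: every element is zero except
// the centre position of each channel. For example, kernelSize = 3 and channels = 1 gives,
// in HWC order:
//     0 0 0
//     0 1 0
//     0 0 0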
static std::vector<uint8_t> CreateIdentityConvolutionKernel(
    unsigned int kernelSize, unsigned int channels)
{
    BOOST_ASSERT(kernelSize % 2 == 1); // kernelSize needs to be an odd number

    const unsigned int numElements = channels * (kernelSize * kernelSize);
    std::vector<uint8_t> kernel(numElements, 0u);

    unsigned int centerIndex = kernelSize / 2;
    for (unsigned int y = 0u; y < kernelSize; y++)
    {
        for (unsigned int x = 0u; x < kernelSize; x++)
        {
            for (unsigned int channel = 0u; channel < channels; channel++)
            {
                if (x == centerIndex && y == centerIndex)
                {
                    const unsigned int flatIndex =
                        (y * kernelSize * channels) + (x * channels) + channel;

                    kernel[flatIndex] = 1u;
                }
            }
        }
    }

    return kernel;
}
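
// Because the kernel above is an identity, the convolution simply passes each input element
// through; the expected output is therefore the input sub-sampled by the stride,
// e.g. with a stride of 2: output(y, x, c) = input(2y, 2x, c).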
template<typename ConvolutionDescriptor>
std::vector<uint8_t> GetIdentityConvolutionExpectedOutputData(
    const TensorInfo& inputInfo,
    const TensorInfo& outputInfo,
    const ConvolutionDescriptor& descriptor,
    const std::vector<uint8_t>& inputData)
{
    const unsigned int outputDataSize = outputInfo.GetNumElements();
    std::vector<uint8_t> expectedOutputData(outputDataSize);

    const unsigned int channels = outputInfo.GetShape()[3];
    BOOST_ASSERT(channels == inputInfo.GetShape()[3]);

    const unsigned int inputW = inputInfo.GetShape()[2];
    const unsigned int outputH = outputInfo.GetShape()[1];
    const unsigned int outputW = outputInfo.GetShape()[2];

    // Pick values from the input buffer, but after each iteration skip a number of
    // rows and columns equal to the stride in the respective dimension
    for (unsigned int inputY = 0, outputY = 0; outputY < outputH; inputY += descriptor.m_StrideY, outputY++)
    {
        for (unsigned int inputX = 0, outputX = 0; outputX < outputW; inputX += descriptor.m_StrideX, outputX++)
        {
            for (unsigned int channel = 0u; channel < channels; channel++)
            {
                const unsigned int inputIndex =
                    (inputY * inputW * channels) + (inputX * channels) + channel;
                const unsigned int outputIndex =
                    (outputY * outputW * channels) + (outputX * channels) + channel;

                expectedOutputData[outputIndex] = inputData[inputIndex];
            }
        }
    }

    return expectedOutputData;
}
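
// Searches the optimised graph for the single PreCompiled layer that the backend is
// expected to have substituted for the supported sub-graph during optimisation.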
armnn::PreCompiledLayer* FindPreCompiledLayer(armnn::Graph& optimisedGraph)
{
    for (auto& layer : optimisedGraph)
    {
        if (layer->GetType() == armnn::LayerType::PreCompiled)
        {
            return boost::polymorphic_pointer_downcast<armnn::PreCompiledLayer>(layer);
        }
    }

    // No pre-compiled layer found
    return nullptr;
}

// NOTE: This only supports a single input and a single output
LayerTestResult<uint8_t, 4> OptimiseAndRunNetwork(armnn::IWorkloadFactory& workloadFactory,
                                                  Network& net,
                                                  TensorInfo inputInfo,
                                                  std::vector<uint8_t> inputData,
                                                  TensorInfo outputInfo,
                                                  std::vector<uint8_t> expectedOutputData)
{
    // Optimize the network for the backend supported by the factory
    std::vector<BackendId> backends = {workloadFactory.GetBackendId()};
    IRuntimePtr runtime(IRuntime::Create(IRuntime::CreationOptions()));
    IOptimizedNetworkPtr optimizedNet = Optimize(net, backends, runtime->GetDeviceSpec(), OptimizerOptions());
    if (!optimizedNet)
    {
        throw RuntimeException(std::string("Failed to optimize network for ") + std::string(backends[0]),
                               CHECK_LOCATION());
    }

    // Find the pre-compiled layer in the optimised graph
    Graph& optimisedGraph = static_cast<OptimizedNetwork*>(optimizedNet.get())->GetGraph();
    PreCompiledLayer* preCompiledLayer = FindPreCompiledLayer(optimisedGraph);
    if (!preCompiledLayer)
    {
        throw RuntimeException("Could not find pre-compiled layer in optimised graph", CHECK_LOCATION());
    }

    // Create the tensor handles
    for (auto&& layer : optimisedGraph.TopologicalSort())
    {
        layer->CreateTensorHandles(optimisedGraph, workloadFactory);
    }

    // Create the pre-compiled workload
    auto workload = preCompiledLayer->CreateWorkload(optimisedGraph, workloadFactory);

    // Set the input data
    boost::multi_array<uint8_t, 4> input = MakeTensor<uint8_t, 4>(inputInfo, inputData);
    const QueueDescriptor& workloadData =
        static_cast<BaseWorkload<PreCompiledQueueDescriptor>*>(workload.get())->GetData();
    CopyDataToITensorHandle(workloadData.m_Inputs[0], &input[0][0][0][0]);

    // Execute the workload
    workload->Execute();

    // Set the expected and actual outputs
    LayerTestResult<uint8_t, 4> result(outputInfo);
    result.outputExpected = MakeTensor<uint8_t, 4>(outputInfo, expectedOutputData);
    CopyDataFromITensorHandle(&result.output[0][0][0][0], workloadData.m_Outputs[0]);

    return result;
}

} // anonymous namespace

template<typename ConvolutionDescriptor>
LayerTestResult<uint8_t, 4> PreCompiledConvolution2dTestImpl(
    armnn::IWorkloadFactory& workloadFactory,
    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
    unsigned int inputSize,
    unsigned int outputSize,
    unsigned int channels,
    unsigned int kernelSize,
    const ConvolutionDescriptor& descriptor,
    bool isDepthwiseConvolution = false)
{
    BOOST_ASSERT(descriptor.m_BiasEnabled == true);
    BOOST_ASSERT(descriptor.m_DataLayout == DataLayout::NHWC);

    // Set up tensor shapes and infos
    const TensorShape inputShape ({1, inputSize, inputSize, channels});
    const TensorShape outputShape({1, outputSize, outputSize, channels});
    const TensorShape kernelShape = isDepthwiseConvolution
        // The format for the depthwise convolution is MIHW
        ? TensorShape({1, channels, kernelSize, kernelSize})
        // The format for the regular convolution depends on the layout of the inputs,
        // which in this case is NHWC
        : TensorShape({1, kernelSize, kernelSize, channels});
    const TensorShape biasesShape({1, 1, 1, channels});

    // NOTE: inputScale * weightsScale / outputScale must be >= 0.0 and < 1.0
    TensorInfo inputInfo(inputShape, DataType::QuantisedAsymm8, 1.0f, 0);
    TensorInfo outputInfo(outputShape, DataType::QuantisedAsymm8, 2.0f, 0);
    TensorInfo weightsInfo(kernelShape, DataType::QuantisedAsymm8, 1.0f, 0);
    TensorInfo biasesInfo(biasesShape, DataType::Signed32, 1.0f, 0);
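    // With the quantisation infos above: inputScale * weightsScale / outputScale
    // = 1.0f * 1.0f / 2.0f = 0.5, which satisfies the constraint in the NOTE.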

    // Populate weight and bias data
    std::vector<uint8_t> weightsData = CreateIdentityConvolutionKernel(kernelSize, channels);

    // NOTE: We need to multiply the elements of the identity kernel by 2
    // to compensate for the scaling factor
    std::transform(weightsData.begin(), weightsData.end(), weightsData.begin(),
                   [](uint8_t w) -> uint8_t { return static_cast<uint8_t>(w * 2); });
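    // Requantising the accumulator multiplies it by inputScale * weightsScale / outputScale = 0.5,
    // so a centre weight of 2 still gives output == input, i.e. the kernel remains an identity.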

    const unsigned int biasDataSize = biasesInfo.GetNumElements();
    std::vector<int32_t> biasesData(biasDataSize, 0);

    // Construct network
    Network network;

    ConstTensor weights(weightsInfo, weightsData);
    ConstTensor biases(biasesInfo, biasesData);

    IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");

    IConnectableLayer* const convolutionLayer =
        PreCompiledConvolutionHelper<ConvolutionDescriptor>
            ::AddConvolutionLayerToNetwork(network, descriptor, weights, biases);

    IConnectableLayer* const outputLayer = network.AddOutputLayer(0, "output");

    inputLayer->GetOutputSlot(0).Connect(convolutionLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    convolutionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
    convolutionLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    // Generate input data: sequence [0, 1 .. 255], wrapping around if there are more than 256 elements
    const unsigned int inputDataSize = inputInfo.GetNumElements();
    std::vector<uint8_t> inputData(inputDataSize);
    std::iota(inputData.begin(), inputData.end(), 0);

    // Set expected output
    std::vector<uint8_t> expectedOutputData =
        GetIdentityConvolutionExpectedOutputData(inputInfo,
                                                 outputInfo,
                                                 descriptor,
                                                 inputData);

    return OptimiseAndRunNetwork(workloadFactory,
                                 network,
                                 inputInfo,
                                 inputData,
                                 outputInfo,
                                 expectedOutputData);
}

LayerTestResult<uint8_t, 4> PreCompiledConvolution2dTestImpl(
    armnn::IWorkloadFactory& workloadFactory,
    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
{
    const unsigned int inputSize = 16;
    const unsigned int outputSize = 16;
    const unsigned int channels = 1;
    const unsigned int kernelSize = 3;
    const unsigned int stride = 1;
    const unsigned int padding = 1;

    Convolution2dDescriptor descriptor =
        CreateConvolutionDescriptor<Convolution2dDescriptor>(stride, padding);

    return PreCompiledConvolution2dTestImpl(workloadFactory,
                                            memoryManager,
                                            inputSize,
                                            outputSize,
                                            channels,
                                            kernelSize,
                                            descriptor);
}

LayerTestResult<uint8_t, 4> PreCompiledConvolution2dStride2x2TestImpl(
    armnn::IWorkloadFactory& workloadFactory,
    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
{
    const unsigned int inputSize = 16;
    const unsigned int outputSize = 8;
    const unsigned int channels = 1;
    const unsigned int kernelSize = 3;
    const unsigned int stride = 2;
    const unsigned int padding = 1;

    Convolution2dDescriptor descriptor =
        CreateConvolutionDescriptor<Convolution2dDescriptor>(stride, padding);

    return PreCompiledConvolution2dTestImpl(workloadFactory,
                                            memoryManager,
                                            inputSize,
                                            outputSize,
                                            channels,
                                            kernelSize,
                                            descriptor);
}

LayerTestResult<uint8_t, 4> PreCompiledDepthwiseConvolution2dTestImpl(
    armnn::IWorkloadFactory& workloadFactory,
    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
{
    const unsigned int inputSize = 16;
    const unsigned int outputSize = 16;
    const unsigned int channels = 3;
    const unsigned int kernelSize = 1;
    const unsigned int stride = 1;
    const unsigned int padding = 0;

    DepthwiseConvolution2dDescriptor descriptor =
        CreateConvolutionDescriptor<DepthwiseConvolution2dDescriptor>(stride, padding);

    return PreCompiledConvolution2dTestImpl(workloadFactory,
                                            memoryManager,
                                            inputSize,
                                            outputSize,
                                            channels,
                                            kernelSize,
                                            descriptor,
                                            true);
}

LayerTestResult<uint8_t, 4> PreCompiledDepthwiseConvolution2dStride2x2TestImpl(
    armnn::IWorkloadFactory& workloadFactory,
    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
{
    const unsigned int inputSize = 16;
    const unsigned int outputSize = 8;
    const unsigned int channels = 3;
    const unsigned int kernelSize = 3;
    const unsigned int stride = 2;
    const unsigned int padding = 1;

    DepthwiseConvolution2dDescriptor descriptor =
        CreateConvolutionDescriptor<DepthwiseConvolution2dDescriptor>(stride, padding);

    return PreCompiledConvolution2dTestImpl(workloadFactory,
                                            memoryManager,
                                            inputSize,
                                            outputSize,
                                            channels,
                                            kernelSize,
                                            descriptor,
                                            true); // depthwise convolution, so use the MIHW kernel shape as in the test above
}

LayerTestResult<uint8_t, 4> PreCompiledMaxPooling2dTestImpl(
    armnn::IWorkloadFactory& workloadFactory,
    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
{
    // Pooling cannot be run in isolation; it must be fused with the previous layer, e.g. Convolution2d.

    // Set up the Convolution descriptor
    Convolution2dDescriptor convDescriptor;
    convDescriptor.m_StrideX = 1;
    convDescriptor.m_StrideY = 1;
    convDescriptor.m_BiasEnabled = true;
    convDescriptor.m_DataLayout = DataLayout::NHWC;

    // Set up the Convolution weights
    TensorInfo weightsInfo(TensorShape({16, 1, 1, 16}), DataType::QuantisedAsymm8, 2.0f, 0);
    const unsigned int weightsDataSize = weightsInfo.GetNumElements();
    std::vector<uint8_t> weightsData(weightsDataSize);
    for (unsigned int i = 0; i < 16; ++i)
    {
        for (unsigned int j = 0; j < 16; ++j)
        {
            // Identity matrix: 1 on the diagonal, 0 elsewhere (stored as quantised uint8_t values)
            weightsData[(i * 16) + j] = (i == j) ? 1u : 0u;
        }
    }
    ConstTensor weights(weightsInfo, weightsData);

    // Set up the Convolution biases
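    // The bias quantisation scale is set to inputScale * weightsScale (1.0f * 2.0f),
    // as required for the biases of a quantised convolution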
    TensorInfo biasInfo(TensorShape({1, 1, 1, 16}), DataType::Signed32, 1.0f * 2.0f, 0);
    const unsigned int biasDataSize = biasInfo.GetNumElements();
    std::vector<int32_t> biasData(biasDataSize, 0);
    ConstTensor biases(biasInfo, biasData);

    // Set up the Convolution input
    TensorInfo inputInfo(TensorShape({1, 16, 16, 16}), DataType::QuantisedAsymm8, 1.0f, 0);
    const unsigned int inputDataSize = inputInfo.GetNumElements();
    std::vector<uint8_t> inputData(inputDataSize);
    for (unsigned int i = 0; i < inputDataSize; ++i)
    {
        inputData[i] = boost::numeric_cast<uint8_t>((i * 4) % 250);
    }

    // Set up the Convolution output / Pooling input info
    TensorInfo convOutputInfo(TensorShape({1, 16, 16, 16}), DataType::QuantisedAsymm8, 4.0f, 0);

    // Set up the Pooling descriptor
    Pooling2dDescriptor poolDescriptor;
    poolDescriptor.m_PoolType = PoolingAlgorithm::Max;
    poolDescriptor.m_PoolWidth = 2;
    poolDescriptor.m_PoolHeight = 2;
    poolDescriptor.m_StrideX = 2;
    poolDescriptor.m_StrideY = 2;
    poolDescriptor.m_PaddingMethod = PaddingMethod::Exclude;
    poolDescriptor.m_DataLayout = DataLayout::NHWC;

    // Set the expected output from the Pooling layer
    TensorInfo outputInfo(TensorShape({1, 8, 8, 16}), DataType::QuantisedAsymm8, 4.0f, 0);
    const unsigned int outputDataSize = outputInfo.GetNumElements();
    std::vector<uint8_t> expectedOutputData(outputDataSize);

    // The Maxpooling inputs are the Convolution outputs, i.e. (Convolution inputs / 2) after scale adjustments
    // Maxpooling selects the max value in each pool from its inputs and our pool size is 2x2
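    // Worked through the quantisation parameters: the real convolution output is 2 * input
    // (the identity weight is quantised as 1 at scale 2.0), and requantising at the conv output
    // scale of 4.0 gives input / 2, which is why the expected value below is maxIn / 2.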
    for (unsigned int channel = 0; channel < 16; ++channel)
    {
        for (unsigned int row = 0; row < 8; ++row)
        {
            for (unsigned int column = 0; column < 8; ++column)
            {
                // The input and output data indexes are calculated for NHWC data layout.
                // Output index: (row * columns * channels) + (column * channels) + channel
                auto outIndex = (row * 8 * 16) + (column * 16) + channel;

                // Input index: (row * strideY * columns * channels) + (column * strideX * channels) + channel
                // and we take 4 entries for the 2x2 pool
                auto in0Index = ((row * 2) * 16 * 16) + ((column * 2) * 16) + channel;
                auto in1Index = ((row * 2) * 16 * 16) + (((column * 2) + 1) * 16) + channel;
                auto in2Index = (((row * 2) + 1) * 16 * 16) + ((column * 2) * 16) + channel;
                auto in3Index = (((row * 2) + 1) * 16 * 16) + (((column * 2) + 1) * 16) + channel;

                // output value is the maximum of the input pool values, adjusted for the quantization scale change
                auto maxIn = std::max<uint8_t>({inputData[in0Index],
                                                inputData[in1Index],
                                                inputData[in2Index],
                                                inputData[in3Index]});
                expectedOutputData[outIndex] = maxIn / 2;
            }
        }
    }

    // Construct the network
    Network net;

    IConnectableLayer* const inputLayer = net.AddInputLayer(0, "input");
    IConnectableLayer* const convLayer = net.AddConvolution2dLayer(convDescriptor, weights, biases, "conv");
    IConnectableLayer* const poolingLayer = net.AddPooling2dLayer(poolDescriptor, "pooling2d");
    IConnectableLayer* const outputLayer = net.AddOutputLayer(0, "output");

    // Connect the layers
    inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
    inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);

    convLayer->GetOutputSlot(0).Connect(poolingLayer->GetInputSlot(0));
    convLayer->GetOutputSlot(0).SetTensorInfo(convOutputInfo);

    poolingLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
    poolingLayer->GetOutputSlot(0).SetTensorInfo(outputInfo);

    return OptimiseAndRunNetwork(workloadFactory,
                                 net,
                                 inputInfo,
                                 inputData,
                                 outputInfo,
                                 expectedOutputData);
}
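
// A minimal usage sketch, assuming a backend test file that provides a workload factory for
// the backend under test and uses the Boost-based helpers from the ArmNN test suite
// (ARMNN_AUTO_TEST_CASE, CompareTensors); the factory type below is hypothetical:
//
//     ARMNN_AUTO_TEST_CASE(PreCompiledConvolution2d, PreCompiledConvolution2dTestImpl)
//
// or, calling the implementation directly:
//
//     SampleBackendWorkloadFactory factory; // hypothetical factory for the pre-compiled backend
//     auto result = PreCompiledConvolution2dTestImpl(factory, nullptr);
//     BOOST_TEST(CompareTensors(result.output, result.outputExpected));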