//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
#include <CommonTestUtils.hpp>
#include <armnn/Descriptors.hpp>
#include <armnn/INetwork.hpp>
#include <armnn/IRuntime.hpp>
#include <Profiling.hpp>
#include <armnnUtils/QuantizeHelper.hpp>
#include <ResolveType.hpp>
#include <doctest/doctest.h>
#include <vector>
namespace
{
using namespace armnn;
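// Builds a simple Input + Constant -> Addition -> Output network, runs it on the given backend(s)
// and returns true if the computed output matches expectedOutputData.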
template<typename T>
bool ConstantUsageTest(const std::vector<BackendId>& computeDevice,
const TensorInfo& commonTensorInfo,
const std::vector<T>& inputData,
const std::vector<T>& constantData,
const std::vector<T>& expectedOutputData)
{
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
IConnectableLayer* constant = net->AddConstantLayer(ConstTensor(commonTensorInfo, constantData));
IConnectableLayer* add = net->AddAdditionLayer();
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(add->GetInputSlot(0));
constant->GetOutputSlot(0).Connect(add->GetInputSlot(1));
add->GetOutputSlot(0).Connect(output->GetInputSlot(0));
// Sets the tensors in the network.
input->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
constant->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
add->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
// optimize the network
IOptimizedNetworkPtr optNet = Optimize(*net, computeDevice, runtime->GetDeviceSpec());
// Loads it into the runtime.
NetworkId netId;
runtime->LoadNetwork(netId, std::move(optNet));
// Creates structures for input & output.
std::vector<T> outputData(inputData.size());
InputTensors inputTensors
{
{0, ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())}
};
OutputTensors outputTensors
{
{0, Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
// Does the inference.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
// Checks the results.
return outputData == expectedOutputData;
}
inline bool ConstantUsageFloat32Test(const std::vector<BackendId>& backends)
{
TensorInfo commonTensorInfo({ 2, 3 }, DataType::Float32);
commonTensorInfo.SetConstant(true);
return ConstantUsageTest(backends,
commonTensorInfo,
std::vector<float>{ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, // Input.
std::vector<float>{ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, // Const input.
std::vector<float>{ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f } // Expected output.
);
}
inline bool ConstantUsageUint8Test(const std::vector<BackendId>& backends)
{
TensorInfo commonTensorInfo({ 2, 3 }, DataType::QAsymmU8);
const float scale = 0.023529f;
const int8_t offset = -43;
commonTensorInfo.SetQuantizationScale(scale);
commonTensorInfo.SetQuantizationOffset(offset);
commonTensorInfo.SetConstant(true);
return ConstantUsageTest(backends,
commonTensorInfo,
armnnUtils::QuantizedVector<uint8_t>({ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, scale, offset), // Input.
armnnUtils::QuantizedVector<uint8_t>({ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, scale, offset), // Const input.
armnnUtils::QuantizedVector<uint8_t>({ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }, scale, offset) // Expected output.
);
}
// Utility function to find the number of instances of a substring within a string.
int SubStringCounter(const std::string& string, const std::string& substring)
{
std::size_t found = 0;
int count = 0;
// Look for the substring starting from where we last found the substring
while((found = string.find(substring, found)) != std::string::npos)
{
count++;
// Offset by substring length to avoid finding the same substring twice
found += substring.length();
}
return count;
}
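// Optimizes and runs a caller-supplied network on the given backends, feeding inputTensorData and comparing
// every element of each output tensor against expectedOutputData within the given tolerance.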
template<DataType ArmnnIType, DataType ArmnnOType,
typename TInput = ResolveType<ArmnnIType>, typename TOutput = ResolveType<ArmnnOType>>
void EndToEndLayerTestImpl(INetworkPtr network,
const std::map<int, std::vector<TInput>>& inputTensorData,
const std::map<int, std::vector<TOutput>>& expectedOutputData,
std::vector<BackendId> backends,
float tolerance = 0.000001f)
{
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// optimize the network
IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());
// Loads it into the runtime.
NetworkId netId;
runtime->LoadNetwork(netId, std::move(optNet));
InputTensors inputTensors;
inputTensors.reserve(inputTensorData.size());
for (auto&& it : inputTensorData)
{
inputTensors.push_back({it.first,
ConstTensor(runtime->GetInputTensorInfo(netId, it.first), it.second.data())});
}
OutputTensors outputTensors;
outputTensors.reserve(expectedOutputData.size());
std::map<int, std::vector<TOutput>> outputStorage;
for (auto&& it : expectedOutputData)
{
std::vector<TOutput> out(it.second.size());
outputStorage.emplace(it.first, out);
outputTensors.push_back({it.first,
Tensor(runtime->GetOutputTensorInfo(netId, it.first),
outputStorage.at(it.first).data())});
}
// Does the inference.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
// Checks the results.
for (auto&& it : expectedOutputData)
{
std::vector<TOutput> out = outputStorage.at(it.first);
for (unsigned int i = 0; i < out.size(); ++i)
{
CHECK_MESSAGE(Compare<ArmnnOType>(it.second[i], out[i], tolerance) == true,
"Actual output: " << out[i] << ". Expected output: " << it.second[i]);
}
}
}
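// Builds an Input -> Square activation -> Output network with import enabled and checks that passing a
// misaligned input pointer makes EnqueueWorkload throw a MemoryImportException.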
inline void ImportNonAlignedInputPointerTest(std::vector<BackendId> backends)
{
using namespace armnn;
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(armnn::IRuntime::Create(options));
// build up the structure of the network
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
// Optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
CHECK(optNet);
// Loads it into the runtime.
NetworkId netId;
std::string ignoredErrorMessage;
// Enable Importing
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Undefined);
runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
// Misaligned input
float* misalignedInputData = reinterpret_cast<float*>(reinterpret_cast<char*>(inputData.data()) + 1);
std::vector<float> outputData(4);
// Aligned output
float* alignedOutputData = outputData.data();
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputData)},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputData)}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
// Do the inference and expect it to fail with a MemoryImportException
CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryImportException);
}
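// Builds the same Square network with import and export enabled and checks that a misaligned output pointer
// makes EnqueueWorkload throw (MemoryImportException on CpuAcc, MemoryExportException on other backends).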
inline void ExportNonAlignedOutputPointerTest(std::vector<BackendId> backends)
{
using namespace armnn;
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(armnn::IRuntime::Create(options));
// build up the structure of the network
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
// Optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
optimizedOptions.m_ExportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
CHECK(optNet);
// Loads it into the runtime.
NetworkId netId;
std::string ignoredErrorMessage;
// Enable Importing and Exporting
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f, 5.0f
};
// Aligned input
float* alignedInputData = inputData.data();
std::vector<float> outputData(5);
// Misaligned output
float* misalignedOutputData = reinterpret_cast<float*>(reinterpret_cast<char*>(outputData.data()) + 1);
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputData)},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputData)}
};
// Do the inference and expect it to fail with a MemoryExportException
if (backends[0] == Compute::CpuAcc)
{
// For CpuAcc the NeonTensorHandle will throw its own exception on misaligned memory
CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryImportException);
}
else
{
CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryExportException);
}
}
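// Runs the Square network with import and export enabled using aligned buffers and checks, via the profiler
// dump, that the tensors were imported (SyncMemGeneric present, no CopyMemGeneric) and the output is correct.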
inline void ImportAlignedPointerTest(std::vector<BackendId> backends)
{
using namespace armnn;
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(armnn::IRuntime::Create(options));
// build up the structure of the network
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
// Optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
optimizedOptions.m_ExportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
CHECK(optNet);
// Loads it into the runtime.
NetworkId netId;
std::string ignoredErrorMessage;
// Enable Importing and Exporting
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
// Do the inference
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// Contains ActivationWorkload
std::size_t found = dump.find("ActivationWorkload");
CHECK(found != std::string::npos);
// Contains SyncMemGeneric
found = dump.find("SyncMemGeneric");
CHECK(found != std::string::npos);
// Does not contain CopyMemGeneric
found = dump.find("CopyMemGeneric");
CHECK(found == std::string::npos);
// Check output is as expected
CHECK(outputData == expectedOutput);
}
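// Runs the Square network with only import enabled: the input should be imported and the output copied, so the
// profiler dump should show no SyncMemGeneric and exactly one CopyMemGeneric.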
inline void ImportOnlyWorkload(std::vector<BackendId> backends)
{
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
// optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
INFO("Get Profiler");
runtime->GetProfiler(netId)->EnableProfiling(true);
INFO("Run Inference");
// Do the inference
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
INFO("Print Profiler");
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// Check there are no SyncMemGeneric workloads as we didn't export
INFO("Find SyncMemGeneric");
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should only be 1 CopyMemGeneric, for the output, as the input was imported
INFO("Find CopyMemGeneric");
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 1);
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}
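// Runs the Square network with only export enabled: the output should be exported and the input copied, so the
// profiler dump should show one SyncMemGeneric and one CopyMemGeneric.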
inline void ExportOnlyWorkload(std::vector<BackendId> backends)
{
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
// optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ExportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Malloc);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
INFO("Get Profiler");
runtime->GetProfiler(netId)->EnableProfiling(true);
INFO("Run Inference");
// Do the inference
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
INFO("Print Profiler");
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// Check there is a SyncMemGeneric workload as we exported
INFO("Find SyncMemGeneric");
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 1);
// Should be 1 CopyMemGeneric, for the input, as we did not import it
INFO("Find CopyMemGeneric");
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 1);
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}
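// Runs the Square network with both import and export enabled: all tensors should be imported/exported, so the
// profiler dump should show one SyncMemGeneric and no CopyMemGeneric.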
inline void ImportAndExportWorkload(std::vector<BackendId> backends)
{
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
optimizedOptions.m_ExportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
INFO("Get Profiler");
runtime->GetProfiler(netId)->EnableProfiling(true);
INFO("Run Inference");
// Do the inference
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
INFO("Print Profiler");
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// Check there is a SyncMemGeneric workload as we exported
INFO("Find SyncMemGeneric");
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 1);
// Shouldn't be any CopyMemGeneric workloads
INFO("Find CopyMemGeneric");
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 0);
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}
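// Connects one activation output slot to two output layers; a multiply-connected output slot is not exported,
// so the profiler dump should show CopyMemGeneric rather than SyncMemGeneric while both output buffers still
// hold the expected result.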
inline void ExportOutputWithSeveralOutputSlotConnectionsTest(std::vector<BackendId> backends)
{
using namespace armnn;
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(armnn::IRuntime::Create(options));
// build up the structure of the network
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output0 = net->AddOutputLayer(0);
IConnectableLayer* output1 = net->AddOutputLayer(1);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output0->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output1->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 1 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 1 }, DataType::Float32));
// Optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
optimizedOptions.m_ExportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
// Loads it into the runtime.
NetworkId netId;
std::string ignoredErrorMessage;
// Enable Importing and Exporting
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData0(4);
std::vector<float> outputData1(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData0.data())},
{1,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 1), outputData1.data())}
};
// The key check is on the profiling log: because the output slot feeds two output layers it cannot be
// exported, so CopyMemGeneric workloads are expected and SyncMemGeneric should not appear.
runtime->GetProfiler(netId)->EnableProfiling(true);
// Do the inference
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
std::size_t found = std::string::npos;
if (backends[0] == Compute::CpuRef)
{
found = dump.find("RefActivationWorkload");
}
else if (backends[0] == Compute::CpuAcc)
{
found = dump.find("NeonActivationWorkload");
}
else if (backends[0] == Compute::GpuAcc)
{
found = dump.find("ClActivationWorkload");
}
CHECK(found != std::string::npos);
// Does not contain SyncMemGeneric
found = dump.find("SyncMemGeneric");
CHECK(found == std::string::npos);
// Contains CopyMemGeneric
found = dump.find("CopyMemGeneric");
CHECK(found != std::string::npos);
// Check that the outputs are correct
CHECK(std::equal(outputData0.begin(), outputData0.end(),
expectedOutput.begin(), expectedOutput.end()));
CHECK(std::equal(outputData1.begin(), outputData1.end(),
expectedOutput.begin(), expectedOutput.end()));
}
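// Builds a StridedSlice whose descriptor produces a slice that cannot fit the declared output shape and checks
// that Optimize throws a LayerValidationException.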
inline void StridedSliceInvalidSliceEndToEndTest(std::vector<BackendId> backends)
{
using namespace armnn;
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(armnn::IRuntime::Create(options));
// build up the structure of the network
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
// Configure a strided slice with a stride the same size as the input but with a ShrinkAxisMask on the first
// dim of the output to make it too small to hold the specified slice.
StridedSliceDescriptor descriptor;
descriptor.m_Begin = {0, 0};
descriptor.m_End = {2, 3};
descriptor.m_Stride = {1, 1};
descriptor.m_BeginMask = 0;
descriptor.m_EndMask = 0;
descriptor.m_ShrinkAxisMask = 1;
IConnectableLayer* stridedSlice = net->AddStridedSliceLayer(descriptor);
IConnectableLayer* output0 = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(stridedSlice->GetInputSlot(0));
stridedSlice->GetOutputSlot(0).Connect(output0->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 2, 3 }, DataType::Float32, 0.0f, 0, true));
stridedSlice->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 3 }, DataType::Float32));
// Attempt to optimize the network and check that the correct exception is thrown
CHECK_THROWS_AS(Optimize(*net, backends, runtime->GetDeviceSpec()), armnn::LayerValidationException);
}
inline void ForceImportWithAlignedBuffersEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this case all inputs and outputs should be imported.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
// Check our input and output pointers are actually aligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is aligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
if (backends[0] == Compute::CpuAcc)
{
// Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
// reconfigure is implemented
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should be 2 CopyMemGeneric workloads
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 2);
}
else
{
// Check there is a SyncMemGeneric workload as we exported
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 1);
// Shouldn't be any CopyMemGeneric workloads
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 0);
}
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}
inline void ForceImportWithMisalignedInputBuffersEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this case only the output should be imported.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
// this guarantees that the resultant buffer is misaligned and thus should always be copied.
auto memPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedMemPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(memPtr) + 1);
// Check if our pointer is truly misaligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK (reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::memcpy(misalignedMemPtr, inputData.data(), 4*sizeof(float));
std::vector<float> outputData(4);
// Check our output buffer is aligned
CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedMemPtr)},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is misaligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
// new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
// for imports/copies. Only that the output is correct.
if (backends[0] != Compute::GpuAcc)
{
if (backends[0] == Compute::CpuAcc)
{
// Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
// reconfigure is implemented
// We should get 0 SyncMemGeneric for the Output
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should be 2 CopyMemGeneric as we copied the input
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 2);
}
else
{
// We should get 1 SyncMemGeneric for the Output
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 1);
// Should only be 1 CopyMemGeneric as we copied the input
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 1);
}
}
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
std::free(memPtr);
}
inline void ForceImportWithMisalignedOutputBuffersEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this case only the input should be imported.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
// this guarantees that the resultant buffer is misaligned and thus should always be copied.
auto memPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedMemPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(memPtr) + 1);
// Check if our pointer is truly misaligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK (reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
// Check our input buffer is aligned
CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedMemPtr)}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is misaligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
// new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
// for imports/copies. Only that the output is correct.
if (backends[0] != Compute::GpuAcc)
{
// Even though we Imported the Input we still shouldn't have a SyncMemGeneric
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should only be 1 CopyMemGeneric as we copied the output
count = SubStringCounter(dump, "CopyMemGeneric");
if (backends[0] == Compute::CpuAcc)
{
// Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
// reconfigure is implemented
CHECK(count == 2);
}
else
{
CHECK(count == 1);
}
}
// Check the output is correct
unsigned int index = 0;
std::vector<float> outputData(expectedOutput.size(), 0);
std::memcpy(outputData.data(), misalignedMemPtr, expectedOutput.size() * sizeof(float));
for (auto outputValue : expectedOutput)
{
CHECK(outputValue == outputData[index]);
++index;
}
std::free(memPtr);
}
inline void ForceImportWithMisalignedInputAndOutputBuffersEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this case all inputs and outputs should be copied.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
// this guarantees that the resultant buffer is misaligned and thus should always be copied.
auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
// Check if our pointer is truly misaligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::memcpy(misalignedInputPtr, inputData.data(), 4*sizeof(float));
auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
// Check if our pointer is truly misaligned
CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is misaligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
// new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
// for imports/copies. Only that the output is correct.
if (backends[0] != Compute::GpuAcc)
{
// We can only copy so there should be no SyncMemGeneric
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should only be CopyMemGeneric workloads as we copied all buffers
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 2);
}
// Check the output is correct
unsigned int index = 0;
std::vector<float> outputData(expectedOutput.size(), 0);
std::memcpy(outputData.data(), misalignedOutputPtr, expectedOutput.size() * sizeof(float));
for (auto expectedValue : expectedOutput)
{
CHECK(expectedValue == outputData[index]);
++index;
}
std::free(inputMemPtr);
std::free(outputMemPtr);
}
inline void ForceImportRepeatedInferencesEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this test we create some aligned buffers, import them into a network, and validate the output and the number
* of SyncMemGeneric/CopyMemGeneric workloads. Then we try the same network again with misaligned buffers to make
* sure it falls back to copying correctly.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
// Check our input and output pointers are actually aligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is aligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
std::string dump = ss.str();
if (backends[0] == Compute::CpuAcc)
{
// Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
// reconfigure is implemented
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should be at least 1 CopyMemGeneric workload
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count >= 1);
}
else
{
// Check there is at least 1 SyncMemGeneric workload as we exported
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count >= 1);
// Shouldn't be any CopyMemGeneric workloads
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 0);
}
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
// This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
// this guarantees that the resultant buffer is misaligned and thus should always be copied.
auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
// Check if our pointer is truly misaligned
CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
std::vector<float> inputValues
{
2.0f, 3.0f, 4.0f, 5.0f
};
std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size()*sizeof(float));
auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
// Check if our pointer is truly misaligned
CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
std::vector<float> expectedMisalignedOutput
{
4.0f, 9.0f, 16.0f, 25.0f
};
INFO("Create Second Inference");
InputTensors inputTensorsMisaligned
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
};
OutputTensors outputTensorsMisaligned
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
};
importedInputIds = runtime->ImportInputs(netId, inputTensorsMisaligned, MemorySource::Malloc);
importedOutputIds = runtime->ImportOutputs(netId, outputTensorsMisaligned, MemorySource::Malloc);
// Do the inference and force the import as the memory is misaligned.
runtime->EnqueueWorkload(netId,
inputTensorsMisaligned,
outputTensorsMisaligned,
importedInputIds,
importedOutputIds);
// Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
// We need to use AnalyzeEventsAndWriteResults here to make sure the second inference has been profiled
profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
dump = ss.str();
// GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
// new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
// for imports/copies. Only that the output is correct.
if (backends[0] != Compute::GpuAcc)
{
// The SyncMemGeneric will still be in the profiling log from the first inference
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count >= 1);
// We should now see CopyMemGeneric workloads as we copied all buffers
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count >= 1);
}
// Check the output is correct
unsigned int index = 0;
std::vector<float> alignedOutputData(expectedMisalignedOutput.size(), 0);
std::memcpy(alignedOutputData.data(), misalignedOutputPtr, expectedMisalignedOutput.size() * sizeof(float));
for (auto outputValue : expectedMisalignedOutput)
{
CHECK(outputValue == alignedOutputData[index]);
++index;
}
// Clean up to avoid interfering with other tests
runtime->UnloadNetwork(netId);
std::free(inputMemPtr);
std::free(outputMemPtr);
}
inline void ForceImportRepeatedInferencesInvertedEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this test we create some misaligned buffers, copy them into a network, and validate the output and the number
* of SyncMemGeneric/CopyMemGeneric workloads. Then we try the same network again with aligned buffers to make sure
* it switches to importing correctly.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
// this guarantees that the resultant buffer is misaligned and thus should always be copied.
auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
// Check if our pointer is truly misaligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
std::vector<float> inputValues
{
2.0f, 3.0f, 4.0f, 5.0f
};
std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size() * sizeof(float));
auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
// Check if our pointer is truly misaligned
CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
std::vector<float> expectedMisalignedOutput
{
4.0f, 9.0f, 16.0f, 25.0f
};
INFO("Create Second Inference");
InputTensors inputTensorsMisaligned
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
};
OutputTensors outputTensorsMisaligned
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensorsMisaligned, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensorsMisaligned, MemorySource::Malloc);
// Do the inference and force the import as the memory is misaligned.
runtime->EnqueueWorkload(netId,
inputTensorsMisaligned,
outputTensorsMisaligned,
importedInputIds,
importedOutputIds);
// Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
std::string dump = ss.str();
// GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
// new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
// for imports/copies. Only that the output is correct.
if (backends[0] != Compute::GpuAcc)
{
// We can only copy so there should be no SyncMemGeneric
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should only be CopyMemGeneric workloads as we copied all buffers
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count >= 1);
}
// Check the output is correct
unsigned int index = 0;
std::vector<float> alignedOutput(expectedMisalignedOutput.size());
std::memcpy(alignedOutput.data(), misalignedOutputPtr, expectedMisalignedOutput.size()*sizeof(float));
for (auto outputValue : expectedMisalignedOutput)
{
CHECK(outputValue == alignedOutput[index]);
++index;
}
std::free(inputMemPtr);
std::free(outputMemPtr);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
// Check our input and output pointers are actually aligned
CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
importedInputIds = runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
importedOutputIds = runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is aligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
// We need to use AnalyzeEventsAndWriteResults here to make sure the second inference has been profiled
profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
dump = ss.str();
if (backends[0] == Compute::CpuAcc)
{
// Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
// reconfigure is implemented
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should be at least 1 CopyMemGeneric workload
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count >= 1);
}
else
{
// Repeated inferences make it difficult to check for an accurate count. So we just validate that we have a
// SyncMemGeneric Workload when we previously didn't
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count >= 1);
// Should still be some CopyMemGeneric Workloads from the last inference
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count >= 1);
}
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
// Clean up to avoid interfering with other tests
runtime->UnloadNetwork(netId);
}
} // anonymous namespace