//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//
#pragma once
#include <CommonTestUtils.hpp>
#include <armnn/Descriptors.hpp>
#include <armnn/INetwork.hpp>
#include <armnn/IRuntime.hpp>
#include <Profiling.hpp>
#include <armnnUtils/QuantizeHelper.hpp>
#include <ResolveType.hpp>
#include <doctest/doctest.h>
#include <vector>
namespace
{
using namespace armnn;
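// Builds a simple Input + Constant -> Addition -> Output network, runs it on the given backend(s)
// and returns true if the computed output matches expectedOutputData.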
template<typename T>
bool ConstantUsageTest(const std::vector<BackendId>& computeDevice,
const TensorInfo& commonTensorInfo,
const std::vector<T>& inputData,
const std::vector<T>& constantData,
const std::vector<T>& expectedOutputData)
{
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
IConnectableLayer* constant = net->AddConstantLayer(ConstTensor(commonTensorInfo, constantData));
IConnectableLayer* add = net->AddAdditionLayer();
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(add->GetInputSlot(0));
constant->GetOutputSlot(0).Connect(add->GetInputSlot(1));
add->GetOutputSlot(0).Connect(output->GetInputSlot(0));
// Sets the tensors in the network.
input->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
constant->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
add->GetOutputSlot(0).SetTensorInfo(commonTensorInfo);
// optimize the network
IOptimizedNetworkPtr optNet = Optimize(*net, computeDevice, runtime->GetDeviceSpec());
// Loads it into the runtime.
NetworkId netId;
runtime->LoadNetwork(netId, std::move(optNet));
// Creates structures for input & output.
std::vector<T> outputData(inputData.size());
InputTensors inputTensors
{
{0, ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())}
};
OutputTensors outputTensors
{
{0, Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
// Does the inference.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
// Checks the results.
return outputData == expectedOutputData;
}
inline bool ConstantUsageFloat32Test(const std::vector<BackendId>& backends)
{
TensorInfo commonTensorInfo({ 2, 3 }, DataType::Float32);
commonTensorInfo.SetConstant(true);
return ConstantUsageTest(backends,
commonTensorInfo,
std::vector<float>{ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, // Input.
std::vector<float>{ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, // Const input.
std::vector<float>{ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f } // Expected output.
);
}
inline bool ConstantUsageUint8Test(const std::vector<BackendId>& backends)
{
TensorInfo commonTensorInfo({ 2, 3 }, DataType::QAsymmU8);
const float scale = 0.023529f;
const int8_t offset = -43;
commonTensorInfo.SetQuantizationScale(scale);
commonTensorInfo.SetQuantizationOffset(offset);
commonTensorInfo.SetConstant(true);
return ConstantUsageTest(backends,
commonTensorInfo,
armnnUtils::QuantizedVector<uint8_t>({ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, scale, offset), // Input.
armnnUtils::QuantizedVector<uint8_t>({ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, scale, offset), // Const input.
armnnUtils::QuantizedVector<uint8_t>({ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }, scale, offset) // Expected output.
);
}
// Utility function to find the number of instances of a substring within a string.
int SubStringCounter(const std::string& string, const std::string& substring)
{
std::size_t found = 0;
int count = 0;
// Look for the substring starting from where we last found the substring
while((found = string.find(substring, found)) != std::string::npos)
{
count++;
// Offset by substring length to avoid finding the same substring twice
found += substring.length();
}
return count;
}
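// Optimizes and runs a caller-supplied network on the given backends, feeding inputTensorData and comparing
// every element of each output tensor against expectedOutputData within the given tolerance.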
template<DataType ArmnnIType, DataType ArmnnOType,
typename TInput = ResolveType<ArmnnIType>, typename TOutput = ResolveType<ArmnnOType>>
void EndToEndLayerTestImpl(INetworkPtr network,
const std::map<int, std::vector<TInput>>& inputTensorData,
const std::map<int, std::vector<TOutput>>& expectedOutputData,
std::vector<BackendId> backends,
float tolerance = 0.000001f)
{
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// optimize the network
IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());
// Loads it into the runtime.
NetworkId netId;
runtime->LoadNetwork(netId, std::move(optNet));
InputTensors inputTensors;
inputTensors.reserve(inputTensorData.size());
for (auto&& it : inputTensorData)
{
inputTensors.push_back({it.first,
ConstTensor(runtime->GetInputTensorInfo(netId, it.first), it.second.data())});
}
OutputTensors outputTensors;
outputTensors.reserve(expectedOutputData.size());
std::map<int, std::vector<TOutput>> outputStorage;
for (auto&& it : expectedOutputData)
{
std::vector<TOutput> out(it.second.size());
outputStorage.emplace(it.first, out);
outputTensors.push_back({it.first,
Tensor(runtime->GetOutputTensorInfo(netId, it.first),
outputStorage.at(it.first).data())});
}
// Does the inference.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
// Checks the results.
for (auto&& it : expectedOutputData)
{
std::vector<TOutput> out = outputStorage.at(it.first);
for (unsigned int i = 0; i < out.size(); ++i)
{
CHECK_MESSAGE(Compare<ArmnnOType>(it.second[i], out[i], tolerance) == true,
"Actual output: " << out[i] << ". Expected output: " << it.second[i]);
}
}
}
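// Builds an Input -> Square activation -> Output network with import enabled and checks that passing a
// misaligned input pointer makes EnqueueWorkload throw a MemoryImportException.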
inline void ImportNonAlignedInputPointerTest(std::vector<BackendId> backends)
{
using namespace armnn;
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(armnn::IRuntime::Create(options));
// build up the structure of the network
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
// Optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
CHECK(optNet);
// Loads it into the runtime.
NetworkId netId;
std::string ignoredErrorMessage;
// Enable Importing
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Undefined);
runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
// Misaligned input
float* misalignedInputData = reinterpret_cast<float*>(reinterpret_cast<char*>(inputData.data()) + 1);
std::vector<float> outputData(4);
// Aligned output
float* alignedOutputData = outputData.data();
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputData)},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputData)}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
// Do the inference and expect it to fail with a MemoryImportException
CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryImportException);
}
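// Builds the same Square network with import and export enabled and checks that a misaligned output pointer
// makes EnqueueWorkload throw (MemoryImportException on CpuAcc, MemoryExportException on other backends).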
inline void ExportNonAlignedOutputPointerTest(std::vector<BackendId> backends)
{
using namespace armnn;
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(armnn::IRuntime::Create(options));
// build up the structure of the network
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
// Optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
optimizedOptions.m_ExportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
CHECK(optNet);
// Loads it into the runtime.
NetworkId netId;
std::string ignoredErrorMessage;
// Enable Importing and Exporting
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f, 5.0f
};
// Aligned input
float* alignedInputData = inputData.data();
std::vector<float> outputData(5);
// Misaligned output
float* misalignedOutputData = reinterpret_cast<float*>(reinterpret_cast<char*>(outputData.data()) + 1);
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputData)},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputData)}
};
// Do the inference and expect it to fail with a MemoryExportException
if (backends[0] == Compute::CpuAcc)
{
// For CpuAcc the NeonTensorHandle will throw its own exception on misaligned memory
CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryImportException);
}
else
{
CHECK_THROWS_AS(runtime->EnqueueWorkload(netId, inputTensors, outputTensors), MemoryExportException);
}
}
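// Runs the Square network with import and export enabled using aligned buffers and checks, via the profiler
// dump, that the tensors were imported (SyncMemGeneric present, no CopyMemGeneric) and the output is correct.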
inline void ImportAlignedPointerTest(std::vector<BackendId> backends)
{
using namespace armnn;
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(armnn::IRuntime::Create(options));
// build up the structure of the network
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
// Optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
optimizedOptions.m_ExportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
CHECK(optNet);
// Loads it into the runtime.
NetworkId netId;
std::string ignoredErrorMessage;
// Enable Importing and Exporting
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
// Do the inference
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// Contains ActivationWorkload
std::size_t found = dump.find("ActivationWorkload");
CHECK(found != std::string::npos);
// Contains SyncMemGeneric
found = dump.find("SyncMemGeneric");
CHECK(found != std::string::npos);
// Does not contain CopyMemGeneric
found = dump.find("CopyMemGeneric");
CHECK(found == std::string::npos);
// Check output is as expected
CHECK(outputData == expectedOutput);
}
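// Runs the Square network with only import enabled: the input should be imported and the output copied, so the
// profiler dump should show no SyncMemGeneric and exactly one CopyMemGeneric.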
inline void ImportOnlyWorkload(std::vector<BackendId> backends)
{
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
// optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
INFO("Get Profiler");
runtime->GetProfiler(netId)->EnableProfiling(true);
INFO("Run Inference");
// Do the inference
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
INFO("Print Profiler");
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// Check there are no SyncMemGeneric workloads as we didn't export
INFO("Find SyncMemGeneric");
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should only be 1 CopyMemGeneric, for the output, as the input was imported
INFO("Find CopyMemGeneric");
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 1);
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}
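// Runs the Square network with only export enabled: the output should be exported and the input copied, so the
// profiler dump should show one SyncMemGeneric and one CopyMemGeneric.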
inline void ExportOnlyWorkload(std::vector<BackendId> backends)
{
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
// optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ExportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Malloc);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
INFO("Get Profiler");
runtime->GetProfiler(netId)->EnableProfiling(true);
INFO("Run Inference");
// Do the inference
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
INFO("Print Profiler");
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// Check there is a SyncMemGeneric workload as we exported
INFO("Find SyncMemGeneric");
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 1);
// Should be 1 CopyMemGeneric, for the input, as we did not import it
INFO("Find CopyMemGeneric");
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 1);
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}
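// Runs the Square network with both import and export enabled: all tensors should be imported/exported, so the
// profiler dump should show one SyncMemGeneric and no CopyMemGeneric.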
inline void ImportAndExportWorkload(std::vector<BackendId> backends)
{
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
optimizedOptions.m_ExportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
INFO("Get Profiler");
runtime->GetProfiler(netId)->EnableProfiling(true);
INFO("Run Inference");
// Do the inference
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
INFO("Print Profiler");
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// Check there is a SyncMemGeneric workload as we exported
INFO("Find SyncMemGeneric");
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 1);
// Shouldn't be any CopyMemGeneric workloads
INFO("Find CopyMemGeneric");
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 0);
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}
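// Connects one activation output slot to two output layers; a multiply-connected output slot is not exported,
// so the profiler dump should show CopyMemGeneric rather than SyncMemGeneric while both output buffers still
// hold the expected result.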
inline void ExportOutputWithSeveralOutputSlotConnectionsTest(std::vector<BackendId> backends)
{
using namespace armnn;
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(armnn::IRuntime::Create(options));
// build up the structure of the network
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activation = net->AddActivationLayer(descriptor);
IConnectableLayer* output0 = net->AddOutputLayer(0);
IConnectableLayer* output1 = net->AddOutputLayer(1);
input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output0->GetInputSlot(0));
activation->GetOutputSlot(0).Connect(output1->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 1 }, DataType::Float32, 0.0f, 0, true));
activation->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 1 }, DataType::Float32));
// Optimize the network
OptimizerOptions optimizedOptions;
optimizedOptions.m_ImportEnabled = true;
optimizedOptions.m_ExportEnabled = true;
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optimizedOptions);
// Loads it into the runtime.
NetworkId netId;
std::string ignoredErrorMessage;
// Enable Importing and Exporting
INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData0(4);
std::vector<float> outputData1(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData0.data())},
{1,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 1), outputData1.data())}
};
// The key check is on the profiling log: because the output slot feeds two output layers it cannot be
// exported, so CopyMemGeneric workloads are expected and SyncMemGeneric should not appear.
runtime->GetProfiler(netId)->EnableProfiling(true);
// Do the inference
runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
std::size_t found = std::string::npos;
if (backends[0] == Compute::CpuRef)
{
found = dump.find("RefActivationWorkload");
}
else if (backends[0] == Compute::CpuAcc)
{
found = dump.find("NeonActivationWorkload");
}
else if (backends[0] == Compute::GpuAcc)
{
found = dump.find("ClActivationWorkload");
}
CHECK(found != std::string::npos);
// Does not contain SyncMemGeneric
found = dump.find("SyncMemGeneric");
CHECK(found == std::string::npos);
// Contains CopyMemGeneric
found = dump.find("CopyMemGeneric");
CHECK(found != std::string::npos);
// Check that the outputs are correct
CHECK(std::equal(outputData0.begin(), outputData0.end(),
expectedOutput.begin(), expectedOutput.end()));
CHECK(std::equal(outputData1.begin(), outputData1.end(),
expectedOutput.begin(), expectedOutput.end()));
}
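// Builds a StridedSlice whose descriptor produces a slice that cannot fit the declared output shape and checks
// that Optimize throws a LayerValidationException.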
inline void StridedSliceInvalidSliceEndToEndTest(std::vector<BackendId> backends)
{
using namespace armnn;
// Create runtime in which test will run
IRuntime::CreationOptions options;
IRuntimePtr runtime(armnn::IRuntime::Create(options));
// build up the structure of the network
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
// Configure a strided slice with a stride the same size as the input but with a ShrinkAxisMask on the first
// dim of the output to make it too small to hold the specified slice.
StridedSliceDescriptor descriptor;
descriptor.m_Begin = {0, 0};
descriptor.m_End = {2, 3};
descriptor.m_Stride = {1, 1};
descriptor.m_BeginMask = 0;
descriptor.m_EndMask = 0;
descriptor.m_ShrinkAxisMask = 1;
IConnectableLayer* stridedSlice = net->AddStridedSliceLayer(descriptor);
IConnectableLayer* output0 = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(stridedSlice->GetInputSlot(0));
stridedSlice->GetOutputSlot(0).Connect(output0->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 2, 3 }, DataType::Float32, 0.0f, 0, true));
stridedSlice->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 3 }, DataType::Float32));
// Attempt to optimize the network and check that the correct exception is thrown
CHECK_THROWS_AS(Optimize(*net, backends, runtime->GetDeviceSpec()), armnn::LayerValidationException);
}
inline void ForceImportWithAlignedBuffersEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this case all inputs and outputs should be imported.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
// Check our input and output pointers are actually aligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is aligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
if (backends[0] == Compute::CpuAcc)
{
// Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
// reconfigure is implemented
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should be 2 CopyMemGeneric workloads
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 2);
}
else
{
// Check there is a SyncMemGeneric workload as we exported
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 1);
// Shouldn't be any CopyMemGeneric workloads
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 0);
}
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
}
inline void ForceImportWithMisalignedInputBuffersEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this case only the output should be imported.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
// this guarantees that the resultant buffer is misaligned and thus should always be copied.
auto memPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedMemPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(memPtr) + 1);
// Check if our pointer is truly misaligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK (reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::memcpy(misalignedMemPtr, inputData.data(), 4*sizeof(float));
std::vector<float> outputData(4);
// Check our output buffer is aligned
CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedMemPtr)},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is misaligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
// new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
// for imports/copies. Only that the output is correct.
if (backends[0] != Compute::GpuAcc)
{
if (backends[0] == Compute::CpuAcc)
{
// Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
// reconfigure is implemented
// We should get 0 SyncMemGeneric for the Output
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should be 2 CopyMemGeneric as we copied the input
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 2);
}
else
{
// We should get 1 SyncMemGeneric for the Output
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 1);
// Should only be 1 CopyMemGeneric as we copied the input
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 1);
}
}
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
std::free(memPtr);
}
inline void ForceImportWithMisalignedOutputBuffersEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this case only the input should be imported.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
// this guarantees that the resultant buffer is misaligned and thus should always be copied.
auto memPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedMemPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(memPtr) + 1);
// Check if our pointer is truly misaligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK (reinterpret_cast<uintptr_t>(misalignedMemPtr) % alignment);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
// Check our input buffer is aligned
CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedMemPtr)}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is misaligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
// new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
// for imports/copies. Only that the output is correct.
if (backends[0] != Compute::GpuAcc)
{
// Even though we Imported the Input we still shouldn't have a SyncMemGeneric
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should only be 1 CopyMemGeneric as we copied the output
count = SubStringCounter(dump, "CopyMemGeneric");
if (backends[0] == Compute::CpuAcc)
{
// Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
// reconfigure is implemented
CHECK(count == 2);
}
else
{
CHECK(count == 1);
}
}
// Check the output is correct
unsigned int index = 0;
std::vector<float> outputData(expectedOutput.size(), 0);
std::memcpy(outputData.data(), misalignedMemPtr, expectedOutput.size() * sizeof(float));
for (auto outputValue : expectedOutput)
{
CHECK(outputValue == outputData[index]);
++index;
}
std::free(memPtr);
}
inline void ForceImportWithMisalignedInputAndOutputBuffersEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this case all inputs and outputs should be copied.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
// this guarantees that the resultant buffer is misaligned and thus should always be copied.
auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
// Check if our pointer is truly misaligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::memcpy(misalignedInputPtr, inputData.data(), 4*sizeof(float));
auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
// Check if our pointer is truly misaligned
CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is misaligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.Print() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->Print(ss);
std::string dump = ss.str();
// GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
// new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
// for imports/copies. Only that the output is correct.
if (backends[0] != Compute::GpuAcc)
{
// We can only copy so there should be no SyncMemGeneric
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should only be CopyMemGeneric workloads as we copied all buffers
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 2);
}
// Check the output is correct
unsigned int index = 0;
std::vector<float> outputData(expectedOutput.size(), 0);
std::memcpy(outputData.data(), misalignedOutputPtr, expectedOutput.size() * sizeof(float));
for (auto expectedValue : expectedOutput)
{
CHECK(expectedValue == outputData[index]);
++index;
}
std::free(inputMemPtr);
std::free(outputMemPtr);
}
inline void ForceImportRepeatedInferencesEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this test we create some aligned buffers, import them into a network, and validate the output and the number
* of SyncMemGeneric/CopyMemGeneric workloads. Then we try the same network again with misaligned buffers to make
* sure it falls back to copying correctly.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
// Check our input and output pointers are actually aligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is aligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
std::string dump = ss.str();
if (backends[0] == Compute::CpuAcc)
{
// Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
// reconfigure is implemented
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should be at least 1 CopyMemGeneric workload
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count >= 1);
}
else
{
// Check there is at least 1 SyncMemGeneric workload as we exported
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count >= 1);
// Shouldn't be any CopyMemGeneric workloads
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count == 0);
}
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
// This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
// this guarantees that the resultant buffer is misaligned and thus should always be copied.
auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
// Check if our pointer is truly misaligned
CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
std::vector<float> inputValues
{
2.0f, 3.0f, 4.0f, 5.0f
};
std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size()*sizeof(float));
auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
// Check if our pointer is truly misaligned
CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
std::vector<float> expectedMisalignedOutput
{
4.0f, 9.0f, 16.0f, 25.0f
};
INFO("Create Second Inference");
InputTensors inputTensorsMisaligned
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
};
OutputTensors outputTensorsMisaligned
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
};
importedInputIds = runtime->ImportInputs(netId, inputTensorsMisaligned, MemorySource::Malloc);
importedOutputIds = runtime->ImportOutputs(netId, outputTensorsMisaligned, MemorySource::Malloc);
// Do the inference and force the import as the memory is misaligned.
runtime->EnqueueWorkload(netId,
inputTensorsMisaligned,
outputTensorsMisaligned,
importedInputIds,
importedOutputIds);
// Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
// We need to use AnalyzeEventsAndWriteResults here to make sure the second inference has been profiled
profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
dump = ss.str();
// GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
// new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
// for imports/copies. Only that the output is correct.
if (backends[0] != Compute::GpuAcc)
{
// The SyncMemGeneric will still be in the profiling log from the first inference
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count >= 1);
// We should now see CopyMemGeneric workloads as we copied all buffers
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count >= 1);
}
// Check the output is correct
unsigned int index = 0;
std::vector<float> alignedOutputData(expectedMisalignedOutput.size(), 0);
std::memcpy(alignedOutputData.data(), misalignedOutputPtr, expectedMisalignedOutput.size() * sizeof(float));
for (auto outputValue : expectedMisalignedOutput)
{
CHECK(outputValue == alignedOutputData[index]);
++index;
}
// Clean up to avoid interfering with other tests
runtime->UnloadNetwork(netId);
std::free(inputMemPtr);
std::free(outputMemPtr);
}
inline void ForceImportRepeatedInferencesInvertedEndToEndTest(std::vector<BackendId> backends)
{
/**
* This test is similar to the Import tests above: we create a network with a Square activation, pass in a vector
* of 4 floats, square them and validate the output. We then check the profiling logs to see whether the
* input/output tensors are copied (CopyMemGeneric) or imported (SyncMemGeneric).
* In this test we create some misaligned buffers, copy them into a network, and validate the output and the number
* of SyncMemGeneric/CopyMemGeneric workloads. Then we try the same network again with aligned buffers to make sure
* it switches to importing correctly.
*/
using namespace armnn;
IRuntime::CreationOptions options;
IRuntimePtr runtime(IRuntime::Create(options));
// Builds up the structure of the network.
INetworkPtr net(INetwork::Create());
IConnectableLayer* input = net->AddInputLayer(0);
ActivationDescriptor descriptor;
descriptor.m_Function = ActivationFunction::Square;
IConnectableLayer* activationLayer = net->AddActivationLayer(descriptor);
IConnectableLayer* output = net->AddOutputLayer(0);
input->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0));
activationLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32, 0.0f, 0, true));
activationLayer->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 1, 4 }, DataType::Float32));
IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
INFO("Load Network");
// Load it into the runtime. It should pass.
NetworkId netId;
std::string ignoredErrorMessage;
INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
CHECK(runtime->LoadNetwork(netId, std::move(optNet),ignoredErrorMessage, networkProperties)
== Status::Success);
INFO("Generate Data");
// This code looks a little funky, but the idea is to create a buffer of floats offset by the size of a char;
// this guarantees that the resultant buffer is misaligned and thus should always be copied.
auto inputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedInputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(inputMemPtr) + 1);
// Check if our pointer is truly misaligned
uintptr_t alignment = GetDataTypeSize(DataType::Float32);
CHECK (reinterpret_cast<uintptr_t>(misalignedInputPtr) % alignment);
std::vector<float> inputValues
{
2.0f, 3.0f, 4.0f, 5.0f
};
std::memcpy(misalignedInputPtr, inputValues.data(), inputValues.size() * sizeof(float));
auto outputMemPtr = std::malloc(4 * sizeof(float) + sizeof(char));
float* misalignedOutputPtr = reinterpret_cast<float*>(reinterpret_cast<char*>(outputMemPtr) + 1);
// Check if our pointer is truly misaligned
CHECK (reinterpret_cast<uintptr_t>(misalignedOutputPtr) % alignment);
std::vector<float> expectedMisalignedOutput
{
4.0f, 9.0f, 16.0f, 25.0f
};
INFO("Create Second Inference");
InputTensors inputTensorsMisaligned
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), misalignedInputPtr)},
};
OutputTensors outputTensorsMisaligned
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), misalignedOutputPtr)}
};
runtime->GetProfiler(netId)->EnableProfiling(true);
std::vector<ImportedInputId> importedInputIds =
runtime->ImportInputs(netId, inputTensorsMisaligned, MemorySource::Malloc);
std::vector<ImportedOutputId> importedOutputIds =
runtime->ImportOutputs(netId, outputTensorsMisaligned, MemorySource::Malloc);
// Do the inference and force the import as the memory is misaligned.
runtime->EnqueueWorkload(netId,
inputTensorsMisaligned,
outputTensorsMisaligned,
importedInputIds,
importedOutputIds);
// Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
std::stringstream ss;
profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
std::string dump = ss.str();
// GpuAcc is a different case to CpuRef and CpuAcc, it doesn't use the buffer directly but instead maps it to a
// new set of addresses within Gpu Memory. This will almost always be auto-aligned, so we don't need to check
// for imports/copies. Only that the output is correct.
if (backends[0] != Compute::GpuAcc)
{
// We can only copy so there should be no SyncMemGeneric
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should only be CopyMemGeneric workloads as we copied all buffers
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count >= 1);
}
// Check the output is correct
unsigned int index = 0;
std::vector<float> alignedOutput(expectedMisalignedOutput.size());
std::memcpy(alignedOutput.data(), misalignedOutputPtr, expectedMisalignedOutput.size()*sizeof(float));
for (auto outputValue : expectedMisalignedOutput)
{
CHECK(outputValue == alignedOutput[index]);
++index;
}
std::free(inputMemPtr);
std::free(outputMemPtr);
// Creates structures for input & output
std::vector<float> inputData
{
1.0f, 2.0f, 3.0f, 4.0f
};
std::vector<float> outputData(4);
std::vector<float> expectedOutput
{
1.0f, 4.0f, 9.0f, 16.0f
};
// Check our input and output pointers are actually aligned
CHECK(!(reinterpret_cast<uintptr_t>(inputData.data()) % alignment));
CHECK(!(reinterpret_cast<uintptr_t>(outputData.data()) % alignment));
INFO("Create Inference");
InputTensors inputTensors
{
{0,armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())},
};
OutputTensors outputTensors
{
{0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())}
};
importedInputIds = runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
importedOutputIds = runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
// Do the inference and force the import as the memory is aligned.
runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
// Retrieve the Profiler.AnalyzeEventsAndWriteResults() output to get the workload execution
// We need to use AnalyzeEventsAndWriteResults here to make sure the second inference has been profiled
profilerManager.GetProfiler()->AnalyzeEventsAndWriteResults(ss);
dump = ss.str();
if (backends[0] == Compute::CpuAcc)
{
// Reconfigure has not been implemented for CpuAcc so it will always copy; this will break whenever
// reconfigure is implemented
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count == 0);
// Should be at least 1 CopyMemGeneric workload
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count >= 1);
}
else
{
// Repeated inferences make it difficult to check for an accurate count. So we just validate that we have a
// SyncMemGeneric Workload when we previously didn't
int count = SubStringCounter(dump, "SyncMemGeneric");
CHECK(count >= 1);
// Should still be some CopyMemGeneric Workloads from the last inference
count = SubStringCounter(dump, "CopyMemGeneric");
CHECK(count >= 1);
}
// Check the output is correct
CHECK(std::equal(outputData.begin(), outputData.end(), expectedOutput.begin(), expectedOutput.end()));
// Clean up to avoid interfering with other tests
runtime->UnloadNetwork(netId);
}
} // anonymous namespace