IVGCVSW-4770 Fix segmentation fault in FileOnlyProfilingDecoratorTests

Signed-off-by: Jan Eilers <jan.eilers@arm.com>
Change-Id: I5725aa418b50fc14ce4e6638fe29a74d762cc304
diff --git a/src/profiling/test/FileOnlyProfilingDecoratorTests.cpp b/src/profiling/test/FileOnlyProfilingDecoratorTests.cpp
index 79da17a..69ebe33 100644
--- a/src/profiling/test/FileOnlyProfilingDecoratorTests.cpp
+++ b/src/profiling/test/FileOnlyProfilingDecoratorTests.cpp
@@ -38,212 +38,206 @@
 
 BOOST_AUTO_TEST_CASE(TestFileOnlyProfiling)
 {
-    // This test requires at least one backend registry to be enabled
-    // which can execute a NormalizationLayer
-    if (!HasSuitableBackendRegistered())
+    // Get all registered backends
+    std::vector<BackendId> suitableBackends = GetSuitableBackendRegistered();
+
+    // Run test for each backend separately
+    for (auto const& backend : suitableBackends)
     {
-        return;
+        // Enable m_FileOnly but also provide ILocalPacketHandler which should consume the packets.
+        // This won't dump anything to file.
+        armnn::Runtime::CreationOptions creationOptions;
+        creationOptions.m_ProfilingOptions.m_EnableProfiling     = true;
+        creationOptions.m_ProfilingOptions.m_FileOnly            = true;
+        creationOptions.m_ProfilingOptions.m_CapturePeriod       = 100;
+        creationOptions.m_ProfilingOptions.m_TimelineEnabled     = true;
+        ILocalPacketHandlerSharedPtr localPacketHandlerPtr = std::make_shared<TestTimelinePacketHandler>();
+        creationOptions.m_ProfilingOptions.m_LocalPacketHandlers.push_back(localPacketHandlerPtr);
+
+        armnn::Runtime runtime(creationOptions);
+        // ensure the GUID generator is reset to zero
+        GetProfilingService(&runtime).ResetGuidGenerator();
+
+        // Load a simple network
+        // build up the structure of the network
+        INetworkPtr net(INetwork::Create());
+
+        IConnectableLayer* input = net->AddInputLayer(0, "input");
+
+        ElementwiseUnaryDescriptor descriptor(UnaryOperation::Rsqrt);
+        IConnectableLayer* Rsqrt = net->AddElementwiseUnaryLayer(descriptor, "Rsqrt");
+
+        IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+        input->GetOutputSlot(0).Connect(Rsqrt->GetInputSlot(0));
+        Rsqrt->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+        input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32));
+        Rsqrt->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32));
+
+        std::vector<armnn::BackendId> backendsVec {backend};
+        IOptimizedNetworkPtr optNet = Optimize(*net, backendsVec, runtime.GetDeviceSpec());
+
+        // Load it into the runtime. It should succeed.
+        armnn::NetworkId netId;
+        BOOST_TEST(runtime.LoadNetwork(netId, std::move(optNet)) == Status::Success);
+
+        // Creates structures for input & output.
+        std::vector<float> inputData(16);
+        std::vector<float> outputData(16);
+        for (unsigned int i = 0; i < 16; ++i) {
+            inputData[i] = 9.0;
+            outputData[i] = 3.0;
+        }
+
+        InputTensors inputTensors
+        {
+            {0, ConstTensor(runtime.GetInputTensorInfo(netId, 0), inputData.data())}
+        };
+        OutputTensors outputTensors
+        {
+            {0, Tensor(runtime.GetOutputTensorInfo(netId, 0), outputData.data())}
+        };
+
+        // Does the inference.
+        runtime.EnqueueWorkload(netId, inputTensors, outputTensors);
+
+        static_cast<TestTimelinePacketHandler *>(localPacketHandlerPtr.get())->WaitOnInferenceCompletion(3000);
+
+        const TimelineModel &model =
+                static_cast<TestTimelinePacketHandler *>(localPacketHandlerPtr.get())->GetTimelineModel();
+
+        for (auto &error : model.GetErrors()) {
+            std::cout << error.what() << std::endl;
+        }
+        BOOST_TEST(model.GetErrors().empty());
+        std::vector<std::string> desc = GetModelDescription(model);
+        std::vector<std::string> expectedOutput;
+        expectedOutput.push_back("Entity [0] name = input type = layer");
+        expectedOutput.push_back("   connection [14] from entity [0] to entity [1]");
+        expectedOutput.push_back("   child: Entity [23] backendId = " + backend.Get() + " type = workload");
+        expectedOutput.push_back("Entity [1] name = Rsqrt type = layer");
+        expectedOutput.push_back("   connection [22] from entity [1] to entity [2]");
+        expectedOutput.push_back("   child: Entity [15] backendId = " + backend.Get() + " type = workload");
+        expectedOutput.push_back("Entity [2] name = output type = layer");
+        expectedOutput.push_back("   child: Entity [27] backendId = " + backend.Get() + " type = workload");
+        expectedOutput.push_back("Entity [6] type = network");
+        expectedOutput.push_back("   child: Entity [0] name = input type = layer");
+        expectedOutput.push_back("   child: Entity [1] name = Rsqrt type = layer");
+        expectedOutput.push_back("   child: Entity [2] name = output type = layer");
+        expectedOutput.push_back("   execution: Entity [31] type = inference");
+        expectedOutput.push_back("Entity [15] backendId = " + backend.Get() + " type = workload");
+        expectedOutput.push_back("   execution: Entity [44] type = workload_execution");
+        expectedOutput.push_back("Entity [23] backendId = " + backend.Get() + " type = workload");
+        expectedOutput.push_back("   execution: Entity [36] type = workload_execution");
+        expectedOutput.push_back("Entity [27] backendId = " + backend.Get() + " type = workload");
+        expectedOutput.push_back("   execution: Entity [52] type = workload_execution");
+        expectedOutput.push_back("Entity [31] type = inference");
+        expectedOutput.push_back("   child: Entity [36] type = workload_execution");
+        expectedOutput.push_back("   child: Entity [44] type = workload_execution");
+        expectedOutput.push_back("   child: Entity [52] type = workload_execution");
+        expectedOutput.push_back("   event: [34] class [start_of_life]");
+        expectedOutput.push_back("   event: [60] class [end_of_life]");
+        expectedOutput.push_back("Entity [36] type = workload_execution");
+        expectedOutput.push_back("   event: [40] class [start_of_life]");
+        expectedOutput.push_back("   event: [42] class [end_of_life]");
+        expectedOutput.push_back("Entity [44] type = workload_execution");
+        expectedOutput.push_back("   event: [48] class [start_of_life]");
+        expectedOutput.push_back("   event: [50] class [end_of_life]");
+        expectedOutput.push_back("Entity [52] type = workload_execution");
+        expectedOutput.push_back("   event: [56] class [start_of_life]");
+        expectedOutput.push_back("   event: [58] class [end_of_life]");
+        BOOST_TEST(CompareOutput(desc, expectedOutput));
     }
-
-    // Enable m_FileOnly but also provide ILocalPacketHandler which should consume the packets.
-    // This won't dump anything to file.
-    armnn::Runtime::CreationOptions creationOptions;
-    creationOptions.m_ProfilingOptions.m_EnableProfiling     = true;
-    creationOptions.m_ProfilingOptions.m_FileOnly            = true;
-    creationOptions.m_ProfilingOptions.m_CapturePeriod       = 100;
-    creationOptions.m_ProfilingOptions.m_TimelineEnabled     = true;
-    ILocalPacketHandlerSharedPtr localPacketHandlerPtr = std::make_shared<TestTimelinePacketHandler>();
-    creationOptions.m_ProfilingOptions.m_LocalPacketHandlers.push_back(localPacketHandlerPtr);
-
-    armnn::Runtime runtime(creationOptions);
-    // ensure the GUID generator is reset to zero
-    GetProfilingService(&runtime).ResetGuidGenerator();
-
-    // Load a simple network
-    // build up the structure of the network
-    INetworkPtr net(INetwork::Create());
-
-    IConnectableLayer* input = net->AddInputLayer(0, "input");
-
-    ElementwiseUnaryDescriptor descriptor(UnaryOperation::Sqrt);
-    IConnectableLayer* normalize = net->AddElementwiseUnaryLayer(descriptor, "normalization");
-
-    IConnectableLayer* output = net->AddOutputLayer(0, "output");
-
-    input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0));
-    normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0));
-
-    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32));
-    normalize->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32));
-
-    // optimize the network
-    std::vector<armnn::BackendId> backends =
-            { armnn::Compute::CpuRef, armnn::Compute::CpuAcc, armnn::Compute::GpuAcc };
-    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime.GetDeviceSpec());
-
-    // Load it into the runtime. It should succeed.
-    armnn::NetworkId netId;
-    BOOST_TEST(runtime.LoadNetwork(netId, std::move(optNet)) == Status::Success);
-
-    // Creates structures for input & output.
-    std::vector<float> inputData(16);
-    std::vector<float> outputData(16);
-    for (unsigned int i = 0; i < 16; ++i)
-    {
-        inputData[i] = 9.0;
-        outputData[i] = 3.0;
-    }
-
-    InputTensors  inputTensors
-    {
-        {0, ConstTensor(runtime.GetInputTensorInfo(netId, 0), inputData.data())}
-    };
-    OutputTensors outputTensors
-    {
-        {0, Tensor(runtime.GetOutputTensorInfo(netId, 0), outputData.data())}
-    };
-
-    // Does the inference.
-    runtime.EnqueueWorkload(netId, inputTensors, outputTensors);
-
-    static_cast<TestTimelinePacketHandler*>(localPacketHandlerPtr.get())->WaitOnInferenceCompletion(3000);
-
-    const TimelineModel& model =
-        static_cast<TestTimelinePacketHandler*>(localPacketHandlerPtr.get())->GetTimelineModel();
-
-    for (auto& error : model.GetErrors())
-    {
-        std::cout << error.what() << std::endl;
-    }
-    BOOST_TEST(model.GetErrors().empty());
-    std::vector<std::string> desc = GetModelDescription(model);
-    std::vector<std::string> expectedOutput;
-    expectedOutput.push_back("Entity [0] name = input type = layer");
-    expectedOutput.push_back("   connection [14] from entity [0] to entity [1]");
-    expectedOutput.push_back("   child: Entity [23] backendId = CpuRef type = workload");
-    expectedOutput.push_back("Entity [1] name = normalization type = layer");
-    expectedOutput.push_back("   connection [22] from entity [1] to entity [2]");
-    expectedOutput.push_back("   child: Entity [15] backendId = CpuRef type = workload");
-    expectedOutput.push_back("Entity [2] name = output type = layer");
-    expectedOutput.push_back("   child: Entity [27] backendId = CpuRef type = workload");
-    expectedOutput.push_back("Entity [6] type = network");
-    expectedOutput.push_back("   child: Entity [0] name = input type = layer");
-    expectedOutput.push_back("   child: Entity [1] name = normalization type = layer");
-    expectedOutput.push_back("   child: Entity [2] name = output type = layer");
-    expectedOutput.push_back("   execution: Entity [31] type = inference");
-    expectedOutput.push_back("Entity [15] backendId = CpuRef type = workload");
-    expectedOutput.push_back("   execution: Entity [44] type = workload_execution");
-    expectedOutput.push_back("Entity [23] backendId = CpuRef type = workload");
-    expectedOutput.push_back("   execution: Entity [36] type = workload_execution");
-    expectedOutput.push_back("Entity [27] backendId = CpuRef type = workload");
-    expectedOutput.push_back("   execution: Entity [52] type = workload_execution");
-    expectedOutput.push_back("Entity [31] type = inference");
-    expectedOutput.push_back("   child: Entity [36] type = workload_execution");
-    expectedOutput.push_back("   child: Entity [44] type = workload_execution");
-    expectedOutput.push_back("   child: Entity [52] type = workload_execution");
-    expectedOutput.push_back("   event: [34] class [start_of_life]");
-    expectedOutput.push_back("   event: [60] class [end_of_life]");
-    expectedOutput.push_back("Entity [36] type = workload_execution");
-    expectedOutput.push_back("   event: [40] class [start_of_life]");
-    expectedOutput.push_back("   event: [42] class [end_of_life]");
-    expectedOutput.push_back("Entity [44] type = workload_execution");
-    expectedOutput.push_back("   event: [48] class [start_of_life]");
-    expectedOutput.push_back("   event: [50] class [end_of_life]");
-    expectedOutput.push_back("Entity [52] type = workload_execution");
-    expectedOutput.push_back("   event: [56] class [start_of_life]");
-    expectedOutput.push_back("   event: [58] class [end_of_life]");
-    BOOST_TEST(CompareOutput(desc, expectedOutput));
 }
 
 BOOST_AUTO_TEST_CASE(DumpOutgoingValidFileEndToEnd)
 {
-    // This test requires at least one backend registry to be enabled
-    // which can execute a NormalizationLayer
-    if (!HasSuitableBackendRegistered())
+    // Get all registered backends
+    std::vector<BackendId> suitableBackends = GetSuitableBackendRegistered();
+
+    // Run test for each backend separately
+    for (auto const& backend : suitableBackends)
     {
-        return;
+        // Create a temporary file name.
+        fs::path tempPath = armnnUtils::Filesystem::NamedTempFile("DumpOutgoingValidFileEndToEnd_CaptureFile.txt");
+        // Make sure the file does not exist at this point
+        BOOST_CHECK(!fs::exists(tempPath));
+
+        armnn::Runtime::CreationOptions options;
+        options.m_ProfilingOptions.m_EnableProfiling     = true;
+        options.m_ProfilingOptions.m_FileOnly            = true;
+        options.m_ProfilingOptions.m_IncomingCaptureFile = "";
+        options.m_ProfilingOptions.m_OutgoingCaptureFile = tempPath.string();
+        options.m_ProfilingOptions.m_CapturePeriod       = 100;
+        options.m_ProfilingOptions.m_TimelineEnabled     = true;
+
+        ILocalPacketHandlerSharedPtr localPacketHandlerPtr = std::make_shared<TestTimelinePacketHandler>();
+        options.m_ProfilingOptions.m_LocalPacketHandlers.push_back(localPacketHandlerPtr);
+
+        armnn::Runtime runtime(options);
+        // ensure the GUID generator is reset to zero
+        GetProfilingService(&runtime).ResetGuidGenerator();
+
+        // Load a simple network
+        // build up the structure of the network
+        INetworkPtr net(INetwork::Create());
+
+        IConnectableLayer* input = net->AddInputLayer(0, "input");
+
+        ElementwiseUnaryDescriptor descriptor(UnaryOperation::Rsqrt);
+        IConnectableLayer* Rsqrt = net->AddElementwiseUnaryLayer(descriptor, "Rsqrt");
+
+        IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+        input->GetOutputSlot(0).Connect(Rsqrt->GetInputSlot(0));
+        Rsqrt->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+        input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32));
+        Rsqrt->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32));
+
+
+        std::vector<BackendId> backendsVec{backend};
+        IOptimizedNetworkPtr optNet = Optimize(*net, backendsVec, runtime.GetDeviceSpec());
+
+        // Load it into the runtime. It should succeed.
+        armnn::NetworkId netId;
+        BOOST_TEST(runtime.LoadNetwork(netId, std::move(optNet)) == Status::Success);
+
+        // Creates structures for input & output.
+        std::vector<float> inputData(16);
+        std::vector<float> outputData(16);
+        for (unsigned int i = 0; i < 16; ++i) {
+            inputData[i] = 9.0;
+            outputData[i] = 3.0;
+        }
+
+        InputTensors inputTensors
+        {
+            {0, ConstTensor(runtime.GetInputTensorInfo(netId, 0), inputData.data())}
+        };
+        OutputTensors outputTensors
+        {
+            {0, Tensor(runtime.GetOutputTensorInfo(netId, 0), outputData.data())}
+        };
+
+        // Does the inference.
+        runtime.EnqueueWorkload(netId, inputTensors, outputTensors);
+
+        static_cast<TestTimelinePacketHandler *>(localPacketHandlerPtr.get())->WaitOnInferenceCompletion(3000);
+
+        // In order to flush the files we need to gracefully close the profiling service.
+        options.m_ProfilingOptions.m_EnableProfiling = false;
+        GetProfilingService(&runtime).ResetExternalProfilingOptions(options.m_ProfilingOptions, true);
+
+        // The output file size should be greater than 0.
+        BOOST_CHECK(fs::file_size(tempPath) > 0);
+
+        // NOTE: would be an interesting exercise to take this file and decode it
+
+        // Delete the tmp file.
+        BOOST_CHECK(fs::remove(tempPath));
     }
-
-    // Create a temporary file name.
-    fs::path tempPath = armnnUtils::Filesystem::NamedTempFile("DumpOutgoingValidFileEndToEnd_CaptureFile.txt");
-    armnn::Runtime::CreationOptions options;
-    options.m_ProfilingOptions.m_EnableProfiling     = true;
-    options.m_ProfilingOptions.m_FileOnly            = true;
-    options.m_ProfilingOptions.m_IncomingCaptureFile = "";
-    options.m_ProfilingOptions.m_OutgoingCaptureFile = tempPath.string();
-    options.m_ProfilingOptions.m_CapturePeriod       = 100;
-    options.m_ProfilingOptions.m_TimelineEnabled     = true;
-
-    ILocalPacketHandlerSharedPtr localPacketHandlerPtr = std::make_shared<TestTimelinePacketHandler>();
-    options.m_ProfilingOptions.m_LocalPacketHandlers.push_back(localPacketHandlerPtr);
-
-    // Make sure the file does not exist at this point
-    BOOST_CHECK(!fs::exists(tempPath));
-
-    armnn::Runtime runtime(options);
-    // ensure the GUID generator is reset to zero
-    GetProfilingService(&runtime).ResetGuidGenerator();
-
-    // Load a simple network
-    // build up the structure of the network
-    INetworkPtr net(INetwork::Create());
-
-    IConnectableLayer* input = net->AddInputLayer(0, "input");
-
-    ElementwiseUnaryDescriptor descriptor(UnaryOperation::Sqrt);
-    IConnectableLayer* normalize = net->AddElementwiseUnaryLayer(descriptor, "normalization");
-
-    IConnectableLayer* output = net->AddOutputLayer(0, "output");
-
-    input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0));
-    normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0));
-
-    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32));
-    normalize->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32));
-
-    // optimize the network
-    std::vector<armnn::BackendId> backends =
-            { armnn::Compute::CpuRef, armnn::Compute::CpuAcc, armnn::Compute::GpuAcc };
-    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime.GetDeviceSpec());
-
-    // Load it into the runtime. It should succeed.
-    armnn::NetworkId netId;
-    BOOST_TEST(runtime.LoadNetwork(netId, std::move(optNet)) == Status::Success);
-
-    // Creates structures for input & output.
-    std::vector<float> inputData(16);
-    std::vector<float> outputData(16);
-    for (unsigned int i = 0; i < 16; ++i)
-    {
-        inputData[i] = 9.0;
-        outputData[i] = 3.0;
-    }
-
-    InputTensors  inputTensors
-    {
-        {0, ConstTensor(runtime.GetInputTensorInfo(netId, 0), inputData.data())}
-    };
-    OutputTensors outputTensors
-    {
-        {0, Tensor(runtime.GetOutputTensorInfo(netId, 0), outputData.data())}
-    };
-
-    // Does the inference.
-    runtime.EnqueueWorkload(netId, inputTensors, outputTensors);
-
-    static_cast<TestTimelinePacketHandler*>(localPacketHandlerPtr.get())->WaitOnInferenceCompletion(3000);
-
-    // In order to flush the files we need to gracefully close the profiling service.
-    options.m_ProfilingOptions.m_EnableProfiling = false;
-    GetProfilingService(&runtime).ResetExternalProfilingOptions(options.m_ProfilingOptions, true);
-
-    // The output file size should be greater than 0.
-    BOOST_CHECK(fs::file_size(tempPath) > 0);
-
-    // NOTE: would be an interesting exercise to take this file and decode it
-
-    // Delete the tmp file.
-    BOOST_CHECK(fs::remove(tempPath));
 }
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/profiling/test/ProfilingTestUtils.cpp b/src/profiling/test/ProfilingTestUtils.cpp
index 0f43728..9f6bc43 100644
--- a/src/profiling/test/ProfilingTestUtils.cpp
+++ b/src/profiling/test/ProfilingTestUtils.cpp
@@ -34,6 +34,24 @@
     return headerSize + bodySize + payloadSize;
 }
 
+std::vector<BackendId> GetSuitableBackendRegistered()
+{
+    std::vector<BackendId> suitableBackends;
+    if (BackendRegistryInstance().IsBackendRegistered(GetComputeDeviceAsCString(armnn::Compute::CpuRef)))
+    {
+        suitableBackends.push_back(armnn::Compute::CpuRef);
+    }
+    if (BackendRegistryInstance().IsBackendRegistered(GetComputeDeviceAsCString(armnn::Compute::CpuAcc)))
+    {
+        suitableBackends.push_back(armnn::Compute::CpuAcc);
+    }
+    if (BackendRegistryInstance().IsBackendRegistered(GetComputeDeviceAsCString(armnn::Compute::GpuAcc)))
+    {
+        suitableBackends.push_back(armnn::Compute::GpuAcc);
+    }
+    return suitableBackends;
+}
+
 inline unsigned int OffsetToNextWord(unsigned int numberOfBytes)
 {
     unsigned int uint32_t_size = sizeof(uint32_t);
@@ -1199,17 +1217,6 @@
     bufferManager.MarkRead(inferenceReadableBuffer);
 }
 
-bool HasSuitableBackendRegistered()
-{
-    // Only run the file only profiling unit tests on CpuRef until failure on build system can be debugged
-    if (BackendRegistryInstance().GetBackendIds().size() == 1 &&
-        BackendRegistryInstance().IsBackendRegistered(GetComputeDeviceAsCString(armnn::Compute::CpuRef)))
-    {
-        return true;
-    }
-    return false;
-}
-
 bool CompareOutput(std::vector<std::string> output, std::vector<std::string> expectedOutput)
 {
     if (output.size() != expectedOutput.size())
diff --git a/src/profiling/test/ProfilingTestUtils.hpp b/src/profiling/test/ProfilingTestUtils.hpp
index a9a6921..8f138bb 100644
--- a/src/profiling/test/ProfilingTestUtils.hpp
+++ b/src/profiling/test/ProfilingTestUtils.hpp
@@ -21,6 +21,9 @@
 
 uint32_t GetStreamMetaDataPacketSize();
 
+/// Returns a vector of CpuRef, CpuAcc or GpuAcc backends if they were registered
+std::vector<BackendId> GetSuitableBackendRegistered();
+
 inline unsigned int OffsetToNextWord(unsigned int numberOfBytes);
 
 void VerifyTimelineHeaderBinary(const unsigned char* readableData,
@@ -57,10 +60,6 @@
 
 void VerifyPostOptimisationStructureTestImpl(armnn::BackendId backendId);
 
-// returns true if a CpuRef, CpuAcc or GpuAcc
-// backend is registered
-bool HasSuitableBackendRegistered();
-
 bool CompareOutput(std::vector<std::string> output, std::vector<std::string> expectedOutput);
 
 namespace armnn