IVGCVSW-5012 Enable zero copy for Neon

 * Allow memory import if padding is not required in Neon
 * Add MockImportBackend for fallback tests
 * Refactor GraphUtils
 * Memory import unit tests
 * Fallback unit tests

Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: Ic2e141e12774bf6d915e77745b6f6d2d83d9b82d
diff --git a/src/backends/neon/test/CMakeLists.txt b/src/backends/neon/test/CMakeLists.txt
index 16c066b..dd13b63 100644
--- a/src/backends/neon/test/CMakeLists.txt
+++ b/src/backends/neon/test/CMakeLists.txt
@@ -6,6 +6,7 @@
 list(APPEND armnnNeonBackendUnitTests_sources
     NeonCreateWorkloadTests.cpp
     NeonEndToEndTests.cpp
+    NeonFallbackTests.cpp
     NeonJsonPrinterTests.cpp
     NeonLayerSupportTests.cpp
     NeonLayerTests.cpp
diff --git a/src/backends/neon/test/NeonEndToEndTests.cpp b/src/backends/neon/test/NeonEndToEndTests.cpp
index ffbae51..dc0a609 100644
--- a/src/backends/neon/test/NeonEndToEndTests.cpp
+++ b/src/backends/neon/test/NeonEndToEndTests.cpp
@@ -410,27 +410,27 @@
     ExportNonAlignedOutputPointerTest(defaultBackends);
 }
 
-BOOST_AUTO_TEST_CASE(NeonImportAlignedPointerTest, * boost::unit_test::disabled())
+BOOST_AUTO_TEST_CASE(NeonImportAlignedPointerTest)
 {
     ImportAlignedPointerTest(defaultBackends);
 }
 
-BOOST_AUTO_TEST_CASE(NeonImportOnlyWorkload, * boost::unit_test::disabled())
+BOOST_AUTO_TEST_CASE(NeonImportOnlyWorkload)
 {
     ImportOnlyWorkload(defaultBackends);
 }
 
-BOOST_AUTO_TEST_CASE(NeonExportOnlyWorkload, * boost::unit_test::disabled())
+BOOST_AUTO_TEST_CASE(NeonExportOnlyWorkload)
 {
     ExportOnlyWorkload(defaultBackends);
 }
 
-BOOST_AUTO_TEST_CASE(NeonImportAndExportWorkload, * boost::unit_test::disabled())
+BOOST_AUTO_TEST_CASE(NeonImportAndExportWorkload)
 {
     ImportAndExportWorkload(defaultBackends);
 }
 
-BOOST_AUTO_TEST_CASE(NeonExportOutputWithSeveralOutputSlotConnectionsTest, * boost::unit_test::disabled())
+BOOST_AUTO_TEST_CASE(NeonExportOutputWithSeveralOutputSlotConnectionsTest)
 {
     ExportOutputWithSeveralOutputSlotConnectionsTest(defaultBackends);
 }
diff --git a/src/backends/neon/test/NeonFallbackTests.cpp b/src/backends/neon/test/NeonFallbackTests.cpp
new file mode 100644
index 0000000..cf4d91b
--- /dev/null
+++ b/src/backends/neon/test/NeonFallbackTests.cpp
@@ -0,0 +1,547 @@
+//
+// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <backendsCommon/test/CommonTestUtils.hpp>
+#include <backendsCommon/test/mockBackend/MockImportBackend.hpp>
+
+#include <test/GraphUtils.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(NeonFallback)
+
+std::vector<armnn::BackendId> defaultBackends = { armnn::Compute::CpuAcc }; // NOTE(review): unused in this file
+
+BOOST_AUTO_TEST_CASE(FallbackImportToCpuAcc)
+{
+    using namespace armnn;
+
+    // Register the mock import backend and check that an instance of it can be created
+    MockImportBackendInitialiser initialiser; // Register the Mock Backend
+    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
+    BOOST_TEST((backendObjPtr != nullptr));
+
+    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
+    if (backendIds.find("MockRef") == backendIds.end())
+    {
+        std::string message = "Cannot load MockRef";
+        BOOST_FAIL(message);
+    }
+
+    // Create the runtime in which the test will run; MockRef is preferred, with CpuAcc as the fallback.
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network: output = input2 - (input0 + input1).
+    INetworkPtr net(INetwork::Create());
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    input2->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+    add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+    sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    input2->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+    sub->GetOutputSlot(0).SetTensorInfo(info);
+
+    // Optimize the network, preferring MockRef over CpuAcc
+    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ add (0) -> sub (1) ]");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "sub");
+    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+    BOOST_TEST(CheckOrder(graph, layer5, layer6));
+
+    // Load it into the runtime and verify that loading succeeded.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    INetworkProperties networkProperties(true, true);
+    const Status loadStatus = runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+    BOOST_TEST((loadStatus == Status::Success));
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+    std::vector<float> inputData2
+    {
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+    };
+
+    std::vector<float> outputData(12);
+
+    std::vector<float> expectedOutput
+    {
+        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
+    };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Contains ImportMemGeneric
+    std::size_t found = dump.find("ImportMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contains SyncMemGeneric
+    found = dump.find("SyncMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Does not contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found == std::string::npos);
+
+    // The layer inserted between add and sub must be a memory import, not a copy
+    BOOST_TEST((layer4->GetType() == LayerType::MemImport));
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(FallbackPaddingCopyToCpuAcc)
+{
+    using namespace armnn;
+
+    // Register the mock import backend and check that an instance of it can be created
+    MockImportBackendInitialiser initialiser; // Register the Mock Backend
+    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
+    BOOST_TEST((backendObjPtr != nullptr));
+
+    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
+    if (backendIds.find("MockRef") == backendIds.end())
+    {
+        std::string message = "Cannot load MockRef";
+        BOOST_FAIL(message);
+    }
+
+    // Create the runtime in which the test will run; MockRef is preferred, with CpuAcc as the fallback.
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network: output = pooling(input0 + input1).
+    INetworkPtr net(INetwork::Create());
+
+    Pooling2dDescriptor desc;
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    add->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
+    pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+
+    // Optimize the network, preferring MockRef over CpuAcc
+    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "[ add (0) -> pooling (0) ]");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "pooling");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+
+    // Load it into the runtime and verify that loading succeeded.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    INetworkProperties networkProperties(true, true);
+    const Status loadStatus = runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+    BOOST_TEST((loadStatus == Status::Success));
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+
+    std::vector<float> outputData(2);
+
+    std::vector<float> expectedOutput
+    {
+        6.0f, 12.0f
+    };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Contains CopyMemGeneric between the backends
+    std::size_t found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contains SyncMemGeneric for the output
+    found = dump.find("SyncMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Does not contain ImportMemGeneric
+    found = dump.find("ImportMemGeneric");
+    BOOST_TEST(found == std::string::npos);
+
+    // A MemCopy layer (not MemImport) is expected between add and pooling
+    BOOST_TEST((layer3->GetType() == LayerType::MemCopy));
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(FallbackImportFromCpuAcc)
+{
+    using namespace armnn;
+
+    // Register the mock import backend and check that an instance of it can be created
+    MockImportBackendInitialiser initialiser; // Register the Mock Backend
+    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
+    BOOST_TEST((backendObjPtr != nullptr));
+
+    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
+    if (backendIds.find("MockRef") == backendIds.end())
+    {
+        std::string message = "Cannot load MockRef";
+        BOOST_FAIL(message);
+    }
+
+    // Create the runtime in which the test will run; MockRef is preferred, with CpuAcc as the fallback.
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network: output = input2 + (input0 - input1).
+    INetworkPtr net(INetwork::Create());
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* input2 = net->AddInputLayer(2, "input2");
+    IConnectableLayer* sub = net->AddSubtractionLayer("sub");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(sub->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
+    input2->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    sub->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    add->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(info);
+    input1->GetOutputSlot(0).SetTensorInfo(info);
+    input2->GetOutputSlot(0).SetTensorInfo(info);
+    sub->GetOutputSlot(0).SetTensorInfo(info);
+    add->GetOutputSlot(0).SetTensorInfo(info);
+
+    // Optimize the network, preferring MockRef over CpuAcc
+    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "input2");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "sub");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "[ sub (0) -> add (1) ]");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer6 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+    BOOST_TEST(CheckOrder(graph, layer5, layer6));
+
+    // Load it into the runtime and verify that loading succeeded.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    INetworkProperties networkProperties(true, true);
+    const Status loadStatus = runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+    BOOST_TEST((loadStatus == Status::Success));
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f
+    };
+    std::vector<float> inputData1
+    {
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+    };
+    std::vector<float> inputData2
+    {
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+    };
+
+    std::vector<float> outputData(12);
+
+    std::vector<float> expectedOutput
+    {
+        13.0f, 11.0f, 11.0f, 9.0f, 7.0f, 7.0f, 7.0f, 5.0f, 5.0f, 3.0f, 3.0f, -5.0f
+    };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Contains ImportMemGeneric
+    std::size_t found = dump.find("ImportMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contains SyncMemGeneric
+    found = dump.find("SyncMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Does not contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found == std::string::npos);
+
+    // The layer inserted between sub and add must be a memory import, not a copy
+    BOOST_TEST((layer4->GetType() == LayerType::MemImport));
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_CASE(FallbackPaddingCopyFromCpuAcc)
+{
+    using namespace armnn;
+
+    // Register the mock import backend and check that an instance of it can be created
+    MockImportBackendInitialiser initialiser; // Register the Mock Backend
+    auto backendObjPtr = CreateBackendObject(MockImportBackendId());
+    BOOST_TEST((backendObjPtr != nullptr));
+
+    BackendIdSet backendIds = BackendRegistryInstance().GetBackendIds();
+    if (backendIds.find("MockRef") == backendIds.end())
+    {
+        std::string message = "Cannot load MockRef";
+        BOOST_FAIL(message);
+    }
+
+    // Create the runtime in which the test will run; MockRef is preferred, with CpuAcc as the fallback.
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Builds up the structure of the network: output = pooling(input0) + input1.
+    INetworkPtr net(INetwork::Create());
+
+    Pooling2dDescriptor desc;
+
+    IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
+    IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
+    IConnectableLayer* pooling = net->AddPooling2dLayer(desc, "pooling");
+    IConnectableLayer* add = net->AddAdditionLayer("add");
+    IConnectableLayer* output = net->AddOutputLayer(0, "output");
+
+    input0->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
+    input1->GetOutputSlot(0).Connect(add->GetInputSlot(1));
+    pooling->GetOutputSlot(0).Connect(add->GetInputSlot(0));
+    add->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo inputInfo = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+
+    input0->GetOutputSlot(0).SetTensorInfo(inputInfo);
+    input1->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+    pooling->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+    add->GetOutputSlot(0).SetTensorInfo(poolingInfo);
+
+    // Optimize the network, preferring MockRef over CpuAcc
+    std::vector<BackendId> backends = { "MockRef", Compute::CpuAcc };
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec());
+
+    OptimizedNetwork* optNetObjPtr = PolymorphicDowncast<OptimizedNetwork*>(optNet.get());
+    Graph& graph = optNetObjPtr->GetGraph();
+
+    armnn::Layer* const layer0 = GetFirstLayerWithName(graph, "input0");
+    armnn::Layer* const layer1 = GetFirstLayerWithName(graph, "input1");
+    armnn::Layer* const layer2 = GetFirstLayerWithName(graph, "pooling");
+    armnn::Layer* const layer3 = GetFirstLayerWithName(graph, "[ pooling (0) -> add (0) ]");
+    armnn::Layer* const layer4 = GetFirstLayerWithName(graph, "add");
+    armnn::Layer* const layer5 = GetFirstLayerWithName(graph, "output");
+
+    // Checks order is valid.
+    BOOST_TEST(CheckOrder(graph, layer0, layer1));
+    BOOST_TEST(CheckOrder(graph, layer1, layer2));
+    BOOST_TEST(CheckOrder(graph, layer2, layer3));
+    BOOST_TEST(CheckOrder(graph, layer3, layer4));
+    BOOST_TEST(CheckOrder(graph, layer4, layer5));
+
+    // Load it into the runtime and verify that loading succeeded.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    INetworkProperties networkProperties(true, true);
+    const Status loadStatus = runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+    BOOST_TEST((loadStatus == Status::Success));
+
+    // Creates structures for input & output
+    std::vector<float> inputData0
+    {
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f
+    };
+    std::vector<float> inputData1
+    {
+        -1.0f, 3.0f
+    };
+
+    std::vector<float> outputData(2);
+
+    std::vector<float> expectedOutput
+    {
+        5.0f, 15.0f
+    };
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) }
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data()) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Contains CopyMemGeneric between the backends
+    std::size_t found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contains SyncMemGeneric for the output
+    found = dump.find("SyncMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Does not contain ImportMemGeneric
+    found = dump.find("ImportMemGeneric");
+    BOOST_TEST(found == std::string::npos);
+
+    // A MemCopy layer (not MemImport) is expected between pooling and add
+    BOOST_TEST((layer3->GetType() == LayerType::MemCopy));
+
+    // Check output is as expected
+    BOOST_TEST(outputData == expectedOutput);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/neon/test/NeonTensorHandleTests.cpp b/src/backends/neon/test/NeonTensorHandleTests.cpp
index 8b3e3fd..c6a562f 100644
--- a/src/backends/neon/test/NeonTensorHandleTests.cpp
+++ b/src/backends/neon/test/NeonTensorHandleTests.cpp
@@ -12,6 +12,7 @@
 #include <armnn/utility/PolymorphicDowncast.hpp>
 
 #include <test/GraphUtils.hpp>
+#include <arm_compute/runtime/Allocator.h>
 
 #include <boost/test/unit_test.hpp>
 
@@ -160,4 +161,77 @@
     }
 }
 
+BOOST_AUTO_TEST_CASE(NeonTensorHandleFactoryMemoryManaged)
+{
+    std::shared_ptr<NeonMemoryManager> memoryManager = std::make_shared<NeonMemoryManager>(
+        std::make_unique<arm_compute::Allocator>(),
+        BaseMemoryManager::MemoryAffinity::Offset);
+    NeonTensorHandleFactory handleFactory(memoryManager);
+    TensorInfo info({ 1, 1, 2, 1 }, DataType::Float32);
+
+    // Create a memory-managed TensorHandle (second argument true) for a two-element Float32 tensor
+    auto handle = handleFactory.CreateTensorHandle(info, true);
+    handle->Manage();
+    handle->Allocate();
+
+    memoryManager->Acquire();
+    {
+        float* buffer = reinterpret_cast<float*>(handle->Map());
+        BOOST_CHECK(buffer != nullptr); // Yields a valid pointer
+        buffer[0] = 1.5f;
+        buffer[1] = 2.5f;
+        BOOST_CHECK(buffer[0] == 1.5f); // Memory is writable and readable
+        BOOST_CHECK(buffer[1] == 2.5f); // Memory is writable and readable
+    }
+    memoryManager->Release();
+
+    memoryManager->Acquire();
+    {
+        float* buffer = reinterpret_cast<float*>(handle->Map());
+        BOOST_CHECK(buffer != nullptr); // Yields a valid pointer again after Release() and a second Acquire()
+        buffer[0] = 3.5f;
+        buffer[1] = 4.5f;
+        BOOST_CHECK(buffer[0] == 3.5f); // Memory is writable and readable
+        BOOST_CHECK(buffer[1] == 4.5f); // Memory is writable and readable
+    }
+    memoryManager->Release();
+
+    float testPtr[2] = { 2.5f, 5.5f };
+    // Import must be rejected: import is disabled for this memory-managed handle
+    BOOST_CHECK(!handle->Import(static_cast<void*>(testPtr), MemorySource::Malloc));
+}
+
+BOOST_AUTO_TEST_CASE(NeonTensorHandleFactoryImport)
+{
+    std::shared_ptr<NeonMemoryManager> memoryManager = std::make_shared<NeonMemoryManager>(
+        std::make_unique<arm_compute::Allocator>(),
+        BaseMemoryManager::MemoryAffinity::Offset);
+    NeonTensorHandleFactory handleFactory(memoryManager);
+    TensorInfo info({ 1, 1, 2, 1 }, DataType::Float32);
+
+    // Create a TensorHandle without memory management (second argument false)
+    auto handle = handleFactory.CreateTensorHandle(info, false);
+    handle->Manage();
+    handle->Allocate();
+    memoryManager->Acquire();
+
+    // No buffer is allocated when import is enabled, so the underlying tensor reports a null buffer
+    BOOST_CHECK((PolymorphicDowncast<NeonTensorHandle*>(handle.get()))->GetTensor().buffer() == nullptr);
+
+    float testPtr[2] = { 2.5f, 5.5f };
+    // Importing the caller-owned array must succeed
+    BOOST_CHECK(handle->Import(static_cast<void*>(testPtr), MemorySource::Malloc));
+    float* buffer = reinterpret_cast<float*>(handle->Map());
+    BOOST_CHECK(buffer != nullptr); // Yields a valid pointer after import
+    BOOST_CHECK(buffer == testPtr); // buffer is pointing to testPtr
+    // The imported memory holds the original values and is writable and readable through the handle
+    BOOST_CHECK(buffer[0] == 2.5f);
+    BOOST_CHECK(buffer[1] == 5.5f);
+    buffer[0] = 3.5f;
+    buffer[1] = 10.0f;
+    BOOST_CHECK(buffer[0] == 3.5f);
+    BOOST_CHECK(buffer[1] == 10.0f);
+    memoryManager->Release();
+}
+
 BOOST_AUTO_TEST_SUITE_END()