IVGCVSW-5818 Enable import on GPU

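Register ClImportTensorHandleFactory in ClBackend next to the existing
ClTensorHandleFactory, add CreateWorkloadFactory and
RegisterTensorHandleFactories overloads that take input and output
MemorySourceFlags, and implement SupportsMapUnmap and GetCapabilities
on the import factory. Re-enable the previously disabled Cl fallback
tests and add a ClImportEndToEnd test.

A minimal sketch of how an application enables import on GpuAcc,
mirroring the ClImportEndToEnd test added below (the runtime, network
and aligned buffers are assumed to be set up as in that test):

    // Optimize with import enabled, then load with Malloc memory sources
    OptimizerOptions optOptions;
    optOptions.m_ImportEnabled = true;
    std::vector<armnn::BackendId> backends = { armnn::Compute::GpuAcc };
    IOptimizedNetworkPtr optNet =
        Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);

    NetworkId netId;
    std::string errorMessage;
    INetworkProperties networkProperties(false, MemorySource::Malloc,
                                         MemorySource::Malloc);
    runtime->LoadNetwork(netId, std::move(optNet), errorMessage,
                         networkProperties);

Imported input and output buffers must be aligned to the CL device
cache-line size; the tests below over-allocate and use std::align to
satisfy this.
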
Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
Change-Id: I4e4eb107aa2bfa09625840d738001f33152e6792
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index f97cb4b..35770d9 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -4,12 +4,13 @@
 //
 
 #include "ClBackend.hpp"
+#include "ClBackendContext.hpp"
 #include "ClBackendId.hpp"
 #include "ClBackendModelContext.hpp"
-#include "ClWorkloadFactory.hpp"
-#include "ClBackendContext.hpp"
+#include "ClImportTensorHandleFactory.hpp"
 #include "ClLayerSupport.hpp"
 #include "ClTensorHandleFactory.hpp"
+#include "ClWorkloadFactory.hpp"
 
 #include <armnn/BackendRegistry.hpp>
 #include <armnn/Descriptors.hpp>
@@ -71,6 +72,8 @@
 
     registry.RegisterMemoryManager(memoryManager);
     registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager));
+    registry.RegisterFactory(std::make_unique<ClImportTensorHandleFactory>(
+        static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc)));
 
     return std::make_unique<ClWorkloadFactory>(
             PolymorphicPointerDowncast<ClMemoryManager>(memoryManager));
@@ -83,6 +86,24 @@
 
     registry.RegisterMemoryManager(memoryManager);
     registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager));
+    registry.RegisterFactory(std::make_unique<ClImportTensorHandleFactory>(
+        static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc)));
+
+    return std::make_unique<ClWorkloadFactory>(
+        PolymorphicPointerDowncast<ClMemoryManager>(memoryManager), CreateBackendSpecificModelContext(modelOptions));
+}
+
+IBackendInternal::IWorkloadFactoryPtr ClBackend::CreateWorkloadFactory(
+    TensorHandleFactoryRegistry& registry,
+    const ModelOptions& modelOptions,
+    MemorySourceFlags inputFlags,
+    MemorySourceFlags outputFlags) const
+{
+    auto memoryManager = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+
+    registry.RegisterMemoryManager(memoryManager);
+    registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(memoryManager));
+    registry.RegisterFactory(std::make_unique<ClImportTensorHandleFactory>(inputFlags, outputFlags));
 
     return std::make_unique<ClWorkloadFactory>(
         PolymorphicPointerDowncast<ClMemoryManager>(memoryManager), CreateBackendSpecificModelContext(modelOptions));
@@ -90,7 +111,8 @@
 
 std::vector<ITensorHandleFactory::FactoryId> ClBackend::GetHandleFactoryPreferences() const
 {
-    return std::vector<ITensorHandleFactory::FactoryId> {ClTensorHandleFactory::GetIdStatic()};
+    return std::vector<ITensorHandleFactory::FactoryId> {ClTensorHandleFactory::GetIdStatic(),
+                                                         ClImportTensorHandleFactory::GetIdStatic()};
 }
 
 void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry)
@@ -99,6 +121,19 @@
 
     registry.RegisterMemoryManager(mgr);
     registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(mgr));
+    registry.RegisterFactory(std::make_unique<ClImportTensorHandleFactory>(
+        static_cast<MemorySourceFlags>(MemorySource::Malloc), static_cast<MemorySourceFlags>(MemorySource::Malloc)));
+}
+
+void ClBackend::RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
+                                              MemorySourceFlags inputFlags,
+                                              MemorySourceFlags outputFlags)
+{
+    auto mgr = std::make_shared<ClMemoryManager>(std::make_unique<arm_compute::CLBufferAllocator>());
+
+    registry.RegisterMemoryManager(mgr);
+    registry.RegisterFactory(std::make_unique<ClTensorHandleFactory>(mgr));
+    registry.RegisterFactory(std::make_unique<ClImportTensorHandleFactory>(inputFlags, outputFlags));
 }
 
 IBackendInternal::IBackendContextPtr ClBackend::CreateBackendContext(const IRuntime::CreationOptions& options) const
diff --git a/src/backends/cl/ClBackend.hpp b/src/backends/cl/ClBackend.hpp
index f9a5745..252d87e 100644
--- a/src/backends/cl/ClBackend.hpp
+++ b/src/backends/cl/ClBackend.hpp
@@ -30,16 +30,25 @@
     IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(
         TensorHandleFactoryRegistry& registry) const override;
 
-    IWorkloadFactoryPtr CreateWorkloadFactory( const IMemoryManagerSharedPtr& memoryManager,
-                                               const ModelOptions& modelOptions) const override;
+    IWorkloadFactoryPtr CreateWorkloadFactory(const IMemoryManagerSharedPtr& memoryManager,
+                                              const ModelOptions& modelOptions) const override;
 
     IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry,
                                               const ModelOptions& modelOptions) const override;
 
+    IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry,
+                                              const ModelOptions& modelOptions,
+                                              MemorySourceFlags inputFlags,
+                                              MemorySourceFlags outputFlags) const override;
+
     std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;
 
     void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override;
 
+    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,
+                                       MemorySourceFlags inputFlags,
+                                       MemorySourceFlags outputFlags) override;
+
     IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override;
     IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext(
         const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override;
diff --git a/src/backends/cl/ClImportTensorHandleFactory.cpp b/src/backends/cl/ClImportTensorHandleFactory.cpp
index 594e054..26d5f9c 100644
--- a/src/backends/cl/ClImportTensorHandleFactory.cpp
+++ b/src/backends/cl/ClImportTensorHandleFactory.cpp
@@ -106,6 +106,11 @@
     return true;
 }
 
+bool ClImportTensorHandleFactory::SupportsMapUnmap() const
+{
+    return false;
+}
+
 MemorySourceFlags ClImportTensorHandleFactory::GetExportFlags() const
 {
     return m_ExportFlags;
@@ -116,4 +121,19 @@
     return m_ImportFlags;
 }
 
+std::vector<Capability> ClImportTensorHandleFactory::GetCapabilities(const IConnectableLayer* layer,
+                                                                     const IConnectableLayer* connectedLayer,
+                                                                     CapabilityClass capabilityClass)
+{
+    IgnoreUnused(layer);
+    IgnoreUnused(connectedLayer);
+    std::vector<Capability> capabilities;
+    if (capabilityClass == CapabilityClass::FallbackImportDisabled)
+    {
+        Capability fallbackImportCapability(CapabilityClass::FallbackImportDisabled, true);
+        capabilities.push_back(fallbackImportCapability);
+    }
+    return capabilities;
+}
+
 }    // namespace armnn
\ No newline at end of file
diff --git a/src/backends/cl/ClImportTensorHandleFactory.hpp b/src/backends/cl/ClImportTensorHandleFactory.hpp
index ee2f84e..7e22949 100644
--- a/src/backends/cl/ClImportTensorHandleFactory.hpp
+++ b/src/backends/cl/ClImportTensorHandleFactory.hpp
@@ -58,10 +58,16 @@
 
     bool SupportsSubTensors() const override;
 
+    bool SupportsMapUnmap() const override;
+
     MemorySourceFlags GetExportFlags() const override;
 
     MemorySourceFlags GetImportFlags() const override;
 
+    std::vector<Capability> GetCapabilities(const IConnectableLayer* layer,
+                                            const IConnectableLayer* connectedLayer,
+                                            CapabilityClass capabilityClass) override;
+
 private:
     MemorySourceFlags m_ImportFlags;
     MemorySourceFlags m_ExportFlags;
diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk
index 976f614..e6c289c 100644
--- a/src/backends/cl/backend.mk
+++ b/src/backends/cl/backend.mk
@@ -20,6 +20,7 @@
         ClContextControl.cpp \
         ClContextDeserializer.cpp \
         ClContextSerializer.cpp \
+        ClImportTensorHandleFactory.cpp \
         ClLayerSupport.cpp \
         ClRegistryInitializer.cpp \
         ClTensorHandleFactory.cpp \
diff --git a/src/backends/cl/test/ClFallbackTests.cpp b/src/backends/cl/test/ClFallbackTests.cpp
index eec3afe..183b8ca 100644
--- a/src/backends/cl/test/ClFallbackTests.cpp
+++ b/src/backends/cl/test/ClFallbackTests.cpp
@@ -11,7 +11,7 @@
 
 BOOST_AUTO_TEST_SUITE(ClFallback)
 
-BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackToNeon, * boost::unit_test::disabled())
+BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackToNeon)
 {
     using namespace armnn;
 
@@ -34,7 +34,7 @@
     add->GetOutputSlot(0).Connect(sub->GetInputSlot(1));
     sub->GetOutputSlot(0).Connect(output->GetInputSlot(0));
 
-    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
+    TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32);
 
     input0->GetOutputSlot(0).SetTensorInfo(info);
     input1->GetOutputSlot(0).SetTensorInfo(info);
@@ -82,30 +82,49 @@
     runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
 
     // Creates structures for input & output
-    std::vector<float> inputData0
+    std::vector<float> inputValue0
     {
-        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f
     };
-    std::vector<float> inputData1
+    std::vector<float> inputValue1
     {
-        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f
     };
     std::vector<float> inputData2
     {
-        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f
     };
 
-    std::vector<float> outputData(12);
+    std::vector<float> outputData(16);
 
     std::vector<float> expectedOutput
     {
-        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f
+        11.0f, 9.0f, 7.0f, 5.0f, 3.0f, 1.0f, -1.0f, -3.0f, -5.0f, -7.0f, -9.0f, -11.0f, 11.0f, 9.0f, 7.0f, 5.0f
     };
 
+    // Prepare aligned data
+    unsigned int numElements = info.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+    const size_t alignment = 64;
+    size_t space = totalBytes + alignment + alignment; // extra headroom so std::align can always succeed
+    auto inputData0 = std::make_unique<uint8_t[]>(space);
+    void* alignedInputPtr0 = inputData0.get();
+    BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr0, space));
+
+    auto* inputPtr0 = reinterpret_cast<float*>(alignedInputPtr0);
+    std::copy(inputValue0.begin(), inputValue0.end(), inputPtr0);
+
+    auto inputData1 = std::make_unique<uint8_t[]>(space);
+    void* alignedInputPtr1 = inputData1.get();
+    BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr1, space));
+
+    auto* inputPtr1 = reinterpret_cast<float*>(alignedInputPtr1);
+    std::copy(inputValue1.begin(), inputValue1.end(), inputPtr1);
+
     InputTensors inputTensors
     {
-        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
-        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputPtr0) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), alignedInputPtr1) },
         { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
     };
     OutputTensors outputTensors
@@ -134,6 +153,8 @@
 
     // Check output is as expected
     BOOST_TEST(outputData == expectedOutput);
+
+    runtime->UnloadNetwork(netId);
 }
 
 BOOST_AUTO_TEST_CASE(ClImportDisabledFallbackToNeon)
@@ -258,7 +279,7 @@
     BOOST_TEST(outputData == expectedOutput);
 }
 
-BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackSubgraphToNeon, * boost::unit_test::disabled())
+BOOST_AUTO_TEST_CASE(ClImportEnabledFallbackSubgraphToNeon)
 {
     using namespace armnn;
 
@@ -269,6 +290,10 @@
     INetworkPtr net(INetwork::Create());
 
     Pooling2dDescriptor desc;
+    desc.m_PoolWidth = 2;
+    desc.m_PoolHeight = 2;
+    desc.m_StrideX = 2;
+    desc.m_StrideY = 2;
 
     IConnectableLayer* input0 = net->AddInputLayer(0, "input0");
     IConnectableLayer* input1 = net->AddInputLayer(1, "input1");
@@ -285,8 +310,8 @@
     sub->GetOutputSlot(0).Connect(pooling->GetInputSlot(0));
     pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0));
 
-    TensorInfo info = TensorInfo({ 1, 2, 3, 2 }, DataType::Float32);
-    TensorInfo poolingInfo = TensorInfo({ 1, 2, 1, 1 }, DataType::Float32);
+    TensorInfo info = TensorInfo({ 1, 2, 4, 2 }, DataType::Float32);
+    TensorInfo poolingInfo = TensorInfo({ 1, 2, 2, 1 }, DataType::Float32);
 
     input0->GetOutputSlot(0).SetTensorInfo(info);
     input1->GetOutputSlot(0).SetTensorInfo(info);
@@ -340,27 +365,45 @@
     runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
 
     // Creates structures for input & output
-    std::vector<float> inputData0
+    std::vector<float> inputValue0
     {
-        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f
+        1.0f, 1.0f, 2.0f, 2.0f, 2.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 6.0f, 1.0f, 1.0f, 2.0f, 2.0f
     };
-    std::vector<float> inputData1
+    std::vector<float> inputValue1
     {
-        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f
+        0.0f, 1.0f, 1.0f, 2.0f, 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 5.0f, 5.0f, 6.0f, 0.0f, 1.0f, 1.0f, 2.0f
     };
     std::vector<float> inputData2
     {
-        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+        12.0f, 11.0f, 10.0f, 9.0f, 8.0f, 7.0f, 6.0f, 5.0f, 4.0f, 3.0f, 2.0f, 1.0f, 12.0f, 11.0f, 10.0f, 9.0f
     };
 
-    std::vector<float> outputData(2);
+    std::vector<float> outputData(4);
 
-    std::vector<float> expectedOutput{ 11.0f, -1.0f };
+    std::vector<float> expectedOutput{ 11.0f, 3.0f, -5.0f, 11.0f };
+
+    unsigned int numElements = info.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+    const size_t alignment = 64;
+    size_t space = totalBytes + alignment + alignment;
+    auto inputData0 = std::make_unique<uint8_t[]>(space);
+    void* alignedInputPtr0 = inputData0.get();
+    BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr0, space));
+
+    auto* inputPtr0 = reinterpret_cast<float*>(alignedInputPtr0);
+    std::copy(inputValue0.begin(), inputValue0.end(), inputPtr0);
+
+    auto inputData1 = std::make_unique<uint8_t[]>(space);
+    void* alignedInputPtr1 = inputData1.get();
+    BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr1, space));
+
+    auto* inputPtr1 = reinterpret_cast<float*>(alignedInputPtr1);
+    std::copy(inputValue1.begin(), inputValue1.end(), inputPtr1);
 
     InputTensors inputTensors
     {
-        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData0.data()) },
-        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), inputData1.data()) },
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputPtr0) },
+        { 1, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 1), alignedInputPtr1) },
         { 2, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 2), inputData2.data()) }
     };
     OutputTensors outputTensors
@@ -393,6 +436,8 @@
 
     // Check output is as expected
     BOOST_TEST(outputData == expectedOutput);
+
+    runtime->UnloadNetwork(netId);
 }
 
 BOOST_AUTO_TEST_CASE(ClImportDisableFallbackSubgraphToNeon)
diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp
index bfb74af..85ff35f 100644
--- a/src/backends/cl/test/ClImportTensorHandleTests.cpp
+++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp
@@ -11,6 +11,9 @@
 
 #include <boost/test/unit_test.hpp>
 
+#include <armnn/IRuntime.hpp>
+#include <armnn/INetwork.hpp>
+
 using namespace armnn;
 
 BOOST_AUTO_TEST_SUITE(ClImportTensorHandleTests)
@@ -38,7 +41,7 @@
     const size_t totalBytes = tensor.info()->total_size();
     const size_t alignment =
         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
-    size_t space = totalBytes + alignment;
+    size_t space = totalBytes + alignment + alignment; // extra headroom so std::align can always succeed
     auto testData = std::make_unique<uint8_t[]>(space);
     void* alignedPtr = testData.get();
     BOOST_CHECK(std::align(alignment, totalBytes, alignedPtr, space));
@@ -57,7 +60,7 @@
     // Validate result by checking that the output has no negative values
     for(unsigned int i = 0; i < numElements; ++i)
     {
-        BOOST_ASSERT(typedPtr[i] >= 0);
+        BOOST_TEST(typedPtr[i] >= 0);
     }
 }
 
@@ -78,7 +81,7 @@
     const size_t totalBytes = tensor.info()->total_size();
     const size_t alignment =
         arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
-    size_t space = totalBytes + alignment;
+    size_t space = totalBytes + alignment + alignment;
     auto testData = std::make_unique<uint8_t[]>(space);
     void* alignedPtr = testData.get();
     BOOST_CHECK(std::align(alignment, totalBytes, alignedPtr, space));
@@ -108,4 +111,105 @@
     BOOST_CHECK_THROW(handle->Import(inputData.data(), invalidMemSource), MemoryImportException);
 }
 
-BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
+BOOST_FIXTURE_TEST_CASE(ClImportEndToEnd, ClContextControlFixture)
+{
+    // Create runtime in which test will run
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(armnn::IRuntime::Create(options));
+
+    // build up the structure of the network
+    INetworkPtr net(INetwork::Create());
+
+    IConnectableLayer* input = net->AddInputLayer(0, "Input");
+
+    ActivationDescriptor descriptor;
+    descriptor.m_Function = ActivationFunction::ReLu;
+    IConnectableLayer* activation = net->AddActivationLayer(descriptor, "Activation");
+
+    IConnectableLayer* output = net->AddOutputLayer(0, "Output");
+
+    input->GetOutputSlot(0).Connect(activation->GetInputSlot(0));
+    activation->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    TensorInfo tensorInfo = TensorInfo({ 1, 24, 16, 3 }, DataType::Float32);
+    unsigned int numElements = tensorInfo.GetNumElements();
+    size_t totalBytes = numElements * sizeof(float);
+
+    input->GetOutputSlot(0).SetTensorInfo(tensorInfo);
+    activation->GetOutputSlot(0).SetTensorInfo(tensorInfo);
+
+    // Optimize the network
+    OptimizerOptions optOptions;
+    optOptions.m_ImportEnabled = true;
+    std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
+    IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec(), optOptions);
+    BOOST_CHECK(optNet);
+
+    // Loads it into the runtime.
+    NetworkId netId;
+    std::string ignoredErrorMessage;
+    // Enable Importing
+    INetworkProperties networkProperties(false, MemorySource::Malloc, MemorySource::Malloc);
+    runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+    // Creates structures for input & output
+    const size_t alignment =
+        arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+    size_t space = totalBytes + alignment + alignment;
+    auto inputData = std::make_unique<uint8_t[]>(space);
+    void* alignedInputPtr = inputData.get();
+    BOOST_CHECK(std::align(alignment, totalBytes, alignedInputPtr, space));
+
+    // Input with negative values
+    auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+    std::fill_n(inputPtr, numElements, -5.0f);
+
+    auto outputData = std::make_unique<uint8_t[]>(space);
+    void* alignedOutputPtr = outputData.get();
+    BOOST_CHECK(std::align(alignment, totalBytes, alignedOutputPtr, space));
+
+    InputTensors inputTensors
+    {
+        { 0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), alignedInputPtr) },
+    };
+    OutputTensors outputTensors
+    {
+        { 0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr) }
+    };
+
+    runtime->GetProfiler(netId)->EnableProfiling(true);
+
+    // Do the inference
+    runtime->EnqueueWorkload(netId, inputTensors, outputTensors);
+
+    // Retrieve the Profiler.Print() output to get the workload execution
+    ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+    std::stringstream ss;
+    profilerManager.GetProfiler()->Print(ss);
+    std::string dump = ss.str();
+
+    // Contains ActivationWorkload
+    std::size_t found = dump.find("ActivationWorkload");
+    BOOST_TEST(found != std::string::npos);
+
+    // Contains SyncMemGeneric
+    found = dump.find("SyncMemGeneric");
+    BOOST_TEST(found != std::string::npos);
+
+    // Does not contain CopyMemGeneric
+    found = dump.find("CopyMemGeneric");
+    BOOST_TEST(found == std::string::npos);
+
+    // Check output is as expected
+    // Validate result by checking that the output has no negative values
+    auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
+    BOOST_TEST(outputResult);
+    for(unsigned int i = 0; i < numElements; ++i)
+    {
+        BOOST_TEST(outputResult[i] >= 0);
+    }
+
+    runtime->UnloadNetwork(netId);
+}
+
+BOOST_AUTO_TEST_SUITE_END()