IVGCVSW-5790 Merge async prototype

 * Added thread-safe execution mechanism for armnn
 * Removed duplicate function bool Compare(T a, T b, float tolerance)
 * Added StridedSliceAsyncEndToEndTest
 * Fixed memory leak

Signed-off-by: Mike Kelly <mike.kelly@arm.com>
Change-Id: I2d367fc77ee7c01b8953138543e76af5e691211f
diff --git a/src/backends/backendsCommon/MemCopyWorkload.cpp b/src/backends/backendsCommon/MemCopyWorkload.cpp
index 7bdc05e..813adef 100644
--- a/src/backends/backendsCommon/MemCopyWorkload.cpp
+++ b/src/backends/backendsCommon/MemCopyWorkload.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -40,7 +40,7 @@
 
 
 CopyMemGenericWorkload::CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor,
-                                                         const WorkloadInfo& info)
+                                               const WorkloadInfo& info)
     : BaseWorkload<MemCopyQueueDescriptor>(descriptor, info)
 {
     GatherTensorHandlePairs(descriptor, m_TensorHandlePairs);
@@ -61,4 +61,21 @@
     }
 }
 
+void CopyMemGenericWorkload::ExecuteAsync(WorkingMemDescriptor& descriptor)
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CopyMemGeneric_Execute_WorkingMemDescriptor");
+    std::vector<TensorHandlePair> tensorHandlePairs;
+    GatherTensorHandlePairs(descriptor, tensorHandlePairs);
+
+    auto copyFunc = [](void* dst, const void* src, size_t size)
+    {
+        memcpy(dst, src, size);
+    };
+
+    for (const auto& pair : tensorHandlePairs)
+    {
+        CopyTensorContentsGeneric(pair.first, pair.second, copyFunc);
+    }
+}
+
 } //namespace armnn
diff --git a/src/backends/backendsCommon/MemCopyWorkload.hpp b/src/backends/backendsCommon/MemCopyWorkload.hpp
index 6529286..12664fd 100644
--- a/src/backends/backendsCommon/MemCopyWorkload.hpp
+++ b/src/backends/backendsCommon/MemCopyWorkload.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
@@ -19,6 +19,7 @@
 public:
     CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& descriptor) override;
 
 private:
     using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>;
diff --git a/src/backends/backendsCommon/MemSyncWorkload.cpp b/src/backends/backendsCommon/MemSyncWorkload.cpp
index b29c46e..fe04a30 100644
--- a/src/backends/backendsCommon/MemSyncWorkload.cpp
+++ b/src/backends/backendsCommon/MemSyncWorkload.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -14,7 +14,7 @@
 {
 
 SyncMemGenericWorkload::SyncMemGenericWorkload(const MemSyncQueueDescriptor& descriptor,
-                                                         const WorkloadInfo& info)
+                                               const WorkloadInfo& info)
     : BaseWorkload<MemSyncQueueDescriptor>(descriptor, info)
 {
     m_TensorHandle = descriptor.m_Inputs[0];
@@ -27,4 +27,11 @@
     m_TensorHandle->Unmap();
 }
 
+void SyncMemGenericWorkload::ExecuteAsync(WorkingMemDescriptor& descriptor)
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric_Execute_WorkingMemDescriptor");
+    descriptor.m_Inputs[0]->Map(true);
+    descriptor.m_Inputs[0]->Unmap();
+}
+
 } //namespace armnn
diff --git a/src/backends/backendsCommon/MemSyncWorkload.hpp b/src/backends/backendsCommon/MemSyncWorkload.hpp
index 0d44788..8142f18 100644
--- a/src/backends/backendsCommon/MemSyncWorkload.hpp
+++ b/src/backends/backendsCommon/MemSyncWorkload.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
@@ -19,6 +19,7 @@
 public:
     SyncMemGenericWorkload(const MemSyncQueueDescriptor& descriptor, const WorkloadInfo& info);
     void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& descriptor) override;
 
 private:
     ITensorHandle* m_TensorHandle;
diff --git a/src/backends/backendsCommon/Workload.hpp b/src/backends/backendsCommon/Workload.hpp
index 482f9bd..940b878 100644
--- a/src/backends/backendsCommon/Workload.hpp
+++ b/src/backends/backendsCommon/Workload.hpp
@@ -1,11 +1,12 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
 
 #include "WorkloadData.hpp"
 #include "WorkloadInfo.hpp"
+#include "WorkingMemDescriptor.hpp"
 
 #include <armnn/backends/IWorkload.hpp>
 #include <Profiling.hpp>
@@ -36,6 +37,8 @@
         m_Data.Validate(info);
     }
 
+    void ExecuteAsync(WorkingMemDescriptor&) override {};
+
     void PostAllocationConfigure() override {}
 
     const QueueDescriptor& GetData() const { return m_Data; }
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index d3857b8..9d36f52 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -51,6 +51,7 @@
     SpaceToDepthEndToEndTestImpl.cpp
     SpaceToDepthEndToEndTestImpl.hpp
     SplitterEndToEndTestImpl.hpp
+    StridedSliceAsyncEndToEndTest.hpp
     TensorCopyUtils.cpp
     TensorCopyUtils.hpp
     WorkloadFactoryHelper.hpp
diff --git a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
index 9ce4201..3a757d0 100644
--- a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
+++ b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
@@ -4,6 +4,8 @@
 //
 #pragma once
 
+#include "CommonTestUtils.hpp"
+
 #include <armnn/Descriptors.hpp>
 #include <armnn/INetwork.hpp>
 #include <armnn/IRuntime.hpp>
@@ -105,23 +107,6 @@
     );
 }
 
-// Utility template for comparing tensor elements
-template<DataType ArmnnType, typename T = ResolveType<ArmnnType>>
-bool Compare(T a, T b, float tolerance = 0.000001f)
-{
-    if (ArmnnType == DataType::Boolean)
-    {
-        // NOTE: Boolean is represented as uint8_t (with zero equals
-        // false and everything else equals true), therefore values
-        // need to be casted to bool before comparing them
-        return static_cast<bool>(a) == static_cast<bool>(b);
-    }
-
-    // NOTE: All other types can be cast to float and compared with
-    // a certain level of tolerance
-    return std::fabs(static_cast<float>(a) - static_cast<float>(b)) <= tolerance;
-}
-
 // Utility function to find the number of instances of a substring within a string.
 int SubStringCounter(std::string& string, std::string&& substring)
 {
diff --git a/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
new file mode 100644
index 0000000..2ccd2b1
--- /dev/null
+++ b/src/backends/backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp
@@ -0,0 +1,178 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <ResolveType.hpp>
+
+#include <armnn/IWorkingMemHandle.hpp>
+#include <armnn/INetwork.hpp>
+
+#include <backendsCommon/test/CommonTestUtils.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+#include <vector>
+
+namespace armnn
+{
+
+namespace experimental
+{
+
+template<DataType ArmnnIType, DataType ArmnnOType,
+        typename TInput = ResolveType <ArmnnIType>, typename TOutput = ResolveType <ArmnnOType>>
+void AsyncEndToEndTestImpl(INetworkPtr network,
+                           const std::map<int, std::vector<TInput>>& inputTensorData,
+                           const std::map<int, std::vector<TOutput>>& expectedOutputData,
+                           std::vector<BackendId> backends,
+                           float tolerance = 0.000001f)
+{
+    // Create Runtime in which test will run
+    IRuntime::CreationOptions options;
+    IRuntimePtr runtime(IRuntime::Create(options));
+
+    // Optimize the Network
+    IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime->GetDeviceSpec());
+
+    // Creates AsyncNetwork
+    NetworkId networkId = 0;
+    std::string errorMessage;
+    const INetworkProperties networkProperties;
+    auto asyncNetwork = runtime->CreateAsyncNetwork(networkId, std::move(optNet), errorMessage, networkProperties);
+
+    InputTensors inputTensors;
+    inputTensors.reserve(inputTensorData.size());
+    for (auto&& it : inputTensorData)
+    {
+        inputTensors.push_back({it.first,
+                                ConstTensor(asyncNetwork->GetInputTensorInfo(it.first), it.second.data())});
+    }
+
+    OutputTensors outputTensors;
+    outputTensors.reserve(expectedOutputData.size());
+    std::map<int, std::vector<TOutput>> outputStorage;
+    for (auto&& it : expectedOutputData)
+    {
+        std::vector<TOutput> out(it.second.size());
+        outputStorage.emplace(it.first, out);
+        outputTensors.push_back({it.first,
+                                 Tensor(asyncNetwork->GetOutputTensorInfo(it.first),
+                                        outputStorage.at(it.first).data())});
+    }
+
+    // Create WorkingMemHandle for this async network
+    std::unique_ptr<IWorkingMemHandle> workingMemHandle = asyncNetwork->CreateWorkingMemHandle();
+    IWorkingMemHandle& workingMemHandleRef = *workingMemHandle.get();
+
+    // Run the async network
+    asyncNetwork->Execute(inputTensors, outputTensors, workingMemHandleRef);
+
+    // Checks the results.
+    for (auto&& it : expectedOutputData)
+    {
+        std::vector<TOutput> out = outputStorage.at(it.first);
+        for (unsigned int i = 0; i < out.size(); ++i)
+        {
+            BOOST_CHECK(Compare<ArmnnOType>(it.second[i], out[i], tolerance) == true);
+        }
+    }
+}
+
+template<typename armnn::DataType DataType>
+INetworkPtr CreateStridedSliceNetwork(const TensorShape& inputShape,
+                                      const TensorShape& outputShape,
+                                      const std::vector<int>& beginData,
+                                      const std::vector<int>& endData,
+                                      const std::vector<int>& stridesData,
+                                      int beginMask = 0,
+                                      int endMask = 0,
+                                      int shrinkAxisMask = 0,
+                                      int ellipsisMask = 0,
+                                      int newAxisMask = 0,
+                                      const float qScale = 1.0f,
+                                      const int32_t qOffset = 0)
+{
+    using namespace armnn;
+    // Builds up the structure of the network.
+    INetworkPtr net(INetwork::Create());
+
+    TensorInfo inputTensorInfo(inputShape, DataType, qScale, qOffset);
+    TensorInfo outputTensorInfo(outputShape, DataType, qScale, qOffset);
+
+    armnn::StridedSliceDescriptor stridedSliceDescriptor;
+    stridedSliceDescriptor.m_Begin = beginData;
+    stridedSliceDescriptor.m_End = endData;
+    stridedSliceDescriptor.m_Stride = stridesData;
+    stridedSliceDescriptor.m_BeginMask = beginMask;
+    stridedSliceDescriptor.m_EndMask = endMask;
+    stridedSliceDescriptor.m_ShrinkAxisMask = shrinkAxisMask;
+    stridedSliceDescriptor.m_EllipsisMask = ellipsisMask;
+    stridedSliceDescriptor.m_NewAxisMask = newAxisMask;
+
+    IConnectableLayer* input = net->AddInputLayer(0, "Input_Layer");
+    IConnectableLayer* stridedSlice = net->AddStridedSliceLayer(stridedSliceDescriptor, "splitter");
+    IConnectableLayer* output = net->AddOutputLayer(0);
+
+    Connect(input, stridedSlice, inputTensorInfo, 0, 0);
+    Connect(stridedSlice, output, outputTensorInfo, 0, 0);
+
+    return net;
+}
+
+template<armnn::DataType ArmnnType>
+void StridedSlicedEndToEndTest(const std::vector<BackendId>& backends)
+{
+    using namespace armnn;
+    using T = ResolveType<ArmnnType>;
+
+    const TensorShape& inputShape = {3, 2, 3, 1};
+    const TensorShape& outputShape = {1, 2, 3, 1};
+    const std::vector<int>& beginData = {1, 0, 0, 0};
+    const std::vector<int>& endData = {2, 2, 3, 1};
+    const std::vector<int>& stridesData = {1, 1, 1, 1};
+    int beginMask = 0;
+    int endMask = 0;
+    int shrinkAxisMask = 0;
+    int ellipsisMask = 0;
+    int newAxisMask = 0;
+
+    // Builds up the structure of the network
+    INetworkPtr net = CreateStridedSliceNetwork<ArmnnType>(inputShape,
+                                                           outputShape,
+                                                           beginData,
+                                                           endData,
+                                                           stridesData,
+                                                           beginMask,
+                                                           endMask,
+                                                           shrinkAxisMask,
+                                                           ellipsisMask,
+                                                           newAxisMask);
+
+    BOOST_TEST_CHECKPOINT("create a network");
+
+    // Creates structures for input & output.
+    std::vector<T> inputData{
+            1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
+
+            3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
+
+            5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
+    };
+
+    std::vector<T> outputExpected{
+            3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f
+    };
+
+    std::map<int, std::vector<T>> inputTensorData = {{0, inputData}};
+    std::map<int, std::vector<T>> expectedOutputData = {{0, outputExpected}};
+
+    AsyncEndToEndTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends);
+}
+
+} // experimental namespace
+
+} // armnn namespace
+