IVGCVSW-7830 Add backend optimizations to remove Reshapes where possible

 * Added optimization to remove Reshape layers where possible for the
   Neon and Ref backends by using overridden TensorInfos (see the usage
   sketch after this message)
 * Added the ability to delete Subgraphs during optimization
 * Fixed a naming error in NeonEndToEndTests and CLEndToEndTests
 * Added LayerNameAndTypeCheck for testing
 * Fixed an error where layers were not marked as altered when removed
   in CLBackend

Signed-off-by: Mike Kelly <mike.kelly@arm.com>
Change-Id: I1ac25cd4ec9821470d961831ae2c8d24882276cc
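
A hypothetical usage sketch of the DecorateTensorHandle API added below
(the handle variable, shape and data type are illustrative, not taken
from this change): a decorator re-interprets an existing handle's memory
through an overridden TensorInfo, which is what allows a Reshape to be
elided without copying data.

    // 'handle' is assumed to be an existing, allocated RefTensorHandle*.
    armnn::TensorInfo overriddenInfo({1, 49}, armnn::DataType::Float32);
    std::shared_ptr<armnn::ITensorHandle> view =
        handle->DecorateTensorHandle(overriddenInfo);

    // Map() on the decorator delegates to the parent handle, so the same
    // underlying memory is read through the new shape.
    const void* data = view->Map(true);
    // ... use data ...
    view->Unmap();
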
diff --git a/src/backends/reference/RefBackend.cpp b/src/backends/reference/RefBackend.cpp
index 8c8879c..02749af 100644
--- a/src/backends/reference/RefBackend.cpp
+++ b/src/backends/reference/RefBackend.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -16,8 +16,6 @@
 #include <backendsCommon/DefaultAllocator.hpp>
 #include <backendsCommon/SubgraphUtils.hpp>
 
-#include <Optimizer.hpp>
-
 namespace armnn
 {
 
@@ -116,9 +114,16 @@
                 }
             }
         }
+
+        // Remove Reshape where possible
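+        // RemoveReshapeLayer (from backendsCommon/SubgraphUtils.hpp) records
+        // the Reshape's subgraph for deletion and overrides the TensorInfo on
+        // the neighbouring layer so the shape change is preserved without
+        // running a Reshape workload.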
+        if (base.GetType() == LayerType::Reshape)
+        {
+            ReshapeLayer* baseLayer = PolymorphicDowncast<ReshapeLayer*>(&base);
+            RemoveReshapeLayer(baseLayer, untouched, optimizationViews);
+        }
     }
 
-    if (optimizationViews.GetSubstitutions().empty())
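+    // Reshape removal deletes subgraphs rather than substituting them, so
+    // both lists must be empty before the subgraph is reported untouched.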
+    if (optimizationViews.GetSubstitutions().empty() && optimizationViews.GetDeletedSubgraphs().empty())
     {
         optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
     }
diff --git a/src/backends/reference/RefTensorHandle.cpp b/src/backends/reference/RefTensorHandle.cpp
index dbfa374..cce992c 100644
--- a/src/backends/reference/RefTensorHandle.cpp
+++ b/src/backends/reference/RefTensorHandle.cpp
@@ -1,29 +1,40 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017-2023 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
+
 #include "RefTensorHandle.hpp"
 
 namespace armnn
 {
 
-RefTensorHandle::RefTensorHandle(const TensorInfo &tensorInfo, std::shared_ptr<RefMemoryManager> &memoryManager):
+RefTensorHandle::RefTensorHandle(const TensorInfo& tensorInfo, std::shared_ptr<RefMemoryManager>& memoryManager):
     m_TensorInfo(tensorInfo),
     m_MemoryManager(memoryManager),
     m_Pool(nullptr),
     m_UnmanagedMemory(nullptr),
-    m_ImportedMemory(nullptr)
+    m_ImportedMemory(nullptr),
+    m_Decorated()
 {
-
 }
 
 RefTensorHandle::RefTensorHandle(const TensorInfo& tensorInfo)
                                  : m_TensorInfo(tensorInfo),
                                    m_Pool(nullptr),
                                    m_UnmanagedMemory(nullptr),
-                                   m_ImportedMemory(nullptr)
+                                   m_ImportedMemory(nullptr),
+                                   m_Decorated()
 {
+}
 
+RefTensorHandle::RefTensorHandle(const TensorInfo& tensorInfo, const RefTensorHandle& parent)
+        : m_TensorInfo(tensorInfo),
+          m_MemoryManager(parent.m_MemoryManager),
+          m_Pool(parent.m_Pool),
+          m_UnmanagedMemory(parent.m_UnmanagedMemory),
+          m_ImportedMemory(parent.m_ImportedMemory),
+          m_Decorated()
+{
 }
 
 RefTensorHandle::~RefTensorHandle()
@@ -139,4 +150,52 @@
     return false;
 }
 
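+// Creates a decorator that re-interprets this handle's memory through the
+// given TensorInfo. The decorator is stored in m_Decorated so it stays
+// alive for the lifetime of the parent handle.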
+std::shared_ptr<ITensorHandle> RefTensorHandle::DecorateTensorHandle(const TensorInfo& tensorInfo)
+{
+    auto decorated = std::make_shared<RefTensorHandleDecorator>(tensorInfo, *this);
+    m_Decorated.emplace_back(decorated);
+    return decorated;
+}
+
+RefTensorHandleDecorator::RefTensorHandleDecorator(const TensorInfo& tensorInfo, const RefTensorHandle& parent)
+: RefTensorHandle(tensorInfo)
+, m_TensorInfo(tensorInfo)
+, m_Parent(parent)
+{
+}
+
+void RefTensorHandleDecorator::Manage()
+{
+}
+
+void RefTensorHandleDecorator::Allocate()
+{
+}
+
+const void* RefTensorHandleDecorator::Map(bool unused) const
+{
+    return m_Parent.Map(unused);
+}
+
+MemorySourceFlags RefTensorHandleDecorator::GetImportFlags() const
+{
+    return static_cast<MemorySourceFlags>(MemorySource::Malloc);
+}
+
+bool RefTensorHandleDecorator::Import(void*, MemorySource)
+{
+    return false;
+}
+
+bool RefTensorHandleDecorator::CanBeImported(void*, MemorySource)
+{
+    return false;
+}
+
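+// A decorator cannot itself be decorated further, so nested
+// re-interpretation is not supported.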
+std::shared_ptr<ITensorHandle> RefTensorHandleDecorator::DecorateTensorHandle(const TensorInfo&)
+{
+    return nullptr;
+}
+
 }
diff --git a/src/backends/reference/RefTensorHandle.hpp b/src/backends/reference/RefTensorHandle.hpp
index b4dedd5..128f623 100644
--- a/src/backends/reference/RefTensorHandle.hpp
+++ b/src/backends/reference/RefTensorHandle.hpp
@@ -1,7 +1,8 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017-2023 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
+
 #pragma once
 
 #include <armnn/backends/TensorHandle.hpp>
@@ -11,14 +12,17 @@
 namespace armnn
 {
 
+class RefTensorHandleDecorator;
 // An implementation of ITensorHandle with simple "bump the pointer" memory-management behaviour
 class RefTensorHandle : public ITensorHandle
 {
 public:
-    RefTensorHandle(const TensorInfo& tensorInfo, std::shared_ptr<RefMemoryManager> &memoryManager);
+    RefTensorHandle(const TensorInfo& tensorInfo, std::shared_ptr<RefMemoryManager>& memoryManager);
 
     RefTensorHandle(const TensorInfo& tensorInfo);
 
+    RefTensorHandle(const TensorInfo& tensorInfo, const RefTensorHandle& parent);
+
     ~RefTensorHandle();
 
     virtual void Manage() override;
@@ -56,6 +60,8 @@
     virtual bool Import(void* memory, MemorySource source) override;
     virtual bool CanBeImported(void* memory, MemorySource source) override;
 
+    virtual std::shared_ptr<ITensorHandle> DecorateTensorHandle(const TensorInfo& tensorInfo) override;
+
 private:
     // Only used for testing
     void CopyOutTo(void*) const override;
@@ -68,10 +74,86 @@
 
     TensorInfo m_TensorInfo;
 
-    std::shared_ptr<RefMemoryManager> m_MemoryManager;
+    mutable std::shared_ptr<RefMemoryManager> m_MemoryManager;
     RefMemoryManager::Pool* m_Pool;
     mutable void* m_UnmanagedMemory;
     void* m_ImportedMemory;
+    std::vector<std::shared_ptr<RefTensorHandleDecorator>> m_Decorated;
+};
+
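+// Decorates a RefTensorHandle with an overridden TensorInfo: GetShape() and
+// GetStrides() reflect the new info while Map() delegates to the parent
+// handle, so the decorated view shares the parent's memory.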
+class RefTensorHandleDecorator : public RefTensorHandle
+{
+public:
+    RefTensorHandleDecorator(const TensorInfo& tensorInfo, const RefTensorHandle& parent);
+
+    ~RefTensorHandleDecorator() = default;
+
+    virtual void Manage() override;
+
+    virtual void Allocate() override;
+
+    virtual ITensorHandle* GetParent() const override
+    {
+        return nullptr;
+    }
+
+    virtual const void* Map(bool /* blocking = true */) const override;
+    using ITensorHandle::Map;
+
+    virtual void Unmap() const override
+    {}
+
+    TensorShape GetStrides() const override
+    {
+        return GetUnpaddedTensorStrides(m_TensorInfo);
+    }
+
+    TensorShape GetShape() const override
+    {
+        return m_TensorInfo.GetShape();
+    }
+
+    const TensorInfo& GetTensorInfo() const
+    {
+        return m_TensorInfo;
+    }
+
+    virtual MemorySourceFlags GetImportFlags() const override;
+
+    virtual bool Import(void* memory, MemorySource source) override;
+    virtual bool CanBeImported(void* memory, MemorySource source) override;
+
+    virtual std::shared_ptr<ITensorHandle> DecorateTensorHandle(const TensorInfo& tensorInfo) override;
+
+    /// Map the tensor data for access. Must be paired with call to Unmap().
+    /// \param blocking hint to block the calling thread until all other accesses are complete. (backend dependent)
+    /// \return pointer to the first element of the mapped data.
+    void* Map(bool blocking = true)
+    {
+        return const_cast<void*>(static_cast<const ITensorHandle*>(this)->Map(blocking));
+    }
+
+    /// Unmap the tensor data that was previously mapped with call to Map().
+    void Unmap()
+    {
+        return static_cast<const ITensorHandle*>(this)->Unmap();
+    }
+
+    /// Testing support to be able to verify and set tensor data content
+    void CopyOutTo(void* /* memory */) const override
+    {}
+
+    void CopyInFrom(const void* /* memory */) override
+    {}
+
+    /// Unimport externally allocated memory
+    void Unimport() override
+    {}
+
+private:
+    TensorInfo m_TensorInfo;
+    const RefTensorHandle& m_Parent;
 };
 
 }
diff --git a/src/backends/reference/test/RefCreateWorkloadTests.cpp b/src/backends/reference/test/RefCreateWorkloadTests.cpp
index 894dd75..13ac7fc 100644
--- a/src/backends/reference/test/RefCreateWorkloadTests.cpp
+++ b/src/backends/reference/test/RefCreateWorkloadTests.cpp
@@ -1314,4 +1314,52 @@
     RefCreateActivationWorkloadReplaceFunctionsTest<armnn::DataType::QAsymmU8>();
 }
 
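+// Checks a RefTensorHandle's TensorInfo against an expected TensorInfo,
+// comparing data type, rank and per-dimension shape.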
+bool TestRefTensorHandleInfo(armnn::RefTensorHandle* handle, const armnn::TensorInfo& expectedInfo)
+{
+    const TensorInfo& handleInfo = handle->GetTensorInfo();
+
+    if (handleInfo.GetDataType() != expectedInfo.GetDataType())
+    {
+        return false;
+    }
+
+    if (handleInfo.GetNumDimensions() != expectedInfo.GetNumDimensions())
+    {
+        return false;
+    }
+
+    for (unsigned int d = 0; d < expectedInfo.GetNumDimensions(); ++d)
+    {
+        if (handleInfo.GetShape()[d] != expectedInfo.GetShape()[d])
+        {
+            return false;
+        }
+    }
+
+    return true;
+}
+
+TEST_CASE("RefCreateSplitterWorkload")
+{
+    Graph graph;
+    RefWorkloadFactory factory = GetFactory();
+
+    auto workload = CreateSplitterWorkloadTest<RefSplitterWorkload, DataType::Float32>(factory, graph);
+
+    // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest).
+    SplitterQueueDescriptor queueDescriptor = workload->GetData();
+    auto inputHandle = PolymorphicDowncast<RefTensorHandle*>(queueDescriptor.m_Inputs[0]);
+    CHECK(TestRefTensorHandleInfo(inputHandle, TensorInfo({5, 7, 7}, DataType::Float32)));
+
+    auto outputHandle0 = PolymorphicDowncast<RefTensorHandle*>(queueDescriptor.m_Outputs[0]);
+    CHECK(TestRefTensorHandleInfo(outputHandle0, TensorInfo({1, 7, 7}, DataType::Float32)));
+
+    auto outputHandle1 = PolymorphicDowncast<RefTensorHandle*>(queueDescriptor.m_Outputs[1]);
+    CHECK(TestRefTensorHandleInfo(outputHandle1, TensorInfo({2, 7, 7}, DataType::Float32)));
+
+    auto outputHandle2 = PolymorphicDowncast<RefTensorHandle*>(queueDescriptor.m_Outputs[2]);
+    CHECK(TestRefTensorHandleInfo(outputHandle2, TensorInfo({2, 7, 7}, DataType::Float32)));
+}
+
 }
diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp
index 4bb3f29..eb2aabc 100644
--- a/src/backends/reference/test/RefEndToEndTests.cpp
+++ b/src/backends/reference/test/RefEndToEndTests.cpp
@@ -35,6 +35,7 @@
 #include <backendsCommon/test/SpaceToDepthEndToEndTestImpl.hpp>
 #include <backendsCommon/test/SplitterEndToEndTestImpl.hpp>
 #include <backendsCommon/test/StridedSliceAsyncEndToEndTest.hpp>
+#include <backendsCommon/test/SubgraphUtilsTest.hpp>
 #include <backendsCommon/test/TransposeConvolution2dEndToEndTestImpl.hpp>
 #include <backendsCommon/test/TransposeEndToEndTestImpl.hpp>
 
@@ -1618,6 +1619,22 @@
 {
     ElementwiseBinarySimpleEndToEnd<armnn::DataType::QAsymmU8>(defaultBackends, BinaryOperation::SqDiff);
 }
+
 #endif
 
+// Backend Optimization Tests
+TEST_CASE("RefReshapeRemovalSimpleCaseEndToEnd")
+{
+    ReshapeRemovalEndToEnd<armnn::DataType::Float32>(defaultBackends);
+}
+
+TEST_CASE("RefReshapeRemovalNCHWFirstEndToEnd")
+{
+    ReshapeRemovalNCHWEndToEnd<armnn::DataType::Float32>(defaultBackends, true, true);
+}
+
+TEST_CASE("RefReshapeRemovalNCHWSecondEndToEnd")
+{
+    ReshapeRemovalNCHWEndToEnd<armnn::DataType::Float32>(defaultBackends, true, false);
+}
 }