IVGCVSW-4375 Add support for Transpose to optimizations

 * Changed some existing Permutation-specific optimizations to also support Transpose
 * Added MoveTransposeUp optimization
 * Added TransposeAsReshape optimization
 * Added tests for Transpose optimizations
 * Added missing layer tests for Transpose
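
For reviewers, a minimal sketch of how the new Transpose optimizations can be
run against a Graph in isolation (this mirrors the new unit tests; the full
pass ordering used in Network.cpp is shown in the diff below):

    using namespace armnn;
    using namespace armnn::optimizations;

    Graph graph;
    // ... build a graph containing TransposeLayer(s) ...

    Optimizer::Pass(graph, MakeOptimizations(SquashEqualTransposeSiblings(),
                                             OptimizeInverseTransposes(),
                                             MoveTransposeUp(),
                                             TransposeAsReshape()));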

Signed-off-by: Mike Kelly <mike.kelly@arm.com>
Change-Id: I20d099b284861402ae94aaa5dbf34907327a485f
diff --git a/Android.mk b/Android.mk
index e423f25..e29f0f9 100644
--- a/Android.mk
+++ b/Android.mk
@@ -95,7 +95,6 @@
         src/armnn/NetworkUtils.cpp \
         src/armnn/Observable.cpp \
         src/armnn/Optimizer.cpp \
-        src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp \
         src/armnn/OutputHandler.cpp \
         src/armnn/ProfilingEvent.cpp \
         src/armnn/Profiling.cpp \
@@ -352,6 +351,7 @@
         src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp \
         src/armnn/test/optimizations/PermuteAsReshapeTests.cpp \
         src/armnn/test/optimizations/SquashEqualSiblingsTests.cpp \
+        src/armnn/test/optimizations/TransposeAsReshapeTests.cpp \
         src/armnn/test/OptimizerTests.cpp \
         src/armnn/test/OptionalTest.cpp \
         src/armnn/test/ProfilerTests.cpp \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9396316..f55f391 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -441,12 +441,12 @@
     src/armnn/optimizations/ConvertFp32NetworkToFp16.hpp
     src/armnn/optimizations/FoldPadIntoConvolution2d.hpp
     src/armnn/optimizations/MovePermuteUp.hpp
+    src/armnn/optimizations/MoveTransposeUp.hpp
     src/armnn/optimizations/Optimization.hpp
     src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp
     src/armnn/optimizations/OptimizeInverseConversions.hpp
     src/armnn/optimizations/OptimizeInversePermutes.hpp
     src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.hpp
-    src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp
     src/armnn/optimizations/PermuteAsReshape.hpp
     src/armnn/optimizations/SquashEqualSiblings.hpp
     src/profiling/BufferManager.cpp
@@ -619,12 +619,14 @@
         src/armnn/test/optimizations/Fp32NetworkToFp16ConverterTests.cpp
         src/armnn/test/optimizations/InsertDebugLayerTests.cpp
         src/armnn/test/optimizations/MovePermuteUpTests.cpp
+        src/armnn/test/optimizations/MoveTransposeUpTests.cpp
         src/armnn/test/optimizations/OptimizeConsecutiveReshapesTests.cpp
         src/armnn/test/optimizations/OptimizeInverseConversionsTests.cpp
         src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp
         src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp
         src/armnn/test/optimizations/PermuteAsReshapeTests.cpp
         src/armnn/test/optimizations/SquashEqualSiblingsTests.cpp
+        src/armnn/test/optimizations/TransposeAsReshapeTests.cpp
         src/armnn/test/OptionalTest.cpp
         src/armnn/test/ProfilerTests.cpp
         src/armnn/test/ProfilingEventTest.cpp
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 55bf51a..50a7df6 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -910,13 +910,18 @@
     // Perform optimisation passes
     using namespace optimizations;
     Optimizer::Pass(optGraph, MakeOptimizations(SquashEqualPermuteSiblings(),
+                                                SquashEqualTransposeSiblings(),
                                                 SquashEqualReshapeSiblings(),
                                                 OptimizeInversePermutes(),
+                                                OptimizeInverseTransposes(),
                                                 MovePermuteUp(),
+                                                MoveTransposeUp(),
                                                 PermuteAsReshape(),
+                                                TransposeAsReshape(),
                                                 OptimizeConsecutiveReshapes(),
                                                 FoldPadIntoConvolution2d(),
-                                                PermuteAndBatchToSpaceAsDepthToSpace()));
+                                                PermuteAndBatchToSpaceAsDepthToSpace(),
+                                                TransposeAndBatchToSpaceAsDepthToSpace()));
 
     // Infer the tensor infos for all output slots. Throws an exception on failure
     optGraph.InferTensorInfos();
diff --git a/src/armnn/optimizations/All.hpp b/src/armnn/optimizations/All.hpp
index 4ea3f7f..273c337 100644
--- a/src/armnn/optimizations/All.hpp
+++ b/src/armnn/optimizations/All.hpp
@@ -4,14 +4,16 @@
 //
 #pragma once
 
+#include "AddDebug.hpp"
 #include "ConvertConstants.hpp"
+#include "ConvertFp32NetworkToFp16.hpp"
+#include "FoldPadIntoConvolution2d.hpp"
+#include "MovePermuteUp.hpp"
+#include "MoveTransposeUp.hpp"
+#include "OptimizeConsecutiveReshapes.hpp"
+#include "OptimizeInverseConversions.hpp"
 #include "OptimizeInversePermutes.hpp"
 #include "PermuteAsReshape.hpp"
-#include "OptimizeConsecutiveReshapes.hpp"
+#include "PermuteAndBatchToSpaceAsDepthToSpace.hpp"
 #include "SquashEqualSiblings.hpp"
-#include "MovePermuteUp.hpp"
-#include "OptimizeInverseConversions.hpp"
-#include "ConvertFp32NetworkToFp16.hpp"
-#include "AddDebug.hpp"
-#include "FoldPadIntoConvolution2d.hpp"
-#include "PermuteAndBatchToSpaceAsDepthToSpace.hpp"
\ No newline at end of file
+#include "TransposeAsReshape.hpp"
\ No newline at end of file
diff --git a/src/armnn/optimizations/MoveTransposeUp.hpp b/src/armnn/optimizations/MoveTransposeUp.hpp
new file mode 100644
index 0000000..6654306
--- /dev/null
+++ b/src/armnn/optimizations/MoveTransposeUp.hpp
@@ -0,0 +1,83 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "Optimization.hpp"
+
+#include <armnnUtils/Transpose.hpp>
+
+namespace armnn
+{
+namespace optimizations
+{
+class MoveTransposeUpImpl
+{
+public:
+    /// Run for every connection between a base Layer (any) and a child TransposeLayer. If the type
+    /// of the base layer allows it, moves the transpose to the inputs of the base layer.
+    /// I.e., adds equivalent transposes before the inputs of the base layer and moves the
+    /// connections from the output of the child transpose layer to the output of the base layer.
+    void Run(Graph& graph, InputSlot& connection) const
+    {
+        OutputSlot& baseOutput = *connection.GetConnectedOutputSlot();
+
+        if (baseOutput.GetNumConnections() == 1U)
+        {
+            Layer& base = baseOutput.GetOwningLayer();
+
+            if (CanMoveTransposeToInputs(base))
+            {
+                auto transpose = boost::polymorphic_downcast<TransposeLayer*>(&connection.GetOwningLayer());
+                const PermutationVector& perm = transpose->GetPermutation();
+
+                // Inserts an equivalent transpose before every input of the base layer.
+                for (auto baseInput = base.BeginInputSlots(); baseInput != base.EndInputSlots(); ++baseInput)
+                {
+                    // Inserts a new transpose layer.
+                    const std::string name = std::string("moved_up-") + transpose->GetName();
+                    TransposeLayer& permLayer = *graph.InsertNewLayer<TransposeLayer>(*baseInput, perm, name.c_str());
+
+                    // Sets output tensor info for the new layer.
+                    OutputSlot& parentOutput = *permLayer.GetInputSlot(0).GetConnectedOutputSlot();
+                    const TensorInfo permOutInfo = armnnUtils::TransposeTensorShape(parentOutput.GetTensorInfo(), perm);
+                    permLayer.GetOutputHandler().SetTensorInfo(permOutInfo);
+                }
+
+                // Sets transposed output tensor info
+                const TensorInfo& childOutInfo = transpose->GetOutputHandler().GetTensorInfo();
+                base.GetOutputHandler().SetTensorInfo(childOutInfo);
+
+                // Bypasses transpose. It will be removed as it's left unconnected.
+                transpose->GetOutputSlot().MoveAllConnections(base.GetOutputSlot());
+            }
+        }
+    }
+
+protected:
+    MoveTransposeUpImpl() = default;
+    ~MoveTransposeUpImpl() = default;
+
+private:
+    static bool CanMoveTransposeToInputs(const Layer& base)
+    {
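+        // These layer types are data-layout agnostic (element-wise or pass-through), so a transpose
+        // on their output can equivalently be applied to each of their inputs instead.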
+        switch (base.GetType())
+        {
+            case LayerType::Activation:
+            case LayerType::Addition:
+            case LayerType::FakeQuantization:
+            case LayerType::Floor:
+            case LayerType::MemCopy:
+            case LayerType::Multiplication:
+                return true;
+            default:
+                return false;
+        }
+    }
+};
+
+using MoveTransposeUp = OptimizeForConnection<Layer, TransposeLayer, MoveTransposeUpImpl>;
+
+} // namespace optimizations
+} // namespace armnn
diff --git a/src/armnn/optimizations/OptimizeInversePermutes.hpp b/src/armnn/optimizations/OptimizeInversePermutes.hpp
index 48bfa35..77d62a5 100644
--- a/src/armnn/optimizations/OptimizeInversePermutes.hpp
+++ b/src/armnn/optimizations/OptimizeInversePermutes.hpp
@@ -13,6 +13,7 @@
 namespace optimizations
 {
 
+template <typename PermuteType>
 class OptimizeInversePermutesImpl
 {
 public:
@@ -22,9 +23,9 @@
     {
         boost::ignore_unused(graph);
         Layer& base = connection.GetConnectedOutputSlot()->GetOwningLayer();
-        auto child = boost::polymorphic_downcast<PermuteLayer*>(&connection.GetOwningLayer());
+        auto child = boost::polymorphic_downcast<PermuteType*>(&connection.GetOwningLayer());
 
-        if (child->IsInverse(*boost::polymorphic_downcast<PermuteLayer*>(&base)))
+        if (child->IsInverse(*boost::polymorphic_downcast<PermuteType*>(&base)))
         {
             // Bypass both layers. Child will be removed as it's left unconnected.
             // Base layer will be removed if left unconnected.
@@ -37,7 +38,10 @@
     ~OptimizeInversePermutesImpl() = default;
 };
 
-using OptimizeInversePermutes = OptimizeForConnection<PermuteLayer, PermuteLayer, OptimizeInversePermutesImpl>;
+using OptimizeInversePermutes = OptimizeForConnection<PermuteLayer, PermuteLayer,
+    OptimizeInversePermutesImpl<PermuteLayer>>;
+using OptimizeInverseTransposes = OptimizeForConnection<TransposeLayer, TransposeLayer,
+    OptimizeInversePermutesImpl<TransposeLayer>>;
 
 } // namespace optimizations
 } // namespace armnn
diff --git a/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp b/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp
deleted file mode 100644
index c42162b..0000000
--- a/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-//
-// Copyright © 2019 Arm Ltd. All rights reserved.
-// SPDX-License-Identifier: MIT
-//
-
-#include "PermuteAndBatchToSpaceAsDepthToSpace.hpp"
-
-using namespace armnn;
-using namespace armnn::optimizations;
-
-void PermuteAndBatchToSpaceAsDepthToSpaceImpl::Run(Graph& graph, InputSlot& connection) const
-{
-    // Validate base layer (the Permute) is compatible
-    Layer& base = connection.GetConnectedOutputSlot()->GetOwningLayer();
-    BOOST_ASSERT(base.GetType() == LayerType::Permute);
-    const TensorInfo& inputInfo        = base.GetInputSlot(0).GetConnection()->GetTensorInfo();
-    const TensorInfo& intermediateInfo = base.GetOutputSlot(0).GetTensorInfo();
-    if (intermediateInfo.GetNumDimensions() != 4)
-    {
-        // Must be 4D, otherwise the below checks do not make sense
-        return;
-    }
-    if (!static_cast<PermuteLayer&>(base).GetParameters().m_DimMappings.IsEqual(PermutationVector{ 3, 1, 2, 0 }))
-    {
-        // Must swap batch and channels dimensions, otherwise it is not the (original) channels dimension
-        // that is being decomposed.
-        return;
-    }
-
-    // Validate child layer (the BatchToSpace) is compatible
-    Layer& child = connection.GetOwningLayer();
-    BOOST_ASSERT(child.GetType() == LayerType::BatchToSpaceNd);
-    const TensorInfo& outputInfo                     = child.GetOutputSlot(0).GetTensorInfo();
-    const BatchToSpaceNdDescriptor& batchToSpaceDesc = static_cast<BatchToSpaceNdLayer&>(child).GetParameters();
-    if (batchToSpaceDesc.m_DataLayout != DataLayout::NHWC)
-    {
-        // The rest of this function assumes NHWC, although in future this restriction could be lifted.
-        return;
-    }
-    if (batchToSpaceDesc.m_Crops != std::vector<std::pair<unsigned int, unsigned int>>{ { 0, 0 }, { 0, 0 } })
-    {
-        // Cropping is not supported in DepthToSpace
-        return;
-    }
-    if (batchToSpaceDesc.m_BlockShape.size() != 2 ||
-        batchToSpaceDesc.m_BlockShape[0] != batchToSpaceDesc.m_BlockShape[1])
-    {
-        // Asymmetric or non-2D block sizes are not supported by DepthToSpace
-        return;
-    }
-    uint32_t blockSize = batchToSpaceDesc.m_BlockShape[0];
-    if (outputInfo.GetShape()[0] != 1 || outputInfo.GetShape()[3] != 1)
-    {
-        // The final output must have 1 batch and 1 channel because these dimensions will be swapped around
-        // once we make the substitution, and it needs to be equivalent.
-        return;
-    }
-
-    // Validate the intermediate tensor quantization params.
-    // These must be identical to either the input or output quantization params, otherwise the intermediate tensor
-    // may not have sufficient range/precision to preserve the values.
-    // This would mean that once we perform the substitution this loss of precision will no longer occur,
-    // so we would have changed the meaning of the network.
-    bool isIntermediateQuantParamsSameAsInput =
-        intermediateInfo.GetQuantizationScale() == inputInfo.GetQuantizationScale() &&
-        intermediateInfo.GetQuantizationOffset() == inputInfo.GetQuantizationOffset();
-    bool isIntermediateQuantParamsSameAsOutput =
-        intermediateInfo.GetQuantizationScale() == outputInfo.GetQuantizationScale() &&
-        intermediateInfo.GetQuantizationOffset() == outputInfo.GetQuantizationOffset();
-    if (!isIntermediateQuantParamsSameAsInput && !isIntermediateQuantParamsSameAsOutput)
-    {
-        return;
-    }
-
-    // Insert equivalent DepthToSpace layer
-    const std::string name = std::string("merged-") + base.GetName() + std::string("-with-") + child.GetName();
-
-    // Inserts equivalent reshape before base layer.
-    const DepthToSpaceDescriptor depthToSpaceDesc(blockSize, DataLayout::NHWC);
-    auto& depthToSpace = *graph.InsertNewLayer<DepthToSpaceLayer>(base.GetInputSlot(0), depthToSpaceDesc, name.c_str());
-    depthToSpace.GetOutputHandler().SetTensorInfo(outputInfo);
-
-    // Moves connections from child output to new layer.
-    // Child layer will be removed as it's left unconnected.
-    // Base layer will be removed if left unconnected.
-    child.GetOutputSlot().MoveAllConnections(depthToSpace.GetOutputSlot());
-}
diff --git a/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.hpp b/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.hpp
index 4a73efc..21aed86 100644
--- a/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.hpp
+++ b/src/armnn/optimizations/PermuteAndBatchToSpaceAsDepthToSpace.hpp
@@ -14,14 +14,94 @@
 /// Replaces Permute leading into BatchToSpace with a DepthToSpace
 /// in the case where the Permute swaps the batch and channels dimensions
 /// such that the replacement is valid.
+template <typename PermuteType>
 class PermuteAndBatchToSpaceAsDepthToSpaceImpl
 {
 public:
-    void Run(Graph& graph, InputSlot& connection) const;
+    void Run(Graph& graph, InputSlot& connection) const
+    {
+        // Validate base layer (the Permute or Transpose) is compatible
+        Layer& base = connection.GetConnectedOutputSlot()->GetOwningLayer();
+        BOOST_ASSERT(base.GetType() == LayerType::Permute || base.GetType() == LayerType::Transpose);
+        const TensorInfo& inputInfo = base.GetInputSlot(0).GetConnection()->GetTensorInfo();
+        const TensorInfo& intermediateInfo = base.GetOutputSlot(0).GetTensorInfo();
+        if (intermediateInfo.GetNumDimensions() != 4)
+        {
+            // Must be 4D, otherwise the below checks do not make sense
+            return;
+        }
+        if (!static_cast<PermuteType&>(base).GetParameters().m_DimMappings.IsEqual(PermutationVector{ 3, 1, 2, 0 }))
+        {
+            // Must swap batch and channels dimensions, otherwise it is not the (original) channels dimension
+            // that is being decomposed.
+            return;
+        }
+
+        // Validate child layer (the BatchToSpace) is compatible
+        Layer& child = connection.GetOwningLayer();
+        BOOST_ASSERT(child.GetType() == LayerType::BatchToSpaceNd);
+        const TensorInfo& outputInfo = child.GetOutputSlot(0).GetTensorInfo();
+        const BatchToSpaceNdDescriptor& batchToSpaceDesc = static_cast<BatchToSpaceNdLayer&>(child).GetParameters();
+        if (batchToSpaceDesc.m_DataLayout != DataLayout::NHWC)
+        {
+            // The rest of this function assumes NHWC, although in future this restriction could be lifted.
+            return;
+        }
+        if (batchToSpaceDesc.m_Crops != std::vector<std::pair<unsigned int, unsigned int>>{ { 0, 0 }, { 0, 0 } })
+        {
+            // Cropping is not supported in DepthToSpace
+            return;
+        }
+        if (batchToSpaceDesc.m_BlockShape.size() != 2 ||
+            batchToSpaceDesc.m_BlockShape[0] != batchToSpaceDesc.m_BlockShape[1])
+        {
+            // Asymmetric or non-2D block sizes are not supported by DepthToSpace
+            return;
+        }
+        uint32_t blockSize = batchToSpaceDesc.m_BlockShape[0];
+        if (outputInfo.GetShape()[0] != 1 || outputInfo.GetShape()[3] != 1)
+        {
+            // The final output must have 1 batch and 1 channel because these dimensions will be swapped around
+            // once we make the substitution, and it needs to be equivalent.
+            return;
+        }
+
+        // Validate the intermediate tensor quantization params.
+        // These must be identical to either the input or output quantization params, otherwise the intermediate tensor
+        // may not have sufficient range/precision to preserve the values.
+        // This would mean that once we perform the substitution this loss of precision will no longer occur,
+        // so we would have changed the meaning of the network.
+        bool isIntermediateQuantParamsSameAsInput =
+                intermediateInfo.GetQuantizationScale() == inputInfo.GetQuantizationScale() &&
+                intermediateInfo.GetQuantizationOffset() == inputInfo.GetQuantizationOffset();
+        bool isIntermediateQuantParamsSameAsOutput =
+                intermediateInfo.GetQuantizationScale() == outputInfo.GetQuantizationScale() &&
+                intermediateInfo.GetQuantizationOffset() == outputInfo.GetQuantizationOffset();
+        if (!isIntermediateQuantParamsSameAsInput && !isIntermediateQuantParamsSameAsOutput)
+        {
+            return;
+        }
+
+        // Insert equivalent DepthToSpace layer
+        const std::string name = std::string("merged-") + base.GetName() + std::string("-with-") + child.GetName();
+
+        // Inserts an equivalent DepthToSpace layer before the base layer.
+        const DepthToSpaceDescriptor depthToSpaceDesc(blockSize, DataLayout::NHWC);
+        auto& depthToSpace = *graph.InsertNewLayer<DepthToSpaceLayer>(base.GetInputSlot(0),
+                                                                      depthToSpaceDesc,
+                                                                      name.c_str());
+        depthToSpace.GetOutputHandler().SetTensorInfo(outputInfo);
+
+        // Moves connections from child output to new layer.
+        // Child layer will be removed as it's left unconnected.
+        // Base layer will be removed if left unconnected.
+        child.GetOutputSlot().MoveAllConnections(depthToSpace.GetOutputSlot());
+    }
 };
 
-using PermuteAndBatchToSpaceAsDepthToSpace =
-    OptimizeForConnection<PermuteLayer, BatchToSpaceNdLayer, PermuteAndBatchToSpaceAsDepthToSpaceImpl>;
-
+using PermuteAndBatchToSpaceAsDepthToSpace = OptimizeForConnection<PermuteLayer, BatchToSpaceNdLayer,
+    PermuteAndBatchToSpaceAsDepthToSpaceImpl<PermuteLayer>>;
+using TransposeAndBatchToSpaceAsDepthToSpace = OptimizeForConnection<TransposeLayer, BatchToSpaceNdLayer,
+    PermuteAndBatchToSpaceAsDepthToSpaceImpl<TransposeLayer>>;
 }    // namespace optimizations
 }    // namespace armnn
diff --git a/src/armnn/optimizations/SquashEqualSiblings.hpp b/src/armnn/optimizations/SquashEqualSiblings.hpp
index 12637ba..d5a8a5d 100644
--- a/src/armnn/optimizations/SquashEqualSiblings.hpp
+++ b/src/armnn/optimizations/SquashEqualSiblings.hpp
@@ -64,6 +64,8 @@
 };
 
 using SquashEqualPermuteSiblings = OptimizeForConnection<Layer, PermuteLayer, SquashEqualSiblingsImpl<PermuteLayer>>;
+using SquashEqualTransposeSiblings = OptimizeForConnection<Layer, TransposeLayer,
+    SquashEqualSiblingsImpl<TransposeLayer>>;
 using SquashEqualReshapeSiblings = OptimizeForConnection<Layer, ReshapeLayer, SquashEqualSiblingsImpl<ReshapeLayer>>;
 
 } // namespace optimizations
diff --git a/src/armnn/optimizations/TransposeAsReshape.hpp b/src/armnn/optimizations/TransposeAsReshape.hpp
new file mode 100644
index 0000000..4bb2f19
--- /dev/null
+++ b/src/armnn/optimizations/TransposeAsReshape.hpp
@@ -0,0 +1,81 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "Optimization.hpp"
+
+namespace armnn
+{
+namespace optimizations
+{
+
+class TransposeAsReshapeImpl
+{
+public:
+    /// Run for every TransposeLayer. Replaces it with a ReshapeLayer if they are equivalent.
+    void Run(Graph& graph, TransposeLayer& transpose) const
+    {
+        if (IsReshape(transpose))
+        {
+            const TensorInfo& outInfo = transpose.GetOutputHandler().GetTensorInfo();
+
+            const std::string name = std::string("as_reshape-") + transpose.GetName();
+            const ReshapeDescriptor descriptor{outInfo.GetShape()};
+            // Uses InsertNewLayer so the layers don't need to be re-sorted.
+            auto reshape = graph.InsertNewLayer<ReshapeLayer>(transpose.GetInputSlot(0), descriptor, name.c_str());
+            reshape->GetOutputHandler().SetTensorInfo(outInfo);
+
+            // Bypass transpose. It will be deleted since it's left unconnected.
+            transpose.GetOutputSlot().MoveAllConnections(reshape->GetOutputSlot());
+        }
+    }
+
+protected:
+    TransposeAsReshapeImpl() = default;
+    ~TransposeAsReshapeImpl() = default;
+
+private:
+    static bool IsReshape(const TransposeLayer& layer)
+    {
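+        // A transpose only amounts to a reshape if the non-singleton dimensions keep their relative
+        // order; dimensions of size 1 can be moved freely without reordering the underlying data.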
+        const TensorShape& outShape = layer.GetOutputHandler().GetTensorInfo().GetShape();
+        const PermutationVector& permutation = layer.GetPermutation();
+
+        const unsigned int numDimensions = permutation.GetSize();
+        std::map<unsigned int, unsigned int> permuteMappings;
+        for (unsigned int i = 0; i < permutation.GetSize(); ++i)
+        {
+            permuteMappings[permutation[i]] = i;
+        }
+
+        std::vector<unsigned int> permuteVector;
+        for (unsigned int i = 0; i < permutation.GetSize(); ++i)
+        {
+            permuteVector.push_back(permuteMappings.at(i));
+        }
+
+        unsigned int lastGtOne = 0;
+        while ((lastGtOne < numDimensions) && (outShape[(permuteVector[lastGtOne])] == 1U))
+        {
+            ++lastGtOne;
+        }
+
+        bool isReshape = true;
+        for (unsigned int i = lastGtOne + 1U; isReshape && (i < numDimensions); ++i)
+        {
+            if (outShape[permuteVector[i]] > 1U)
+            {
+                isReshape = permuteVector[lastGtOne] < permuteVector[i];
+                lastGtOne = i;
+            }
+        }
+
+        return isReshape;
+    }
+};
+
+using TransposeAsReshape = OptimizeForType<TransposeLayer, TransposeAsReshapeImpl>;
+
+} // namespace optimizations
+} // namespace armnn
diff --git a/src/armnn/test/TestNameAndDescriptorLayerVisitor.cpp b/src/armnn/test/TestNameAndDescriptorLayerVisitor.cpp
index efe50a5..431db2a 100644
--- a/src/armnn/test/TestNameAndDescriptorLayerVisitor.cpp
+++ b/src/armnn/test/TestNameAndDescriptorLayerVisitor.cpp
@@ -247,6 +247,12 @@
     return descriptor;
 }
 
+template<>
+armnn::TransposeDescriptor GetDescriptor<armnn::TransposeDescriptor>()
+{
+    return armnn::TransposeDescriptor({ 0, 1, 2, 3 });
+}
+
 } // anonymous namespace
 
 BOOST_AUTO_TEST_SUITE(TestNameAndDescriptorLayerVisitor)
@@ -275,5 +281,6 @@
 TEST_SUITE_NAME_AND_DESCRIPTOR_LAYER_VISITOR(Splitter)
 TEST_SUITE_NAME_AND_DESCRIPTOR_LAYER_VISITOR(Stack)
 TEST_SUITE_NAME_AND_DESCRIPTOR_LAYER_VISITOR(StridedSlice)
+TEST_SUITE_NAME_AND_DESCRIPTOR_LAYER_VISITOR(Transpose)
 
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/armnn/test/TestNameAndDescriptorLayerVisitor.hpp b/src/armnn/test/TestNameAndDescriptorLayerVisitor.hpp
index f792bc3..b9877a8 100644
--- a/src/armnn/test/TestNameAndDescriptorLayerVisitor.hpp
+++ b/src/armnn/test/TestNameAndDescriptorLayerVisitor.hpp
@@ -67,3 +67,4 @@
 DECLARE_TEST_NAME_AND_DESCRIPTOR_LAYER_VISITOR_CLASS(Stack)
 DECLARE_TEST_NAME_AND_DESCRIPTOR_LAYER_VISITOR_CLASS(StandIn)
 DECLARE_TEST_NAME_AND_DESCRIPTOR_LAYER_VISITOR_CLASS(StridedSlice)
+DECLARE_TEST_NAME_AND_DESCRIPTOR_LAYER_VISITOR_CLASS(Transpose)
diff --git a/src/armnn/test/optimizations/MoveTransposeUpTests.cpp b/src/armnn/test/optimizations/MoveTransposeUpTests.cpp
new file mode 100644
index 0000000..e2fb3ab
--- /dev/null
+++ b/src/armnn/test/optimizations/MoveTransposeUpTests.cpp
@@ -0,0 +1,93 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "../TestUtils.hpp"
+
+#include <Optimizer.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(Optimizer)
+using namespace armnn::optimizations;
+
+BOOST_AUTO_TEST_CASE(MoveTransposeUpTest)
+{
+    const armnn::TensorInfo info({ 1, 5, 2, 3 }, armnn::DataType::Float32);
+    const armnn::TensorInfo transposed({ 1, 3, 5, 2 }, armnn::DataType::Float32);
+
+    armnn::Graph graph;
+
+    armnn::LayerBindingId inputId = 0;
+
+    armnn::Layer* head = graph.AddLayer<armnn::OutputLayer>(0, "output");
+
+    std::string transposeLayerName = "original_transpose";
+
+    // Insert transpose
+    head = graph.InsertNewLayer<armnn::TransposeLayer>(head->GetInputSlot(0),
+                                                       armnn::TransposeDescriptor({ 0, 3, 1, 2 }),
+                                                       transposeLayerName.c_str());
+
+    head->GetOutputHandler().SetTensorInfo(transposed);
+
+    // Inserts layers that don't care about data format.
+    head = graph.InsertNewLayer<armnn::ActivationLayer>(head->GetInputSlot(0), armnn::ActivationDescriptor{}, "");
+    head->GetOutputHandler().SetTensorInfo(info);
+
+    head = graph.InsertNewLayer<armnn::AdditionLayer>(head->GetInputSlot(0), "");
+    head->GetOutputHandler().SetTensorInfo(info);
+
+    // Inserts input for 2nd input of Addition.
+    graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(1), inputId++, "")
+        ->GetOutputHandler()
+        .SetTensorInfo(info);
+
+    head = graph.InsertNewLayer<armnn::FakeQuantizationLayer>(head->GetInputSlot(0),
+                                                              armnn::FakeQuantizationDescriptor{}, "");
+    head->GetOutputHandler().SetTensorInfo(info);
+
+    head = graph.InsertNewLayer<armnn::FloorLayer>(head->GetInputSlot(0), "");
+    head->GetOutputHandler().SetTensorInfo(info);
+
+    head = graph.InsertNewLayer<armnn::MemCopyLayer>(head->GetInputSlot(0), "");
+    head->GetOutputHandler().SetTensorInfo(info);
+
+    head = graph.InsertNewLayer<armnn::MultiplicationLayer>(head->GetInputSlot(0), "");
+    head->GetOutputHandler().SetTensorInfo(info);
+
+    // Inserts input for 2nd input of Multiplication.
+    graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(1), inputId++, "")
+        ->GetOutputHandler()
+        .SetTensorInfo(info);
+
+    // Inserts input.
+    graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(0), inputId++, "")
+        ->GetOutputHandler()
+        .SetTensorInfo(info);
+
+    BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::InputLayer>, &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::MultiplicationLayer>, &IsLayerOfType<armnn::MemCopyLayer>,
+                             &IsLayerOfType<armnn::FloorLayer>, &IsLayerOfType<armnn::FakeQuantizationLayer>,
+                             &IsLayerOfType<armnn::AdditionLayer>, &IsLayerOfType<armnn::ActivationLayer>,
+                             &IsLayerOfType<armnn::TransposeLayer>, &IsLayerOfType<armnn::OutputLayer>));
+
+    armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(MoveTransposeUp()));
+
+    // The transpose is moved to the top. New transposes are inserted for layers with multiple inputs.
+    BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::InputLayer>, &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::TransposeLayer>, &IsLayerOfType<armnn::TransposeLayer>,
+                             &IsLayerOfType<armnn::TransposeLayer>, &IsLayerOfType<armnn::MultiplicationLayer>,
+                             &IsLayerOfType<armnn::MemCopyLayer>, &IsLayerOfType<armnn::FloorLayer>,
+                             &IsLayerOfType<armnn::FakeQuantizationLayer>, &IsLayerOfType<armnn::AdditionLayer>,
+                             &IsLayerOfType<armnn::ActivationLayer>, &IsLayerOfType<armnn::OutputLayer>));
+
+    std::list<std::string> testRelatedLayers = { transposeLayerName };
+
+    BOOST_TEST(CheckRelatedLayers<armnn::TransposeLayer>(graph, testRelatedLayers));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
diff --git a/src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp b/src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp
index dcf9559..21f791c 100644
--- a/src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp
+++ b/src/armnn/test/optimizations/OptimizeInversePermutesTests.cpp
@@ -39,4 +39,31 @@
                              &IsLayerOfType<armnn::OutputLayer>));
 }
 
+BOOST_AUTO_TEST_CASE(OptimizeInverseTransposesTest)
+{
+    armnn::Graph graph;
+
+    auto output = graph.AddLayer<armnn::OutputLayer>(0, "output");
+
+    graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input");
+
+    // Inserts two transposes, one the inverse of the other.
+    graph.InsertNewLayer<armnn::TransposeLayer>(output->GetInputSlot(0),
+                                                armnn::TransposeDescriptor({ 0, 3, 1, 2 }),
+                                                "transpose0312");
+    graph.InsertNewLayer<armnn::TransposeLayer>(output->GetInputSlot(0),
+                                                armnn::TransposeDescriptor({ 0, 2, 3, 1 }),
+                                                "transpose0231");
+
+    BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::TransposeLayer>, &IsLayerOfType<armnn::TransposeLayer>,
+                             &IsLayerOfType<armnn::OutputLayer>));
+
+    armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(OptimizeInverseTransposes()));
+
+    // The transposes are removed.
+    BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::OutputLayer>));
+}
+
 BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
diff --git a/src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp b/src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp
index 74ee18b..c2180a6 100644
--- a/src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp
+++ b/src/armnn/test/optimizations/PermuteAndBatchToSpaceAsDepthToSpaceTests.cpp
@@ -49,6 +49,37 @@
     return network;
 }
 
+/// Shared function for the below tests, so that we test the same network in both cases.
+INetworkPtr CreateTransposeTestNetwork()
+{
+    // Create a network
+    INetworkPtr network = INetwork::Create();
+
+    auto input = network->AddInputLayer(0, "input");
+    const TensorInfo inputInfo({ 1, 2, 3, 4 }, DataType::Float32);
+    input->GetOutputSlot(0).SetTensorInfo(inputInfo);
+
+    // Insert Transpose which swaps the batch and channels dimensions
+    auto permute = network->AddTransposeLayer(TransposeDescriptor(PermutationVector{ 3, 1, 2, 0 }), "permute");
+    const TensorInfo permuteInfo({ 4, 2, 3, 1 }, DataType::Float32);
+    permute->GetOutputSlot(0).SetTensorInfo(permuteInfo);
+    input->GetOutputSlot(0).Connect(permute->GetInputSlot(0));
+
+    // Insert BatchToSpace
+    BatchToSpaceNdDescriptor batchToSpaceDesc;
+    batchToSpaceDesc.m_BlockShape = { 2, 2 };
+    batchToSpaceDesc.m_DataLayout = DataLayout::NHWC;
+    auto batchToSpace             = network->AddBatchToSpaceNdLayer(batchToSpaceDesc, "batchToSpace");
+    const TensorInfo batchToSpaceInfo({ 1, 4, 6, 1 }, DataType::Float32);
+    batchToSpace->GetOutputSlot(0).SetTensorInfo(batchToSpaceInfo);
+    permute->GetOutputSlot(0).Connect(batchToSpace->GetInputSlot(0));
+
+    auto output = network->AddOutputLayer(0, "output");
+    batchToSpace->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+
+    return network;
+}
+
 }    // namespace
 
 /// Tests that the optimization performed by PermuteAndBatchToSpaceAsDepthToSpace is as expected.
@@ -81,6 +112,36 @@
     BOOST_TEST(CheckRelatedLayers<DepthToSpaceLayer>(graph, testRelatedLayers));
 }
 
+/// Tests that the optimization performed by TransposeAndBatchToSpaceAsDepthToSpace is as expected.
+/// Note this does not ensure the correctness of the optimization - that is done in the below test.
+BOOST_AUTO_TEST_CASE(TransposeAndBatchToSpaceAsDepthToSpaceOptimizerTest)
+{
+    INetworkPtr network = CreateTransposeTestNetwork();
+    Graph graph         = static_cast<Network*>(network.get())->GetGraph();
+
+    // Confirm initial graph is as we expect
+    BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<InputLayer>, &IsLayerOfType<TransposeLayer>,
+                             &IsLayerOfType<BatchToSpaceNdLayer>, &IsLayerOfType<OutputLayer>));
+
+    // Perform the optimization which should merge the two layers into a DepthToSpace
+    armnn::Optimizer::Pass(graph, MakeOptimizations(TransposeAndBatchToSpaceAsDepthToSpace()));
+
+    // Check that the replacement has been made as expected
+    auto checkDepthToSpace = [](const Layer* const layer) -> bool {
+        return IsLayerOfType<DepthToSpaceLayer>(layer) &&
+               static_cast<const DepthToSpaceLayer*>(layer)->GetParameters().m_BlockSize == 2 &&
+               static_cast<const DepthToSpaceLayer*>(layer)->GetParameters().m_DataLayout == DataLayout::NHWC &&
+               layer->GetOutputHandler().GetTensorInfo() == TensorInfo({ 1, 4, 6, 1 }, DataType::Float32);
+    };
+
+    BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<InputLayer>, checkDepthToSpace,
+                             &IsLayerOfType<OutputLayer>));
+
+    // Check the new layer has the two merged layers listed as related layers
+    std::list<std::string> testRelatedLayers = { "batchToSpace", "permute" };
+    BOOST_TEST(CheckRelatedLayers<DepthToSpaceLayer>(graph, testRelatedLayers));
+}
+
 // This unit test needs the reference backend, it's not available if the reference backend is not built
 #if defined(ARMNNREF_ENABLED)
 
@@ -130,6 +191,53 @@
     };
     BOOST_TEST(outputData == expectedOutput);
 }
+
+/// Tests that the optimization performed by TransposeAndBatchToSpaceAsDepthToSpace does not change the behaviour
+/// of the network (i.e. it still produces the correct output).
+BOOST_AUTO_TEST_CASE(TransposeAndBatchToSpaceAsDepthToSpaceCorrectnessTest)
+{
+    INetworkPtr network = CreateTransposeTestNetwork();
+
+    IRuntimePtr runtime = IRuntime::Create(IRuntime::CreationOptions());
+    IOptimizedNetworkPtr optimizedNetwork = Optimize(*network, { Compute::CpuRef }, runtime->GetDeviceSpec());
+
+    // Confirm that the optimization has actually taken place
+    const Graph& optGraph = static_cast<OptimizedNetwork*>(optimizedNetwork.get())->GetGraph();
+    BOOST_TEST(CheckSequence(optGraph.cbegin(), optGraph.cend(), &IsLayerOfType<InputLayer>,
+                             &IsLayerOfType<DepthToSpaceLayer>, &IsLayerOfType<OutputLayer>));
+
+    // Load the graph into a runtime so we can check it produces the correct output
+    NetworkId netId;
+    runtime->LoadNetwork(netId, std::move(optimizedNetwork));
+
+    std::vector<float> inputData{
+            // Each row here is a row of pixels where each pixel has 4 channels
+            // clang-format off
+            1.0f,  2.0f,  3.0f,  4.0f,      10.0f,  20.0f,  30.0f,  40.0f,      100.0f,  200.0f,  300.0f,  400.0f,
+            -1.0f, -2.0f, -3.0f, -4.0f,    -10.0f, -20.0f, -30.0f, -40.0f,     -100.0f, -200.0f, -300.0f, -400.0f,
+            // clang-format on
+    };
+    ConstTensor input(TensorInfo({ 1, 2, 3, 4 }, DataType::Float32), inputData);
+    InputTensors inputs = { { 0, input } };
+    std::vector<float> outputData(4 * 6);
+    Tensor output(TensorInfo({ 1, 4, 6, 1 }, DataType::Float32), outputData.data());
+    OutputTensors outputs = { { 0, output } };
+    runtime->EnqueueWorkload(netId, inputs, outputs);
+
+    // Check the output is as expected.
+    // Note this output has been generated by running the network *without* the optimization.
+    std::vector<float> expectedOutput = {
+            // Rows and columns here match exactly with the tensor, as there is only 1 channel.
+            // clang-format off
+            1.0f,  2.0f,     10.0f,  20.0f,     100.0f,  200.0f,
+            3.0f,  4.0f,     30.0f,  40.0f,     300.0f,  400.0f,
+
+            -1.0f, -2.0f,   -10.0f, -20.0f,    -100.0f, -200.0f,
+            -3.0f, -4.0f,   -30.0f, -40.0f,    -300.0f, -400.0f,
+            // clang-format on
+    };
+    BOOST_TEST(outputData == expectedOutput);
+}
 #endif
 
 BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
diff --git a/src/armnn/test/optimizations/TransposeAsReshapeTests.cpp b/src/armnn/test/optimizations/TransposeAsReshapeTests.cpp
new file mode 100644
index 0000000..3c6ed6e
--- /dev/null
+++ b/src/armnn/test/optimizations/TransposeAsReshapeTests.cpp
@@ -0,0 +1,60 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "../TestUtils.hpp"
+
+#include <Optimizer.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+using namespace armnn;
+
+BOOST_AUTO_TEST_SUITE(Optimizer)
+using namespace armnn::optimizations;
+
+BOOST_AUTO_TEST_CASE(TransposeAsReshapeTest)
+{
+    armnn::Graph graph;
+
+    std::string transposeLayerName = "transpose";
+
+    const armnn::TensorInfo infoIn({ 1, 2, 3, 1 }, armnn::DataType::Float32);
+    const armnn::TensorInfo infoOut({ 1, 1, 2, 3 }, armnn::DataType::Float32);
+
+    auto output = graph.AddLayer<armnn::OutputLayer>(0, "output");
+
+    graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input")
+            ->GetOutputHandler()
+            .SetTensorInfo(infoIn);
+
+    // Inserts transpose.
+    graph
+            .InsertNewLayer<armnn::TransposeLayer>(output->GetInputSlot(0), armnn::TransposeDescriptor({ 0, 3, 1, 2 }),
+                                                   transposeLayerName.c_str())
+            ->GetOutputHandler()
+            .SetTensorInfo(infoOut);
+
+    BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>,
+                             &IsLayerOfType<armnn::TransposeLayer>, &IsLayerOfType<armnn::OutputLayer>));
+
+    armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(TransposeAsReshape()));
+
+    // The transpose is replaced by an equivalent reshape.
+
+    auto checkReshape = [&infoOut](const armnn::Layer* const layer) -> bool {
+        const auto reshapeLayer = static_cast<const armnn::ReshapeLayer*>(layer);
+        return IsLayerOfType<armnn::ReshapeLayer>(layer) &&
+               (reshapeLayer->GetParameters().m_TargetShape == infoOut.GetShape()) &&
+               (reshapeLayer->GetOutputHandler().GetTensorInfo().GetShape() == infoOut.GetShape());
+    };
+
+    BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>, checkReshape,
+                             &IsLayerOfType<armnn::OutputLayer>));
+
+    std::list<std::string> testRelatedLayers = { transposeLayerName };
+    BOOST_TEST(CheckRelatedLayers<armnn::ReshapeLayer>(graph, testRelatedLayers));
+}
+
+BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file