IVGCVSW-4375 Add support for Transpose

 * Added TransposeLayer
 * Added CL, Neon and Ref Workloads
 * Added Transpose utilities
 * Added Serializer and Deserializer support
 * Added Quantizer support

Signed-off-by: Mike Kelly <mike.kelly@arm.com>
Change-Id: I04c755ba7cb5b1edf72b3c9f3c0314878032e3c7
diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.cpp b/src/backends/aclCommon/ArmComputeTensorUtils.cpp
index 49fef5b..84091e8 100644
--- a/src/backends/aclCommon/ArmComputeTensorUtils.cpp
+++ b/src/backends/aclCommon/ArmComputeTensorUtils.cpp
@@ -214,7 +214,34 @@
     {
         aclPerm.set(i - start, perm[i] - start);
     }
+    return aclPerm;
+}
 
+arm_compute::PermutationVector BuildArmComputeTransposeVector(const armnn::PermutationVector& perm)
+{
+    arm_compute::PermutationVector aclPerm;
+    std::map<unsigned int, unsigned int> permuteMappings;
+    for (unsigned int i = 0; i < perm.GetSize(); ++i)
+    {
+        permuteMappings[perm[i]] = i;
+    }
+
+    std::vector<unsigned int> permuteVector;
+    for (unsigned int i = 0; i < perm.GetSize(); ++i)
+    {
+        permuteVector.push_back(permuteMappings.at(i));
+    }
+
+    unsigned int start = 0;
+    while ((start < perm.GetSize()) && (start == permuteVector[start]))
+    {
+        ++start;
+    }
+
+    for (unsigned int i = start; i < perm.GetSize(); ++i)
+    {
+        aclPerm.set(i - start, permuteVector[i] - start);
+    }
     return aclPerm;
 }
 
diff --git a/src/backends/aclCommon/ArmComputeTensorUtils.hpp b/src/backends/aclCommon/ArmComputeTensorUtils.hpp
index b4ff0f7..9b236e1 100644
--- a/src/backends/aclCommon/ArmComputeTensorUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeTensorUtils.hpp
@@ -60,6 +60,9 @@
 /// Utility function used to setup an arm_compute::PermutationVector object from an armnn::PermutationVector.
 arm_compute::PermutationVector BuildArmComputePermutationVector(const armnn::PermutationVector& vector);
 
+/// Utility function used to setup an arm_compute::PermutationVector object from an armnn::PermutationVector.
+arm_compute::PermutationVector BuildArmComputeTransposeVector(const armnn::PermutationVector& vector);
+
 /// Utility function used to setup an arm_compute::Size2D object from width and height values.
 arm_compute::Size2D BuildArmComputeSize2D(const unsigned int width, const unsigned int height);
 
diff --git a/src/backends/backendsCommon/LayerSupportBase.cpp b/src/backends/backendsCommon/LayerSupportBase.cpp
index 449b809..1279134 100644
--- a/src/backends/backendsCommon/LayerSupportBase.cpp
+++ b/src/backends/backendsCommon/LayerSupportBase.cpp
@@ -579,4 +579,12 @@
     return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
 }
 
+bool LayerSupportBase::IsTransposeSupported(const TensorInfo& /*input*/,
+                                            const TensorInfo& /*output*/,
+                                            const TransposeDescriptor& /*descriptor*/,
+                                            Optional<std::string&> reasonIfUnsupported) const
+{
+    return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
+}
+
 } // namespace armnn
diff --git a/src/backends/backendsCommon/LayerSupportBase.hpp b/src/backends/backendsCommon/LayerSupportBase.hpp
index 459ac03..888bef5 100644
--- a/src/backends/backendsCommon/LayerSupportBase.hpp
+++ b/src/backends/backendsCommon/LayerSupportBase.hpp
@@ -353,6 +353,12 @@
         const TensorInfo& weights,
         const Optional<TensorInfo>& biases,
         Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
+    bool IsTransposeSupported(const TensorInfo& input,
+                              const TensorInfo& output,
+                              const TransposeDescriptor& descriptor,
+                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
 };
 
 } // namespace armnn
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 410469e..9b7a242 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -2680,6 +2680,35 @@
     ValidateTensorDataTypesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
 }
 
+void TransposeQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+    const std::string descriptorName{"TransposeQueueDescriptor"};
+
+    ValidateNumInputs(workloadInfo,  descriptorName, 1);
+    ValidateNumOutputs(workloadInfo, descriptorName, 1);
+
+    const PermutationVector& mapping = m_Parameters.m_DimMappings;
+
+    const TensorInfo& inputTensorInfo  = workloadInfo.m_InputTensorInfos[0];
+    const TensorInfo& outputTensorInfo = workloadInfo.m_OutputTensorInfos[0];
+
+    ValidateTensorNumDimensions(inputTensorInfo,  descriptorName, mapping.GetSize(), "input");
+    ValidateTensorNumDimensions(outputTensorInfo, descriptorName, mapping.GetSize(), "output");
+
+    for (unsigned int i = 0u; i < mapping.GetSize(); ++i)
+    {
+        if (inputTensorInfo.GetShape()[mapping[i]] != outputTensorInfo.GetShape()[i])
+        {
+            throw InvalidArgumentException(descriptorName + ": src dimension " + to_string(mapping[i]) +
+                                           " (=" + to_string(inputTensorInfo.GetShape()[mapping[i]]) + ") " +
+                                           "must match dst dimension " + to_string(i) +
+                                           " (=" + to_string(outputTensorInfo.GetShape()[i]) + ")");
+        }
+    }
+
+    ValidateTensorDataTypesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
+}
+
 void QuantizedLstmQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
 {
     const std::string descriptorName{"QuantizedLstmQueueDescriptor"};
diff --git a/src/backends/backendsCommon/WorkloadData.hpp b/src/backends/backendsCommon/WorkloadData.hpp
index 46681e9..06289fa 100644
--- a/src/backends/backendsCommon/WorkloadData.hpp
+++ b/src/backends/backendsCommon/WorkloadData.hpp
@@ -504,6 +504,11 @@
     void Validate(const WorkloadInfo& workloadInfo) const;
 };
 
+struct TransposeQueueDescriptor : QueueDescriptorWithParameters<TransposeDescriptor>
+{
+    void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
 struct QuantizedLstmQueueDescriptor : QueueDescriptor
 {
     QuantizedLstmQueueDescriptor()
diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp
index 23ff70a..6ac76ec 100644
--- a/src/backends/backendsCommon/WorkloadFactory.cpp
+++ b/src/backends/backendsCommon/WorkloadFactory.cpp
@@ -1023,6 +1023,17 @@
                                                           reason);
             break;
         }
+        case LayerType::Transpose:
+        {
+            auto cLayer = boost::polymorphic_downcast<const TransposeLayer*>(&layer);
+            const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
+            const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+            result = layerSupportObject->IsTransposeSupported(OverrideDataType(input, dataType),
+                                                              OverrideDataType(output, dataType),
+                                                              cLayer->GetParameters(),
+                                                              reason);
+            break;
+        }
         case LayerType::TransposeConvolution2d:
         {
             auto cLayer = boost::polymorphic_downcast<const TransposeConvolution2dLayer*>(&layer);
@@ -1315,7 +1326,7 @@
 }
 
 std::unique_ptr<IWorkload> IWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& /*descriptor*/,
-                                                           const WorkloadInfo&/**/ /*info*/) const
+                                                           const WorkloadInfo& /*info*/) const
 {
     return std::unique_ptr<IWorkload>();
 }
@@ -1379,7 +1390,7 @@
 {
     return std::unique_ptr<IWorkload>();
 }
-/**/
+
 std::unique_ptr<IWorkload> IWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& /*descriptor*/,
                                                            const WorkloadInfo& /*info*/) const
 {
@@ -1428,6 +1439,12 @@
     return std::unique_ptr<IWorkload>();
 }
 
+std::unique_ptr<IWorkload> IWorkloadFactory::CreateTranspose(const TransposeQueueDescriptor& /*descriptor*/,
+                                                             const WorkloadInfo& /*info*/) const
+{
+    return std::unique_ptr<IWorkload>();
+}
+
 std::unique_ptr<IWorkload> IWorkloadFactory::CreateTransposeConvolution2d(
     const TransposeConvolution2dQueueDescriptor& /*descriptor*/,
     const WorkloadInfo& /*info*/) const
diff --git a/src/backends/backendsCommon/WorkloadFactory.hpp b/src/backends/backendsCommon/WorkloadFactory.hpp
index e1cdff6..dae58b6 100644
--- a/src/backends/backendsCommon/WorkloadFactory.hpp
+++ b/src/backends/backendsCommon/WorkloadFactory.hpp
@@ -235,6 +235,9 @@
     virtual std::unique_ptr<IWorkload> CreateSwitch(const SwitchQueueDescriptor& descriptor,
                                                     const WorkloadInfo& Info) const;
 
+    virtual std::unique_ptr<IWorkload> CreateTranspose(const TransposeQueueDescriptor& descriptor,
+                                                       const WorkloadInfo& info) const;
+
     virtual std::unique_ptr<IWorkload> CreateTransposeConvolution2d(
         const TransposeConvolution2dQueueDescriptor& descriptor,
         const WorkloadInfo& info) const;
diff --git a/src/backends/backendsCommon/WorkloadFactoryBase.hpp b/src/backends/backendsCommon/WorkloadFactoryBase.hpp
index 9602cc3..960dbd3 100644
--- a/src/backends/backendsCommon/WorkloadFactoryBase.hpp
+++ b/src/backends/backendsCommon/WorkloadFactoryBase.hpp
@@ -266,6 +266,10 @@
                                             const WorkloadInfo& /*info*/) const override
     { return nullptr; }
 
+    std::unique_ptr<IWorkload> CreateTranspose(const TransposeQueueDescriptor& /*descriptor*/,
+                                               const WorkloadInfo& /*info*/) const override
+    { return nullptr; }
+
     std::unique_ptr<IWorkload> CreateTransposeConvolution2d(const TransposeConvolution2dQueueDescriptor& /*descriptor*/,
                                                             const WorkloadInfo& /*info*/) const override
     { return nullptr; }
diff --git a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
index 1a899aa..395a63d 100644
--- a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
+++ b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
@@ -537,6 +537,8 @@
 
 DECLARE_LAYER_POLICY_1_PARAM(Switch)
 
+DECLARE_LAYER_POLICY_2_PARAM(Transpose)
+
 DECLARE_LAYER_POLICY_2_PARAM(TransposeConvolution2d)
 
 
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index eba7944..62a66df 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -53,3 +53,4 @@
 #include <backendsCommon/test/layerTests/StridedSliceTestImpl.hpp>
 #include <backendsCommon/test/layerTests/SubtractionTestImpl.hpp>
 #include <backendsCommon/test/layerTests/TransposeConvolution2dTestImpl.hpp>
+#include <backendsCommon/test/layerTests/TransposeTestImpl.hpp>
diff --git a/src/backends/backendsCommon/test/layerTests/TransposeTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/TransposeTestImpl.hpp
new file mode 100644
index 0000000..3949dcc
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/TransposeTestImpl.hpp
@@ -0,0 +1,240 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <ResolveType.hpp>
+
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+template<typename T>
+LayerTestResult<T, 4> SimpleTransposeTestImpl(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        armnn::TransposeDescriptor descriptor,
+        armnn::TensorInfo inputTensorInfo,
+        armnn::TensorInfo outputTensorInfo,
+        const std::vector<T>& inputData,
+        const std::vector<T>& outputExpectedData)
+{
+    boost::ignore_unused(memoryManager);
+    auto input = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputExpectedData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::TransposeQueueDescriptor data;
+    data.m_Parameters = descriptor;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateTranspose(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleTransposeTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = { 1, 2, 2, 2 };
+    unsigned int outputShape[] = { 1, 2, 2, 2 };
+
+    armnn::TransposeDescriptor descriptor;
+    descriptor.m_DimMappings = {0U, 2U, 3U, 1U};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.5f);
+        inputTensorInfo.SetQuantizationOffset(5);
+        outputTensorInfo.SetQuantizationScale(0.5f);
+        outputTensorInfo.SetQuantizationOffset(5);
+    }
+
+    std::vector<T> input = std::vector<T>(
+    {
+        1, 2,
+        3, 4,
+        5, 6,
+        7, 8
+    });
+
+    std::vector<T> outputExpected = std::vector<T>(
+    {
+        1, 5, 2, 6,
+        3, 7, 4, 8
+    });
+
+    return SimpleTransposeTestImpl<T>(workloadFactory, memoryManager,
+                                    descriptor, inputTensorInfo,
+                                    outputTensorInfo, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> TransposeValueSet1Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = { 1, 2, 2, 3 };
+    unsigned int outputShape[] = { 1, 3, 2, 2 };
+
+    armnn::TransposeDescriptor descriptor;
+    descriptor.m_DimMappings = {0U, 3U, 1U, 2U};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.5f);
+        inputTensorInfo.SetQuantizationOffset(5);
+        outputTensorInfo.SetQuantizationScale(0.5f);
+        outputTensorInfo.SetQuantizationOffset(5);
+    }
+
+    std::vector<T> input = std::vector<T>(
+    {
+         1,  2,  3,
+        11, 12, 13,
+        21, 22, 23,
+        31, 32, 33
+    });
+
+    std::vector<T> outputExpected = std::vector<T>(
+    {
+        1, 11, 21, 31,
+        2, 12, 22, 32,
+        3, 13, 23, 33
+    });
+
+    return SimpleTransposeTestImpl<T>(workloadFactory, memoryManager,
+                                    descriptor, inputTensorInfo,
+                                    outputTensorInfo, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> TransposeValueSet2Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = { 1, 3, 2, 2 };
+    unsigned int outputShape[] = { 1, 2, 2, 3 };
+
+    armnn::TransposeDescriptor descriptor;
+    descriptor.m_DimMappings = {0U, 2U, 3U, 1U};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.5f);
+        inputTensorInfo.SetQuantizationOffset(5);
+        outputTensorInfo.SetQuantizationScale(0.5f);
+        outputTensorInfo.SetQuantizationOffset(5);
+    }
+
+    std::vector<T> input = std::vector<T>(
+    {
+        1, 11, 21, 31,
+        2, 12, 22, 32,
+        3, 13, 23, 33
+    });
+
+    std::vector<T> outputExpected = std::vector<T>(
+    {
+         1,  2,  3,
+        11, 12, 13,
+        21, 22, 23,
+        31, 32, 33,
+    });
+
+    return SimpleTransposeTestImpl<T>(workloadFactory, memoryManager,
+                                    descriptor, inputTensorInfo,
+                                    outputTensorInfo, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> TransposeValueSet3Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = { 1, 2, 3, 3 };
+    unsigned int outputShape[] = { 1, 3, 2, 3 };
+
+    armnn::TransposeDescriptor descriptor;
+    descriptor.m_DimMappings = {0U, 3U, 1U, 2U};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.5f);
+        inputTensorInfo.SetQuantizationOffset(5);
+        outputTensorInfo.SetQuantizationScale(0.5f);
+        outputTensorInfo.SetQuantizationOffset(5);
+    }
+
+    std::vector<T> input = std::vector<T>(
+    {
+         1,  2,  3,
+        11, 12, 13,
+        21, 22, 23,
+        31, 32, 33,
+        41, 42, 43,
+        51, 52, 53
+    });
+
+    std::vector<T> outputExpected = std::vector<T>(
+    {
+        1, 11, 21, 31, 41, 51,
+        2, 12, 22, 32, 42, 52,
+        3, 13, 23, 33, 43, 53
+    });
+
+    return SimpleTransposeTestImpl<T>(workloadFactory, memoryManager,
+                                      descriptor, inputTensorInfo,
+                                      outputTensorInfo, input, outputExpected);
+}
diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp
index e8548e4..d3ac986 100644
--- a/src/backends/cl/ClLayerSupport.cpp
+++ b/src/backends/cl/ClLayerSupport.cpp
@@ -60,6 +60,7 @@
 #include "workloads/ClStridedSliceWorkload.hpp"
 #include "workloads/ClSubtractionWorkload.hpp"
 #include "workloads/ClTransposeConvolution2dWorkload.hpp"
+#include "workloads/ClTransposeWorkload.hpp"
 #endif
 
 using namespace boost;
@@ -819,4 +820,12 @@
                                    biases);
 }
 
+bool ClLayerSupport::IsTransposeSupported(const TensorInfo& input,
+                                          const TensorInfo& output,
+                                          const TransposeDescriptor& descriptor,
+                                          Optional<std::string&> reasonIfUnsupported) const
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(ClTransposeWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
+}
+
 } // namespace armnn
diff --git a/src/backends/cl/ClLayerSupport.hpp b/src/backends/cl/ClLayerSupport.hpp
index 819d086..60899d0 100644
--- a/src/backends/cl/ClLayerSupport.hpp
+++ b/src/backends/cl/ClLayerSupport.hpp
@@ -286,6 +286,12 @@
                                            const TensorInfo& weights,
                                            const Optional<TensorInfo>& biases,
                                            Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
+    bool IsTransposeSupported(const TensorInfo& input,
+                              const TensorInfo& output,
+                              const TransposeDescriptor& descriptor,
+                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
 };
 
 } // namespace armnn
diff --git a/src/backends/cl/ClWorkloadFactory.cpp b/src/backends/cl/ClWorkloadFactory.cpp
index 4bb2e2a..21c2629 100644
--- a/src/backends/cl/ClWorkloadFactory.cpp
+++ b/src/backends/cl/ClWorkloadFactory.cpp
@@ -534,6 +534,12 @@
     return MakeWorkload<ClSubtractionWorkload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> ClWorkloadFactory::CreateTranspose(const TransposeQueueDescriptor& descriptor,
+                                                              const WorkloadInfo& info) const
+{
+    return MakeWorkload<ClTransposeWorkload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> ClWorkloadFactory::CreateTransposeConvolution2d(
     const TransposeConvolution2dQueueDescriptor& descriptor,
     const WorkloadInfo& info) const
diff --git a/src/backends/cl/ClWorkloadFactory.hpp b/src/backends/cl/ClWorkloadFactory.hpp
index 980be91..a716801 100644
--- a/src/backends/cl/ClWorkloadFactory.hpp
+++ b/src/backends/cl/ClWorkloadFactory.hpp
@@ -210,6 +210,9 @@
     std::unique_ptr<IWorkload> CreateSubtraction(const SubtractionQueueDescriptor& descriptor,
                                                  const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateTranspose(const TransposeQueueDescriptor& descriptor,
+                                               const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateTransposeConvolution2d(const TransposeConvolution2dQueueDescriptor& descriptor,
                                                             const WorkloadInfo& info) const override;
 
diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk
index 4182b94..e326add 100644
--- a/src/backends/cl/backend.mk
+++ b/src/backends/cl/backend.mk
@@ -67,7 +67,8 @@
         workloads/ClStackWorkload.cpp \
         workloads/ClStridedSliceWorkload.cpp \
         workloads/ClSubtractionWorkload.cpp \
-        workloads/ClTransposeConvolution2dWorkload.cpp
+        workloads/ClTransposeConvolution2dWorkload.cpp \
+        workloads/ClTransposeWorkload.cpp
 else
 
 # ARMNN_COMPUTE_CL_ENABLED == 0
diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp
index cfec81a..d8b0fd1 100644
--- a/src/backends/cl/test/ClLayerTests.cpp
+++ b/src/backends/cl/test/ClLayerTests.cpp
@@ -770,6 +770,20 @@
 ARMNN_AUTO_TEST_CASE(DequantizeSimpleUint8ToFp16, DequantizeSimpleUint8ToFp16Test)
 ARMNN_AUTO_TEST_CASE(DequantizeSimpleInt16ToFp16, DequantizeSimpleInt16ToFp16Test)
 
+// Transpose
+ARMNN_AUTO_TEST_CASE(SimpleTransposeFloat32, SimpleTransposeTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(TransposeFloat32ValueSet1Test, TransposeValueSet1Test<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(TransposeFloat32ValueSet2Test, TransposeValueSet2Test<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(TransposeFloat32ValueSet3Test, TransposeValueSet3Test<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(SimpleTransposeQASymm8, SimpleTransposeTest<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(TransposeQASymm8ValueSet1Test, TransposeValueSet1Test<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(TransposeQASymm8ValueSet2Test, TransposeValueSet2Test<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(TransposeQASymm8ValueSet3Test, TransposeValueSet3Test<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(SimpleTransposeQSymm16, SimpleTransposeTest<DataType::QSymmS16>)
+ARMNN_AUTO_TEST_CASE(TransposeQSymm16ValueSet1Test, TransposeValueSet1Test<DataType::QSymmS16>)
+ARMNN_AUTO_TEST_CASE(TransposeQSymm16ValueSet2Test, TransposeValueSet2Test<DataType::QSymmS16>)
+ARMNN_AUTO_TEST_CASE(TransposeQSymm16ValueSet3Test, TransposeValueSet3Test<DataType::QSymmS16>)
+
 // TransposeConvolution2d
 ARMNN_AUTO_TEST_CASE(SimpleTransposeConvolution2dFloatNchw,
                      SimpleTransposeConvolution2dTest<DataType::Float32, DataType::Float32>,
diff --git a/src/backends/cl/workloads/CMakeLists.txt b/src/backends/cl/workloads/CMakeLists.txt
index de62ca9..17d69b1 100644
--- a/src/backends/cl/workloads/CMakeLists.txt
+++ b/src/backends/cl/workloads/CMakeLists.txt
@@ -96,6 +96,8 @@
     ClSubtractionWorkload.hpp
     ClTransposeConvolution2dWorkload.cpp
     ClTransposeConvolution2dWorkload.hpp
+    ClTransposeWorkload.cpp
+    ClTransposeWorkload.hpp
     ClWorkloads.hpp
     ClWorkloadUtils.hpp
 )
diff --git a/src/backends/cl/workloads/ClTransposeWorkload.cpp b/src/backends/cl/workloads/ClTransposeWorkload.cpp
new file mode 100644
index 0000000..b276b22
--- /dev/null
+++ b/src/backends/cl/workloads/ClTransposeWorkload.cpp
@@ -0,0 +1,49 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClTransposeWorkload.hpp"
+#include <cl/ClTensorHandle.hpp>
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/core/Error.h>
+
+#include "ClWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+arm_compute::Status ClTransposeWorkloadValidate(const TensorInfo& input,
+                                                const TensorInfo& output,
+                                                const TransposeDescriptor& descriptor)
+{
+    const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+    const armnn::PermutationVector& mappings = descriptor.m_DimMappings;
+
+    return arm_compute::CLPermute::validate(&aclInputInfo, &aclOutputInfo,
+                                            armcomputetensorutils::BuildArmComputeTransposeVector(mappings));
+}
+
+ClTransposeWorkload::ClTransposeWorkload(const TransposeQueueDescriptor& descriptor,
+                                         const WorkloadInfo& info)
+    : BaseWorkload<TransposeQueueDescriptor>(descriptor, info)
+{
+    m_Data.ValidateInputsOutputs(GetName(), 1, 1);
+
+    const arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+    const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
+    // Run the layer.
+    m_PermuteFunction.configure(&input, &output,
+                                armcomputetensorutils::BuildArmComputeTransposeVector(mappings));
+}
+
+void ClTransposeWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_CL(GetName() + "_Execute");
+    RunClFunction(m_PermuteFunction, CHECK_LOCATION());
+}
+
+} // namespace armnn
diff --git a/src/backends/cl/workloads/ClTransposeWorkload.hpp b/src/backends/cl/workloads/ClTransposeWorkload.hpp
new file mode 100644
index 0000000..c1bed93
--- /dev/null
+++ b/src/backends/cl/workloads/ClTransposeWorkload.hpp
@@ -0,0 +1,40 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+
+#include <armnn/TypesUtils.hpp>
+#include <arm_compute/runtime/CL/functions/CLPermute.h>
+
+#include <string>
+
+namespace armnn
+{
+
+arm_compute::Status ClTransposeWorkloadValidate(const TensorInfo& input,
+                                                const TensorInfo& output,
+                                                const TransposeDescriptor& descriptor);
+
+class ClTransposeWorkload : public BaseWorkload<TransposeQueueDescriptor>
+{
+public:
+    static const std::string& GetName()
+    {
+        static const std::string name = std::string("ClTransposeWorkload");
+        return name;
+    }
+
+    ClTransposeWorkload(const TransposeQueueDescriptor& descriptor, const WorkloadInfo& info);
+    void Execute() const override;
+
+private:
+    using BaseWorkload<TransposeQueueDescriptor>::m_Data;
+    mutable arm_compute::CLPermute m_PermuteFunction;
+};
+
+} // namespace armnn
diff --git a/src/backends/cl/workloads/ClWorkloads.hpp b/src/backends/cl/workloads/ClWorkloads.hpp
index 014dc3f..ec193d5 100644
--- a/src/backends/cl/workloads/ClWorkloads.hpp
+++ b/src/backends/cl/workloads/ClWorkloads.hpp
@@ -49,3 +49,4 @@
 #include "ClConvertFp16ToFp32Workload.hpp"
 #include "ClConvertFp32ToFp16Workload.hpp"
 #include "ClTransposeConvolution2dWorkload.hpp"
+#include "ClTransposeWorkload.hpp"
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index 3c161d5..7e58dab 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -59,6 +59,7 @@
 #include "workloads/NeonStridedSliceWorkload.hpp"
 #include "workloads/NeonSubtractionWorkload.hpp"
 #include "workloads/NeonTransposeConvolution2dWorkload.hpp"
+#include "workloads/NeonTransposeWorkload.hpp"
 #endif
 
 using namespace boost;
@@ -803,4 +804,12 @@
                                    biases);
 }
 
+bool NeonLayerSupport::IsTransposeSupported(const TensorInfo& input,
+                                            const TensorInfo& output,
+                                            const TransposeDescriptor& descriptor,
+                                            Optional<std::string&> reasonIfUnsupported) const
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(NeonTransposeWorkloadValidate, reasonIfUnsupported, input, output, descriptor);
+}
+
 } // namespace armnn
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
index 9cb64ea..f45db35 100644
--- a/src/backends/neon/NeonLayerSupport.hpp
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -288,6 +288,11 @@
                                            const Optional<TensorInfo>& biases,
                                            Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsTransposeSupported(const TensorInfo& input,
+                              const TensorInfo& output,
+                              const TransposeDescriptor& descriptor,
+                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
 }; // class NeonLayerSupport
 
 } // namespace armnn
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index c3e0dc8..dc3ee84 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -503,6 +503,12 @@
     return std::make_unique<NeonSubtractionWorkload>(descriptor, info);
 }
 
+std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateTranspose(const TransposeQueueDescriptor& descriptor,
+                                                                       const WorkloadInfo& info) const
+{
+    return std::make_unique<NeonTransposeWorkload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateTransposeConvolution2d(
     const TransposeConvolution2dQueueDescriptor &descriptor,
     const WorkloadInfo &info) const
diff --git a/src/backends/neon/NeonWorkloadFactory.hpp b/src/backends/neon/NeonWorkloadFactory.hpp
index 95271e2..bc4107d 100644
--- a/src/backends/neon/NeonWorkloadFactory.hpp
+++ b/src/backends/neon/NeonWorkloadFactory.hpp
@@ -213,6 +213,9 @@
     std::unique_ptr<IWorkload> CreateSubtraction(const SubtractionQueueDescriptor& descriptor,
                                                  const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateTranspose(const TransposeQueueDescriptor& descriptor,
+                                               const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateTransposeConvolution2d(const TransposeConvolution2dQueueDescriptor& descriptor,
                                                             const WorkloadInfo& info) const override;
 
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index 1c572e6..d9a5405 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -67,7 +67,8 @@
         workloads/NeonStackWorkload.cpp \
         workloads/NeonStridedSliceWorkload.cpp \
         workloads/NeonSubtractionWorkload.cpp \
-        workloads/NeonTransposeConvolution2dWorkload.cpp
+        workloads/NeonTransposeConvolution2dWorkload.cpp \
+        workloads/NeonTransposeWorkload.cpp
 
 else
 
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 18658a3..482bc25 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -891,6 +891,20 @@
 ARMNN_AUTO_TEST_CASE(StackOutput3DInputs3, StackOutput3DInputs3Float32Test)
 ARMNN_AUTO_TEST_CASE(StackOutput5D,        StackOutput5DFloat32Test)
 
+// Transpose
+ARMNN_AUTO_TEST_CASE(SimpleTransposeFloat32, SimpleTransposeTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(TransposeFloat32ValueSet1Test, TransposeValueSet1Test<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(TransposeFloat32ValueSet2Test, TransposeValueSet2Test<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(TransposeFloat32ValueSet3Test, TransposeValueSet3Test<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(SimpleTransposeQASymm8, SimpleTransposeTest<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(TransposeQASymm8ValueSet1Test, TransposeValueSet1Test<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(TransposeQASymm8ValueSet2Test, TransposeValueSet2Test<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(TransposeQASymm8ValueSet3Test, TransposeValueSet3Test<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(SimpleTransposeQSymm16, SimpleTransposeTest<DataType::QSymmS16>)
+ARMNN_AUTO_TEST_CASE(TransposeQSymm16ValueSet1Test, TransposeValueSet1Test<DataType::QSymmS16>)
+ARMNN_AUTO_TEST_CASE(TransposeQSymm16ValueSet2Test, TransposeValueSet2Test<DataType::QSymmS16>)
+ARMNN_AUTO_TEST_CASE(TransposeQSymm16ValueSet3Test, TransposeValueSet3Test<DataType::QSymmS16>)
+
 // TransposeConvolution2d
 ARMNN_AUTO_TEST_CASE(SimpleTransposeConvolution2dFloatNchw,
                      SimpleTransposeConvolution2dTest<DataType::Float32, DataType::Float32>,
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
index 02ffedc..a932f8b 100644
--- a/src/backends/neon/workloads/CMakeLists.txt
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -98,6 +98,8 @@
     NeonSubtractionWorkload.hpp
     NeonTransposeConvolution2dWorkload.cpp
     NeonTransposeConvolution2dWorkload.hpp
+    NeonTransposeWorkload.cpp
+    NeonTransposeWorkload.hpp
     NeonWorkloads.hpp
     NeonWorkloadUtils.hpp
 )
diff --git a/src/backends/neon/workloads/NeonTransposeWorkload.cpp b/src/backends/neon/workloads/NeonTransposeWorkload.cpp
new file mode 100644
index 0000000..c11f2df
--- /dev/null
+++ b/src/backends/neon/workloads/NeonTransposeWorkload.cpp
@@ -0,0 +1,48 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonTransposeWorkload.hpp"
+#include <neon/NeonTensorHandle.hpp>
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <arm_compute/core/Error.h>
+
+namespace armnn
+{
+
+arm_compute::Status NeonTransposeWorkloadValidate(const TensorInfo& input,
+                                                  const TensorInfo& output,
+                                                  const TransposeDescriptor& descriptor)
+{
+    const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input);
+    const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+    const armnn::PermutationVector& mappings = descriptor.m_DimMappings;
+
+    return arm_compute::NEPermute::validate(&aclInputInfo, &aclOutputInfo,
+                                            armcomputetensorutils::BuildArmComputeTransposeVector(mappings));
+}
+
+NeonTransposeWorkload::NeonTransposeWorkload(const TransposeQueueDescriptor& descriptor,
+                                             const WorkloadInfo& info)
+        : BaseWorkload<TransposeQueueDescriptor>(descriptor, info)
+{
+    m_Data.ValidateInputsOutputs(GetName(), 1, 1);
+
+    const arm_compute::ITensor& input = static_cast<IAclTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ITensor& output = static_cast<IAclTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+    const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
+
+    // Run the layer.
+    m_PermuteFunction.configure(&input, &output,
+                                armcomputetensorutils::BuildArmComputeTransposeVector(mappings));
+}
+
+void NeonTransposeWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_NEON(GetName() + "_Execute");
+    m_PermuteFunction.run();
+}
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonTransposeWorkload.hpp b/src/backends/neon/workloads/NeonTransposeWorkload.hpp
new file mode 100644
index 0000000..aab7b70
--- /dev/null
+++ b/src/backends/neon/workloads/NeonTransposeWorkload.hpp
@@ -0,0 +1,39 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+#include <neon/workloads/NeonWorkloadUtils.hpp>
+
+#include <armnn/TypesUtils.hpp>
+#include <arm_compute/runtime/NEON/functions/NEPermute.h>
+
+#include <string>
+
+namespace armnn
+{
+arm_compute::Status NeonTransposeWorkloadValidate(const TensorInfo& input, const TensorInfo& output,
+                                                  const TransposeDescriptor& descriptor);
+
+class NeonTransposeWorkload : public BaseWorkload<TransposeQueueDescriptor>
+{
+public:
+    static const std::string& GetName()
+    {
+        static const std::string name = std::string("NeonTransposeWorkload");
+        return name;
+    }
+
+    NeonTransposeWorkload(const TransposeQueueDescriptor& descriptor, const WorkloadInfo& info);
+    void Execute() const override;
+
+private:
+    using BaseWorkload<TransposeQueueDescriptor>::m_Data;
+    mutable arm_compute::NEPermute m_PermuteFunction;
+};
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index b08483c..52cd76f 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -50,3 +50,4 @@
 #include "NeonStridedSliceWorkload.hpp"
 #include "NeonSubtractionWorkload.hpp"
 #include "NeonTransposeConvolution2dWorkload.hpp"
+#include "NeonTransposeWorkload.hpp"
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index 8f1f170..25334c3 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -1388,9 +1388,10 @@
     bool supported = true;
 
     // Define supported output and inputs types.
-    std::array<DataType,3> supportedTypes =
+    std::array<DataType, 4> supportedTypes =
     {
         DataType::Float32,
+        DataType::Float16,
         DataType::QAsymmU8,
         DataType::QSymmS16
     };
@@ -1912,4 +1913,33 @@
     return supported;
 }
 
+bool RefLayerSupport::IsTransposeSupported(const TensorInfo& input,
+                                           const TensorInfo& output,
+                                           const TransposeDescriptor& descriptor,
+                                           Optional<std::string&> reasonIfUnsupported) const
+{
+    ignore_unused(descriptor);
+    bool supported = true;
+
+    // Define supported output and inputs types.
+    std::array<DataType, 4> supportedTypes =
+    {
+        DataType::Float32,
+        DataType::Float16,
+        DataType::QAsymmU8,
+        DataType::QSymmS16
+    };
+
+    supported &= CheckSupportRule(TypeAnyOf(input, supportedTypes), reasonIfUnsupported,
+                                  "Reference transpose: input is not a supported type.");
+
+    supported &= CheckSupportRule(TypeAnyOf(output, supportedTypes), reasonIfUnsupported,
+                                  "Reference transpose: output is not a supported type.");
+
+    supported &= CheckSupportRule(TypesAreEqual(input, output), reasonIfUnsupported,
+                                  "Reference transpose: input and output types are mismatched.");
+
+    return supported;
+}
+
 } // namespace armnn
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index 1551a55..27f3f81 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -318,6 +318,12 @@
         const TensorInfo& weights,
         const Optional<TensorInfo>& biases,
         Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
+    bool IsTransposeSupported(const TensorInfo& input,
+                              const TensorInfo& output,
+                              const TransposeDescriptor& descriptor,
+                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
 };
 
 } // namespace armnn
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 02dbbab..2a415bf 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -561,6 +561,17 @@
     return std::make_unique<RefSubtractionWorkload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> RefWorkloadFactory::CreateTranspose(const TransposeQueueDescriptor& descriptor,
+                                                               const WorkloadInfo& info) const
+{
+    if (IsQSymmS16(info))
+    {
+        return std::make_unique<RefTransposeQSymm16Workload>(descriptor, info);
+    }
+    return MakeWorkloadHelper<RefTransposeFloat16Workload, RefTransposeFloat32Workload, RefTransposeQAsymm8Workload,
+            NullWorkload, NullWorkload, NullWorkload>(descriptor, info);
+}
+
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateTransposeConvolution2d(
     const TransposeConvolution2dQueueDescriptor& descriptor,
     const WorkloadInfo& info) const
diff --git a/src/backends/reference/RefWorkloadFactory.hpp b/src/backends/reference/RefWorkloadFactory.hpp
index b5b9b0f..030ce6f 100644
--- a/src/backends/reference/RefWorkloadFactory.hpp
+++ b/src/backends/reference/RefWorkloadFactory.hpp
@@ -236,6 +236,9 @@
     std::unique_ptr<IWorkload> CreateSubtraction(const SubtractionQueueDescriptor& descriptor,
                                                  const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateTranspose(const TransposeQueueDescriptor& descriptor,
+                                               const WorkloadInfo& info) const override;
+
     std::unique_ptr<IWorkload> CreateTransposeConvolution2d(const TransposeConvolution2dQueueDescriptor& descriptor,
                                                             const WorkloadInfo& info) const override;
 
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 1987bd5..010d548 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -85,6 +85,7 @@
         workloads/RefStridedSliceWorkload.cpp \
         workloads/RefSplitterWorkload.cpp \
         workloads/RefTransposeConvolution2dWorkload.cpp \
+        workloads/RefTransposeWorkload.cpp \
         workloads/Resize.cpp \
         workloads/Slice.cpp \
         workloads/SpaceToBatchNd.cpp \
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index d5c67ef..ed2b995 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -1460,6 +1460,20 @@
 ARMNN_AUTO_TEST_CASE(Slice2dInt16, Slice2dInt16Test)
 ARMNN_AUTO_TEST_CASE(Slice1dInt16, Slice1dInt16Test)
 
+// Transpose
+ARMNN_AUTO_TEST_CASE(SimpleTransposeFloat32, SimpleTransposeTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(TransposeFloat32ValueSet1Test, TransposeValueSet1Test<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(TransposeFloat32ValueSet2Test, TransposeValueSet2Test<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(TransposeFloat32ValueSet3Test, TransposeValueSet3Test<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(SimpleTransposeQASymm8, SimpleTransposeTest<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(TransposeQASymm8ValueSet1Test, TransposeValueSet1Test<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(TransposeQASymm8ValueSet2Test, TransposeValueSet2Test<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(TransposeQASymm8ValueSet3Test, TransposeValueSet3Test<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE(SimpleTransposeQSymm16, SimpleTransposeTest<DataType::QSymmS16>)
+ARMNN_AUTO_TEST_CASE(TransposeQSymm16ValueSet1Test, TransposeValueSet1Test<DataType::QSymmS16>)
+ARMNN_AUTO_TEST_CASE(TransposeQSymm16ValueSet2Test, TransposeValueSet2Test<DataType::QSymmS16>)
+ARMNN_AUTO_TEST_CASE(TransposeQSymm16ValueSet3Test, TransposeValueSet3Test<DataType::QSymmS16>)
+
 // TransposeConvolution2d
 ARMNN_AUTO_TEST_CASE(SimpleTransposeConvolution2dFloatNchw,
                      SimpleTransposeConvolution2dTest<DataType::Float32, DataType::Float32>,
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index 6795204..b2d8938 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -141,6 +141,8 @@
     RefStridedSliceWorkload.hpp
     RefTransposeConvolution2dWorkload.cpp
     RefTransposeConvolution2dWorkload.hpp
+    RefTransposeWorkload.cpp
+    RefTransposeWorkload.hpp
     RefWorkloads.hpp
     RefWorkloadUtils.hpp
     Resize.cpp
diff --git a/src/backends/reference/workloads/RefTransposeWorkload.cpp b/src/backends/reference/workloads/RefTransposeWorkload.cpp
new file mode 100644
index 0000000..6bdfb21
--- /dev/null
+++ b/src/backends/reference/workloads/RefTransposeWorkload.cpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefTransposeWorkload.hpp"
+#include "RefWorkloadUtils.hpp"
+
+#include <armnnUtils/Transpose.hpp>
+
+#include <ResolveType.hpp>
+
+namespace armnn
+{
+
+template <armnn::DataType DataType>
+void RefTransposeWorkload<DataType>::Execute() const
+{
+    using T = ResolveType<DataType>;
+
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, GetName() + "_Execute");
+
+    const ITensorHandle*     src      = m_Data.m_Inputs[0];
+    ITensorHandle*           dst      = m_Data.m_Outputs[0];
+    const PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
+
+    armnnUtils::Transpose(GetTensorInfo(src).GetShape(), mappings, src->Map(), dst->Map(), sizeof(T));
+}
+
+template class RefTransposeWorkload<DataType::Float16>;
+template class RefTransposeWorkload<DataType::Float32>;
+template class RefTransposeWorkload<DataType::QAsymmU8>;
+template class RefTransposeWorkload<DataType::QSymmS16>;
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefTransposeWorkload.hpp b/src/backends/reference/workloads/RefTransposeWorkload.hpp
new file mode 100644
index 0000000..4b1c3d3
--- /dev/null
+++ b/src/backends/reference/workloads/RefTransposeWorkload.hpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2020 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+
+#include <armnn/TypesUtils.hpp>
+
+namespace armnn
+{
+
+template <armnn::DataType DataType>
+class RefTransposeWorkload : public TypedWorkload<TransposeQueueDescriptor, DataType>
+{
+public:
+    static const std::string& GetName()
+    {
+        static const std::string name = std::string("RefTranspose") + GetDataTypeName(DataType) + "Workload";
+        return name;
+    }
+
+    using TypedWorkload<TransposeQueueDescriptor, DataType>::m_Data;
+    using TypedWorkload<TransposeQueueDescriptor, DataType>::TypedWorkload;
+    void Execute() const override;
+};
+
+using RefTransposeFloat16Workload = RefTransposeWorkload<DataType::Float16>;
+using RefTransposeFloat32Workload = RefTransposeWorkload<DataType::Float32>;
+using RefTransposeQAsymm8Workload = RefTransposeWorkload<DataType::QAsymmU8>;
+using RefTransposeQSymm16Workload = RefTransposeWorkload<DataType::QSymmS16>;
+
+} //namespace armnn
\ No newline at end of file
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index 7034b67..a0558ff 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -58,6 +58,7 @@
 #include "RefStridedSliceWorkload.hpp"
 #include "RefSpaceToDepthWorkload.hpp"
 #include "RefTransposeConvolution2dWorkload.hpp"
+#include "RefTransposeWorkload.hpp"
 #include "RefWorkloadUtils.hpp"
 #include "Resize.hpp"
 #include "Softmax.hpp"