IVGCVSW-3419 Add reference workload support for the new Stack layer

 * Added reference workload for the Stack layer
 * Added factory methods
 * Added validation support
 * Added unit tests

Signed-off-by: Matthew Jackson <matthew.jackson@arm.com>
Change-Id: Ib14b72c15f53a2a2ca152afc357ce2aa405ccc88
diff --git a/src/backends/backendsCommon/LayerSupportBase.cpp b/src/backends/backendsCommon/LayerSupportBase.cpp
index 26b98a2..e843423 100644
--- a/src/backends/backendsCommon/LayerSupportBase.cpp
+++ b/src/backends/backendsCommon/LayerSupportBase.cpp
@@ -415,7 +415,7 @@
     return DefaultLayerSupport(__func__, __FILE__, __LINE__, reasonIfUnsupported);
 }
 
-bool LayerSupportBase::IsStackSupported(const std::vector<const TensorInfo*> inputs,
+bool LayerSupportBase::IsStackSupported(const std::vector<const TensorInfo*>& inputs,
                                         const TensorInfo& output,
                                         const StackDescriptor& descriptor,
                                         Optional<std::string&> reasonIfUnsupported) const
diff --git a/src/backends/backendsCommon/LayerSupportBase.hpp b/src/backends/backendsCommon/LayerSupportBase.hpp
index dad0798..d49fc3e 100644
--- a/src/backends/backendsCommon/LayerSupportBase.hpp
+++ b/src/backends/backendsCommon/LayerSupportBase.hpp
@@ -257,7 +257,7 @@
                              const ViewsDescriptor& descriptor,
                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
-    bool IsStackSupported(const std::vector<const TensorInfo*> inputs,
+    bool IsStackSupported(const std::vector<const TensorInfo*>& inputs,
                           const TensorInfo& output,
                           const StackDescriptor& descriptor,
                           Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index d6747f5..3db826f 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -4483,3 +4483,389 @@
 LayerTestResult<int16_t, 4> UnbiasedStridedTransposeConvolution2dInt16NhwcTest(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+// Test helper: runs the factory's Stack workload on inputData (one flat vector per input, each laid
+// out as inputTensorInfo) and returns actual vs expected output. memoryManager is not used here.
+template<armnn::DataType ArmnnType, typename T, std::size_t outputDimLength>
+LayerTestResult<T, outputDimLength> StackTestHelper(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::TensorInfo& inputTensorInfo,
+        const armnn::TensorInfo& outputTensorInfo,
+        unsigned int axis,
+        const std::vector<std::vector<T>>& inputData,
+        const std::vector<T>& outputExpectedData)
+{
+    const unsigned int inputCount = static_cast<unsigned int>(inputData.size());
+    std::vector<boost::multi_array<T, outputDimLength-1>> inputTensors;
+    for (const std::vector<T>& flatInput : inputData)
+    {
+        inputTensors.push_back(MakeTensor<T, outputDimLength-1>(inputTensorInfo, flatInput));
+    }
+
+    LayerTestResult<T, outputDimLength> testResult(outputTensorInfo);
+    testResult.outputExpected = MakeTensor<T, outputDimLength>(outputTensorInfo, outputExpectedData);
+
+    std::vector<std::unique_ptr<armnn::ITensorHandle>> inHandles;
+    for (unsigned int idx = 0; idx < inputCount; ++idx)
+    {
+        inHandles.push_back(workloadFactory.CreateTensorHandle(inputTensorInfo));
+    }
+    std::unique_ptr<armnn::ITensorHandle> outHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::StackQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters.m_NumInputs = inputCount;
+    queueDescriptor.m_Parameters.m_InputShape = inputTensorInfo.GetShape();
+    queueDescriptor.m_Parameters.m_Axis = axis;
+
+    armnn::WorkloadInfo workloadInfo;
+    for (unsigned int idx = 0; idx < inputCount; ++idx)
+    {
+        armnn::ITensorHandle* const handle = inHandles[idx].get();
+        AddInputToWorkload(queueDescriptor, workloadInfo, inputTensorInfo, handle);
+        handle->Allocate();
+        CopyDataToITensorHandle(handle, inputTensors[idx].origin());
+    }
+
+    AddOutputToWorkload(queueDescriptor, workloadInfo, outputTensorInfo, outHandle.get());
+    outHandle->Allocate();
+
+    std::unique_ptr<armnn::IWorkload> stackWorkload = workloadFactory.CreateStack(queueDescriptor, workloadInfo);
+    stackWorkload->Execute();
+    CopyDataFromITensorHandle(testResult.output.origin(), outHandle.get());
+
+    return testResult;
+}
+
+// Stack test: two { 3, 2, 3 } inputs joined along axis 0 give a { 2, 3, 2, 3 } output in which
+// the second input simply follows the first.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Stack0AxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorInfo inputInfo ({ 3, 2, 3 }, ArmnnType);
+    const armnn::TensorInfo outputInfo({ 2, 3, 2, 3 }, ArmnnType);
+
+    const std::vector<std::vector<T>> stackInputs =
+    {
+        {
+            1, 2, 3,
+            4, 5, 6,
+
+            7, 8, 9,
+            10, 11, 12,
+
+            13, 14, 15,
+            16, 17, 18
+        },
+        {
+            19, 20, 21,
+            22, 23, 24,
+
+            25, 26, 27,
+            28, 29, 30,
+
+            31, 32, 33,
+            34, 35, 36
+        }
+    };
+
+    const std::vector<T> expectedOutput =
+    {
+        1, 2, 3,
+        4, 5, 6,
+
+        7, 8, 9,
+        10, 11, 12,
+
+        13, 14, 15,
+        16, 17, 18,
+
+
+        19, 20, 21,
+        22, 23, 24,
+
+        25, 26, 27,
+        28, 29, 30,
+
+        31, 32, 33,
+        34, 35, 36
+    };
+
+    const unsigned int stackAxis = 0U;
+
+    return StackTestHelper<ArmnnType, T, 4>(workloadFactory,
+                                            memoryManager,
+                                            inputInfo,
+                                            outputInfo,
+                                            stackAxis,
+                                            stackInputs,
+                                            expectedOutput);
+}
+
+// Stack test: two { 3, 2, 3 } inputs joined along axis 1 give a { 3, 2, 2, 3 } output; for each
+// index of the first dimension the 2x3 slice of input 0 is followed by the slice of input 1.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Stack4dOutput1AxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorInfo inputInfo ({ 3, 2, 3 }, ArmnnType);
+    const armnn::TensorInfo outputInfo({ 3, 2, 2, 3 }, ArmnnType);
+
+    const std::vector<std::vector<T>> stackInputs =
+    {
+        {
+            1, 2, 3,
+            4, 5, 6,
+
+            7, 8, 9,
+            10, 11, 12,
+
+            13, 14, 15,
+            16, 17, 18
+        },
+        {
+            19, 20, 21,
+            22, 23, 24,
+
+            25, 26, 27,
+            28, 29, 30,
+
+            31, 32, 33,
+            34, 35, 36
+        }
+    };
+
+    const std::vector<T> expectedOutput =
+    {
+        1, 2, 3,
+        4, 5, 6,
+
+        19, 20, 21,
+        22, 23, 24,
+
+
+        7, 8, 9,
+        10, 11, 12,
+
+        25, 26, 27,
+        28, 29, 30,
+
+
+        13, 14, 15,
+        16, 17, 18,
+
+        31, 32, 33,
+        34, 35, 36
+    };
+
+    const unsigned int stackAxis = 1U;
+
+    return StackTestHelper<ArmnnType, T, 4>(workloadFactory,
+                                            memoryManager,
+                                            inputInfo,
+                                            outputInfo,
+                                            stackAxis,
+                                            stackInputs,
+                                            expectedOutput);
+}
+
+// Stack test: two { 3, 2, 3 } inputs joined along axis 2 give a { 3, 2, 2, 3 } output; every
+// 3-element row of input 0 is immediately followed by the matching row of input 1.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Stack4dOutput2AxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorInfo inputInfo ({ 3, 2, 3 }, ArmnnType);
+    const armnn::TensorInfo outputInfo({ 3, 2, 2, 3 }, ArmnnType);
+
+    const std::vector<std::vector<T>> stackInputs =
+    {
+        {
+            1, 2, 3,
+            4, 5, 6,
+
+            7, 8, 9,
+            10, 11, 12,
+
+            13, 14, 15,
+            16, 17, 18
+        },
+        {
+            19, 20, 21,
+            22, 23, 24,
+
+            25, 26, 27,
+            28, 29, 30,
+
+            31, 32, 33,
+            34, 35, 36
+        }
+    };
+
+    const std::vector<T> expectedOutput =
+    {
+        1, 2, 3,
+        19, 20, 21,
+
+        4, 5, 6,
+        22, 23, 24,
+
+
+        7, 8, 9,
+        25, 26, 27,
+
+        10, 11, 12,
+        28, 29, 30,
+
+        13, 14, 15,
+        31, 32, 33,
+
+        16, 17, 18,
+        34, 35, 36
+    };
+
+    const unsigned int stackAxis = 2U;
+
+    return StackTestHelper<ArmnnType, T, 4>(workloadFactory,
+                                            memoryManager,
+                                            inputInfo,
+                                            outputInfo,
+                                            stackAxis,
+                                            stackInputs,
+                                            expectedOutput);
+}
+
+// Stack test: two { 3, 2, 3 } inputs joined along axis 3 give a { 3, 2, 3, 2 } output in which
+// the elements of input 0 and input 1 are interleaved one by one.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Stack4dOutput3AxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorInfo inputInfo ({ 3, 2, 3 }, ArmnnType);
+    const armnn::TensorInfo outputInfo({ 3, 2, 3, 2 }, ArmnnType);
+
+    const std::vector<std::vector<T>> stackInputs =
+    {
+        {
+            1, 2, 3,
+            4, 5, 6,
+
+            7, 8, 9,
+            10, 11, 12,
+
+            13, 14, 15,
+            16, 17, 18
+        },
+        {
+            19, 20, 21,
+            22, 23, 24,
+
+            25, 26, 27,
+            28, 29, 30,
+
+            31, 32, 33,
+            34, 35, 36
+        }
+    };
+
+    const std::vector<T> expectedOutput =
+    {
+        1, 19,
+        2, 20,
+        3, 21,
+
+        4, 22,
+        5, 23,
+        6, 24,
+
+
+        7, 25,
+        8, 26,
+        9, 27,
+
+        10, 28,
+        11, 29,
+        12, 30,
+
+
+        13, 31,
+        14, 32,
+        15, 33,
+
+        16, 34,
+        17, 35,
+        18, 36
+    };
+
+    const unsigned int stackAxis = 3U;
+
+    return StackTestHelper<ArmnnType, T, 4>(workloadFactory,
+                                            memoryManager,
+                                            inputInfo,
+                                            outputInfo,
+                                            stackAxis,
+                                            stackInputs,
+                                            expectedOutput);
+}
+
+// Stack test: three { 3, 3 } inputs joined along axis 1 give a { 3, 3, 3 } output; block r of
+// the output holds row r of input 0, then row r of input 1, then row r of input 2.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Stack3dOutput1Axis3InputTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorInfo inputInfo ({ 3, 3 }, ArmnnType);
+    const armnn::TensorInfo outputInfo({ 3, 3, 3 }, ArmnnType);
+
+    const std::vector<std::vector<T>> stackInputs =
+    {
+        {
+            1, 2, 3,
+            4, 5, 6,
+            7, 8, 9
+        },
+
+        {
+            10, 11, 12,
+            13, 14, 15,
+            16, 17, 18
+        },
+
+        {
+            19, 20, 21,
+            22, 23, 24,
+            25, 26, 27
+        }
+    };
+
+    const std::vector<T> expectedOutput =
+    {
+        1, 2, 3,
+        10, 11, 12,
+        19, 20, 21,
+
+        4, 5, 6,
+        13, 14, 15,
+        22, 23, 24,
+
+        7, 8, 9,
+        16, 17, 18,
+        25, 26, 27
+    };
+
+    const unsigned int stackAxis = 1U;
+
+    return StackTestHelper<ArmnnType, T, 3>(workloadFactory,
+                                            memoryManager,
+                                            inputInfo,
+                                            outputInfo,
+                                            stackAxis,
+                                            stackInputs,
+                                            expectedOutput);
+}
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index 59c14c4..b9aa126 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -419,6 +419,7 @@
                                   "Reference concatenation: output type not supported");
     for (const TensorInfo* input : inputs)
     {
+        BOOST_ASSERT(input != nullptr);
         supported &= CheckSupportRule(TypeAnyOf(*input, supportedTypes), reasonIfUnsupported,
             "Reference concatenation: input type not supported");
 
@@ -1592,6 +1593,36 @@
     return supported;
 }
 
+bool RefLayerSupport::IsStackSupported(const std::vector<const TensorInfo*>& inputs,
+                                       const TensorInfo& output,
+                                       const StackDescriptor& descriptor,
+                                       Optional<std::string&> reasonIfUnsupported) const
+{
+    // Only tensor data types are validated here; the descriptor is deliberately not inspected.
+    ignore_unused(descriptor);
+
+    std::array<DataType,3> stackTypes =
+    {
+        DataType::Float32,
+        DataType::QuantisedAsymm8,
+        DataType::QuantisedSymm16
+    };
+
+    bool allSupported = CheckSupportRule(TypeAnyOf(output, stackTypes), reasonIfUnsupported,
+                                         "Reference stack: output type not supported");
+    for (const TensorInfo* input : inputs)
+    {
+        BOOST_ASSERT(input != nullptr);
+        const bool typeKnown = CheckSupportRule(TypeAnyOf(*input, stackTypes), reasonIfUnsupported,
+            "Reference stack: input type not supported");
+        const bool typeMatch = CheckSupportRule(TypesAreEqual(*input, output), reasonIfUnsupported,
+            "Reference stack: input and output types mismatched.");
+        allSupported = allSupported && typeKnown && typeMatch;
+    }
+
+    return allSupported;
+}
+
 bool RefLayerSupport::IsStridedSliceSupported(const TensorInfo& input,
                                               const TensorInfo& output,
                                               const StridedSliceDescriptor& descriptor,
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index c0bf188..f8bbeb7 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -241,6 +241,11 @@
                              const ViewsDescriptor& descriptor,
                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsStackSupported(const std::vector<const TensorInfo*>& inputs,
+                          const TensorInfo& output,
+                          const StackDescriptor& descriptor,
+                          Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsStridedSliceSupported(const TensorInfo& input,
                                  const TensorInfo& output,
                                  const StridedSliceDescriptor& descriptor,
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 183103c..925eb6a 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -506,4 +506,14 @@
     return std::make_unique<RefTransposeConvolution2dWorkload>(descriptor, info);
 }
 
+std::unique_ptr<IWorkload> RefWorkloadFactory::CreateStack(const StackQueueDescriptor& descriptor,
+                                                           const WorkloadInfo& info) const
+{
+    if (!IsFloat16(info)) // anything IsFloat16() does not flag gets the real reference workload
+    {
+        return std::make_unique<RefStackWorkload>(descriptor, info);
+    }
+    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+}
+
 } // namespace armnn
diff --git a/src/backends/reference/RefWorkloadFactory.hpp b/src/backends/reference/RefWorkloadFactory.hpp
index 9ef1522..b012fbc 100644
--- a/src/backends/reference/RefWorkloadFactory.hpp
+++ b/src/backends/reference/RefWorkloadFactory.hpp
@@ -203,6 +203,9 @@
     std::unique_ptr<IWorkload> CreateTransposeConvolution2d(const TransposeConvolution2dQueueDescriptor& descriptor,
                                                             const WorkloadInfo& info) const override;
 
+    std::unique_ptr<IWorkload> CreateStack(const StackQueueDescriptor& descriptor,
+                                           const WorkloadInfo& info) const override;
+
 private:
 
     template <typename F32Workload, typename U8Workload, typename QueueDescriptorType>
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 411ab7e..6e1360a 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -62,6 +62,7 @@
         workloads/RefSoftmaxWorkload.cpp \
         workloads/RefSpaceToBatchNdWorkload.cpp \
         workloads/RefSpaceToDepthWorkload.cpp \
+        workloads/RefStackWorkload.cpp \
         workloads/RefStridedSliceWorkload.cpp \
         workloads/RefSplitterWorkload.cpp \
         workloads/RefTransposeConvolution2dWorkload.cpp \
@@ -69,6 +70,7 @@
         workloads/Rsqrt.cpp \
         workloads/SpaceToBatchNd.cpp \
         workloads/SpaceToDepth.cpp \
+        workloads/Stack.cpp \
         workloads/StridedSlice.cpp \
         workloads/StringMapping.cpp \
         workloads/Softmax.cpp \
diff --git a/src/backends/reference/test/RefCreateWorkloadTests.cpp b/src/backends/reference/test/RefCreateWorkloadTests.cpp
index 2fa6cbf..f7999d0 100644
--- a/src/backends/reference/test/RefCreateWorkloadTests.cpp
+++ b/src/backends/reference/test/RefCreateWorkloadTests.cpp
@@ -990,4 +990,41 @@
     RefCreateSpaceToDepthWorkloadTest<RefSpaceToDepthWorkload, armnn::DataType::QuantisedSymm16>();
 }
 
+// Creates a RefStackWorkload through CreateStackWorkloadTest and checks that the tensor info of
+// its first output matches outputShape and dataType.
+static void RefCreateStackWorkloadTest(const armnn::TensorShape& inputShape,
+                                       const armnn::TensorShape& outputShape,
+                                       unsigned int axis,
+                                       unsigned int numInputs,
+                                       armnn::DataType dataType)
+{
+    armnn::Graph stackGraph;
+    RefWorkloadFactory refFactory;
+
+    auto stackWorkload = CreateStackWorkloadTest<RefStackWorkload>(
+        refFactory, stackGraph, inputShape, outputShape, axis, numInputs, dataType);
+
+    // Inspect the first output handle recorded in the workload's queue descriptor
+    auto descriptor = stackWorkload->GetData();
+    auto handle = boost::polymorphic_downcast<RefTensorHandle*>(descriptor.m_Outputs[0]);
+
+    const TensorInfo expectedInfo(outputShape, dataType);
+    BOOST_TEST((handle->GetTensorInfo() == expectedInfo));
+}
+
+BOOST_AUTO_TEST_CASE(CreateStackFloat32Workload) // two { 3, 4, 5 } inputs stacked on axis 2, Float32
+{
+    RefCreateStackWorkloadTest({ 3, 4, 5 }, { 3, 4, 2, 5 }, 2, 2, armnn::DataType::Float32);
+}
+
+BOOST_AUTO_TEST_CASE(CreateStackUint8Workload) // same shapes and axis, QuantisedAsymm8 data
+{
+    RefCreateStackWorkloadTest({ 3, 4, 5 }, { 3, 4, 2, 5 }, 2, 2, armnn::DataType::QuantisedAsymm8);
+}
+
+BOOST_AUTO_TEST_CASE(CreateStackUint16Workload) // NOTE(review): uses QuantisedSymm16 - confirm the Uint16 name
+{
+    RefCreateStackWorkloadTest({ 3, 4, 5 }, { 3, 4, 2, 5 }, 2, 2, armnn::DataType::QuantisedSymm16);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 509dbf7..4f46d27 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -1175,4 +1175,11 @@
 ARMNN_AUTO_TEST_CASE(UnbiasedStridedTransposeConvolution2dInt16Nhwc, UnbiasedStridedTransposeConvolution2dInt16NhwcTest)
 ARMNN_AUTO_TEST_CASE(UnbiasedStridedTransposeConvolution2dInt16Nchw, UnbiasedStridedTransposeConvolution2dInt16NchwTest)
 
+// Stack (every case below is instantiated for Float32 only)
+ARMNN_AUTO_TEST_CASE(Stack0Axis,               Stack0AxisTest<armnn::DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(Stack4dOutput1Axis,       Stack4dOutput1AxisTest<armnn::DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(Stack4dOutput2Axis,       Stack4dOutput2AxisTest<armnn::DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(Stack4dOutput3Axis,       Stack4dOutput3AxisTest<armnn::DataType::Float32>)
+ARMNN_AUTO_TEST_CASE(Stack3dOutput1Axis3Input, Stack3dOutput1Axis3InputTest<armnn::DataType::Float32>)
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index 696605d..c9db057 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -109,6 +109,8 @@
     RefSpaceToDepthWorkload.hpp
     RefSplitterWorkload.cpp
     RefSplitterWorkload.hpp
+    RefStackWorkload.cpp
+    RefStackWorkload.hpp
     RefStridedSliceWorkload.cpp
     RefStridedSliceWorkload.hpp
     RefTransposeConvolution2dWorkload.cpp
@@ -127,6 +129,8 @@
     SpaceToDepth.cpp
     Splitter.hpp
     Splitter.cpp
+    Stack.cpp
+    Stack.hpp
     StridedSlice.hpp
     StridedSlice.cpp
     StringMapping.cpp
diff --git a/src/backends/reference/workloads/RefStackWorkload.cpp b/src/backends/reference/workloads/RefStackWorkload.cpp
new file mode 100644
index 0000000..be36f40
--- /dev/null
+++ b/src/backends/reference/workloads/RefStackWorkload.cpp
@@ -0,0 +1,57 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefStackWorkload.hpp"
+
+#include "RefWorkloadUtils.hpp"
+#include "Stack.hpp"
+
+#include <Profiling.hpp>
+
+namespace armnn
+{
+
+RefStackWorkload::RefStackWorkload(const StackQueueDescriptor& descriptor,
+                                   const WorkloadInfo& info)
+    : BaseWorkload(descriptor, info) // no state of its own: both arguments go straight to the base class
+{}
+
+void RefStackWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefStackWorkload_Execute");
+
+    // axis == 0 is a plain concatenation. Copy raw bytes rather than floats so that the
+    // QuantisedAsymm8/QuantisedSymm16 tensors are not read or written as 4-byte elements.
+    if (!m_Data.m_Parameters.m_Axis)
+    {
+        unsigned char* output = GetOutputTensorData<unsigned char>(0, m_Data);
+        BOOST_ASSERT(output != nullptr);
+        const unsigned int numInputs = m_Data.m_Parameters.m_NumInputs;
+        const unsigned int inputBytes = GetTensorInfo(m_Data.m_Inputs[0]).GetNumBytes();
+        for (unsigned int inputIdx=0; inputIdx<numInputs; ++inputIdx)
+        {
+            const unsigned char* input = GetInputTensorData<unsigned char>(inputIdx, m_Data);
+            for (unsigned int byteIdx=0; byteIdx<inputBytes; ++byteIdx)
+            {
+                output[(inputIdx * inputBytes) + byteIdx] = input[byteIdx];
+            }
+        }
+        return;
+    }
+
+    // General case: every input is decoded to float and Stack() writes through the encoder.
+    std::vector<std::unique_ptr<Decoder<float>>> inputDecoders;
+    for (unsigned int i=0; i<m_Data.m_Inputs.size(); ++i)
+    {
+        inputDecoders.push_back(MakeDecoder<float>(GetTensorInfo(m_Data.m_Inputs[i]),
+                                                   m_Data.m_Inputs[i]->Map()));
+    }
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(GetTensorInfo(m_Data.m_Outputs[0]),
+                                                                       m_Data.m_Outputs[0]->Map());
+
+    Stack(m_Data, inputDecoders, *outputEncoder);
+}
+
+} // namespace armnn
diff --git a/src/backends/reference/workloads/RefStackWorkload.hpp b/src/backends/reference/workloads/RefStackWorkload.hpp
new file mode 100644
index 0000000..ceb27d9
--- /dev/null
+++ b/src/backends/reference/workloads/RefStackWorkload.hpp
@@ -0,0 +1,22 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+
+namespace armnn
+{
+
+class RefStackWorkload : public BaseWorkload<StackQueueDescriptor> // Stack workload profiled under Compute::CpuRef
+{
+public:
+    explicit RefStackWorkload(const StackQueueDescriptor& descriptor,
+                              const WorkloadInfo& info);
+    virtual void Execute() const override; // stacks all inputs into output 0 along m_Parameters.m_Axis
+};
+
+} // namespace armnn
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index 4bdf05d..e86dccd 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -46,6 +46,7 @@
 #include "RefSplitterWorkload.hpp"
 #include "RefSoftmaxWorkload.hpp"
 #include "RefSpaceToBatchNdWorkload.hpp"
+#include "RefStackWorkload.hpp"
 #include "RefStridedSliceWorkload.hpp"
 #include "RefSpaceToDepthWorkload.hpp"
 #include "RefTransposeConvolution2dWorkload.hpp"
diff --git a/src/backends/reference/workloads/Stack.cpp b/src/backends/reference/workloads/Stack.cpp
new file mode 100644
index 0000000..386c899
--- /dev/null
+++ b/src/backends/reference/workloads/Stack.cpp
@@ -0,0 +1,115 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "Stack.hpp"
+#include "RefWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+// Stacks the input tensors read through `inputs` into `output` along data.m_Parameters.m_Axis.
+// Each input is consumed sequentially; the element at input coordinates (b, c, h, w) of tensor i is
+// written to the output coordinates obtained by inserting i at position `axis`.
+void Stack(const StackQueueDescriptor& data,
+           std::vector<std::unique_ptr<Decoder<float>>>& inputs,
+           Encoder<float>& output)
+{
+    const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[0]);
+    const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[0]);
+
+    unsigned int outputNumDims = outputInfo.GetNumDimensions();
+    unsigned int inputNumDims = inputInfo.GetNumDimensions();
+
+    const armnn::TensorShape& outputDims = outputInfo.GetShape();
+    const armnn::TensorShape& inputDims = inputInfo.GetShape();
+
+    unsigned int axis = data.m_Parameters.m_Axis;
+
+    // Only the first 4 input (5 output) dimensions are iterated; absent trailing
+    // dimensions are given an extent of 1 so the loops below still run once.
+    const unsigned int iNumTensors = static_cast<unsigned int>(data.m_Inputs.size());
+    const unsigned int iBatchSize  = inputDims[0];
+    const unsigned int iChannels   = (inputNumDims > 1) ? inputDims[1] : 1;
+    const unsigned int iHeight     = (inputNumDims > 2) ? inputDims[2] : 1;
+    const unsigned int iWidth      = (inputNumDims > 3) ? inputDims[3] : 1;
+
+    const unsigned int oBatchSize  = outputDims[1];
+    const unsigned int oChannels   = (outputNumDims > 2) ? outputDims[2] : 1;
+    const unsigned int oHeight     = (outputNumDims > 3) ? outputDims[3] : 1;
+    const unsigned int oWidth      = (outputNumDims > 4) ? outputDims[4] : 1;
+
+    // Array to store the input coordinates
+    // iCoordinates[0] = i, iCoordinates[1] = bi, iCoordinates[2] = ci
+    // iCoordinates[3] = hi, iCoordinates[4] = wi, iCoordinates[5] = 0
+    // iCoordinates[5] will be always zero and used for not incrementing
+    // the output when the input has less than 4 dimensions
+    std::array<unsigned int, 6> iCoordinates{ 0 };
+
+    // Array of pointers used to map the output coordinates to the input ones, in accordance with the axis
+    // This array is initialized with &iCoordinates[5] since this will be always zero
+    std::array<unsigned int *, 5> oCoordinates = { &iCoordinates[5],
+                                                   &iCoordinates[5],
+                                                   &iCoordinates[5],
+                                                   &iCoordinates[5],
+                                                   &iCoordinates[5] };
+
+    // Set the axis coordinate
+    oCoordinates[axis] = &iCoordinates[0];
+
+    // Map the output coordinates, accounting for the axis
+    unsigned int dim_shift = 0;
+    for(unsigned int dim = 0; dim < inputNumDims; ++dim)
+    {
+        if(dim == axis)
+        {
+            dim_shift++;
+        }
+        oCoordinates[dim + dim_shift] = &iCoordinates[dim + 1];
+    }
+
+    // Alias for the input coordinates
+    unsigned int &i  = iCoordinates[0];
+    unsigned int &bi = iCoordinates[1];
+    unsigned int &ci = iCoordinates[2];
+    unsigned int &hi = iCoordinates[3];
+    unsigned int &wi = iCoordinates[4];
+
+    // Alias for the output coordinates
+    unsigned int &o  = *(oCoordinates[0]);
+    unsigned int &bo = *(oCoordinates[1]);
+    unsigned int &co = *(oCoordinates[2]);
+    unsigned int &ho = *(oCoordinates[3]);
+    unsigned int &wo = *(oCoordinates[4]);
+
+    // Stack tensors
+    for(; i < iNumTensors; ++(i))
+    {
+        for(bi = 0; bi < iBatchSize; ++(bi))
+        {
+            for(ci = 0; ci < iChannels; ++(ci))
+            {
+                for(hi = 0; hi < iHeight; ++(hi))
+                {
+                    for(wi = 0; wi < iWidth; ++(wi))
+                    {
+                        // NOTE(review): this indexing presumably positions the encoder on the flattened
+                        // output offset so that Set() below writes there - confirm against Encoder.
+                        output[o  * oWidth * oHeight * oChannels * oBatchSize +
+                               bo * oWidth * oHeight * oChannels +
+                               co * oWidth * oHeight +
+                               ho * oWidth +
+                               wo];
+
+                        output.Set(inputs[i]->Get());
+
+                        ++(*(inputs[i]));
+                    }
+                }
+            }
+        }
+    }
+}
+
+} // namespace armnn
diff --git a/src/backends/reference/workloads/Stack.hpp b/src/backends/reference/workloads/Stack.hpp
new file mode 100644
index 0000000..cd86d41
--- /dev/null
+++ b/src/backends/reference/workloads/Stack.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "Encoders.hpp"
+#include "Decoders.hpp"
+
+#include <backendsCommon/WorkloadData.hpp>
+
+namespace armnn
+{
+
+void Stack (const StackQueueDescriptor&                   data,    // supplies the axis and tensor handles
+            std::vector<std::unique_ptr<Decoder<float>>>& inputs,  // one decoder per input tensor
+            Encoder<float>&                               output); // receives the stacked values
+
+} // namespace armnn