IVGCVSW-2105 - Unit tests for merger
 * Add LayerTests
 * Add WorkloadTests

!android-nn-driver:166

Change-Id: I903461002879f60fc9f8ae929f18784e2d9b1fc1
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index f5689e7..caa4f40 100755
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -1994,9 +1994,9 @@
 }
 
 //
-// Concatenation is only supported for N and C dimensions for NCHW. In case of
-// <4 dimensions we need to make sure that the concat dimensions are at least
-// the 3rd slowest iterating one.
+// Concatenation is only supported for N and C dimensions for NCHW and the innermost dimension.
+// In case of <4 dimensions, we need to make sure that the concat dimensions are at least
+// the 3rd slowest iterating one or the innermost dimension.
 //
 
 bool NeedPermuteForConcat(
@@ -2022,7 +2022,7 @@
         }
     }
 
-    return (nDimensions-concatDim) < 3;
+    return (nDimensions < 3 || (nDimensions == 3 && (nDimensions-concatDim) < 3 && (nDimensions-concatDim) != 1));
 }
 
 armnn::TensorShape ExpandTensorShapeTo3dForPermute(const armnn::TensorShape & inputShape)
@@ -2050,7 +2050,6 @@
 {
     BOOST_ASSERT_MSG(numDimensions <= 3,
        "Only dimensions 1,2 and 3 are supported by this helper");
-
     unsigned int expandedBy = 3 - numDimensions;
     unsigned int expandedConcatAxis = concatDim + expandedBy;
 
@@ -2110,6 +2109,7 @@
         {
             numDims = tensorInfo.GetShape().GetNumDimensions();
             Generate3dPermuteVectorForConcat(numDims, concatDim, permutations);
+
             // Store the reverese permutation.
             permuteVector = permutations.second;
             BOOST_ASSERT_MSG(!permuteVector.IsEqual(identity),
@@ -2191,7 +2191,8 @@
     std::initializer_list<T *> inputsOrig,
     const armnn::TensorInfo& outputTensorInfoOrig,
     T * output,
-    unsigned int concatDim)
+    unsigned int concatDim,
+    bool useSubtensor)
 {
     BOOST_ASSERT_MSG(output != nullptr, "output must not be null");
     if (output == nullptr)
@@ -2202,8 +2203,6 @@
         return;
     }
 
-    armnn::MergerQueueDescriptor queueDescriptor;
-
     // Saves a copy of the parameters which we might need to change.
     std::vector<armnn::TensorInfo> inputTensorInfos(inputTensorInfosOrig.begin(), inputTensorInfosOrig.end());
     std::vector<T *> inputs            = inputsOrig;
@@ -2234,35 +2233,51 @@
                                   outputTensorInfo);
     }
 
-    armnn::OriginsDescriptor viewsDescriptor = CreateMergerDescriptorForConcatenation(inputTensorInfos, concatDim);
-    queueDescriptor.m_Parameters = viewsDescriptor;
-
-    queueDescriptor.m_ViewOrigins.reserve(viewsDescriptor.GetNumViews());
-    for (unsigned int i = 0; i < viewsDescriptor.GetNumViews(); ++i)
-    {
-        queueDescriptor.m_ViewOrigins.emplace_back(std::vector<unsigned int>(viewsDescriptor.GetViewOrigin(i),
-            viewsDescriptor.GetViewOrigin(i) + viewsDescriptor.GetNumDimensions()));
-    }
-
-    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+    armnn::WorkloadInfo workloadInfo;
 
     std::vector<std::unique_ptr<armnn::ITensorHandle>> inputHandles;
     inputHandles.reserve(inputCount);
 
-    const bool subTensorsSupported = workloadFactory.SupportsSubTensors();
-    for (unsigned int i = 0; i < inputCount; ++i)
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::MergerQueueDescriptor queueDescriptor;
+    armnn::OriginsDescriptor viewsDescriptor = CreateMergerDescriptorForConcatenation(inputTensorInfos, concatDim);
+    queueDescriptor.m_Parameters = viewsDescriptor;
+
+    if (useSubtensor)
     {
-        const armnn::TensorInfo& inputTensorInfo = inputTensorInfos[i];
+        queueDescriptor.m_ViewOrigins.reserve(viewsDescriptor.GetNumViews());
+        for (unsigned int i = 0; i < viewsDescriptor.GetNumViews(); ++i)
+        {
+            queueDescriptor.m_ViewOrigins.emplace_back(std::vector<unsigned int>(viewsDescriptor.GetViewOrigin(i),
+                viewsDescriptor.GetViewOrigin(i) + viewsDescriptor.GetNumDimensions()));
+        }
 
-        std::unique_ptr<armnn::ITensorHandle> inputHandle = subTensorsSupported ?
-            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo.GetShape(),
-                queueDescriptor.m_ViewOrigins[i].m_Origin.data())
-            : workloadFactory.CreateTensorHandle(inputTensorInfo);
+        outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
 
-        inputHandles.emplace_back(std::move(inputHandle));
+        const bool subTensorsSupported = workloadFactory.SupportsSubTensors();
+        for (unsigned int i = 0; i < inputCount; ++i)
+        {
+            const armnn::TensorInfo& inputTensorInfo = inputTensorInfos[i];
+            std::unique_ptr<armnn::ITensorHandle> inputHandle =
+                subTensorsSupported ?
+                    workloadFactory.CreateSubTensorHandle(*outputHandle,
+                                                          inputTensorInfo.GetShape(),
+                                                          queueDescriptor.m_ViewOrigins[i].m_Origin.data()) :
+                    workloadFactory.CreateTensorHandle(inputTensorInfo);
+
+            inputHandles.emplace_back(std::move(inputHandle));
+        }
+
     }
-
-    armnn::WorkloadInfo workloadInfo;
+    else
+    {
+        for (unsigned int i = 0; i < inputCount; ++i)
+        {
+            std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfos[i]);
+            inputHandles.emplace_back(std::move(inputHandle));
+        }
+    }
 
     for (unsigned int i = 0; i < inputCount; ++i)
     {
@@ -2324,11 +2339,12 @@
     std::vector<T> output;
     output.resize(outputTensorInfo.GetNumElements());
     Concatenate<T>(workloadFactory, memoryManager,
-        { inputTensorInfo, inputTensorInfo, inputTensorInfo },
-        { input0.data(), input1.data(), input2.data() },
-        outputTensorInfo,
-        output.data(),
-        0);
+                   { inputTensorInfo, inputTensorInfo, inputTensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   0,
+                   true);
 
     result.output = MakeTensor<T, 1>(outputTensorInfo, output);
     result.outputExpected = MakeTensor<T, 1>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
@@ -2385,11 +2401,12 @@
     std::vector<T> output;
     output.resize(outputTensorInfo.GetNumElements());
     Concatenate<T>(workloadFactory, memoryManager,
-        { inputTensorInfo, inputTensorInfo, inputTensorInfo },
-        { input0.data(), input1.data(), input2.data() },
-        outputTensorInfo,
-        output.data(),
-        dimension);
+                   { inputTensorInfo, inputTensorInfo, inputTensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   true);
 
     result.output = MakeTensor<T, 2>(outputTensorInfo, output);
     return result;
@@ -2505,11 +2522,12 @@
     std::vector<T> output;
     output.resize(outputTensorInfo.GetNumElements());
     Concatenate<T>(workloadFactory, memoryManager,
-        { input0TensorInfo, input1TensorInfo, input2TensorInfo },
-        { input0.data(), input1.data(), input2.data() },
-        outputTensorInfo,
-        output.data(),
-        0);
+                   { input0TensorInfo, input1TensorInfo, input2TensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   0,
+                   true);
 
     result.output = MakeTensor<T, 2>(outputTensorInfo, output);
     result.outputExpected = MakeTensor<T, 2>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
@@ -2582,11 +2600,12 @@
     std::vector<T> output;
     output.resize(outputTensorInfo.GetNumElements());
     Concatenate<T>(workloadFactory, memoryManager,
-        { input0TensorInfo, input1TensorInfo, input2TensorInfo },
-        { input0.data(), input1.data(), input2.data() },
-        outputTensorInfo,
-        output.data(),
-        1);
+                   { input0TensorInfo, input1TensorInfo, input2TensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   1,
+                   true);
 
     result.output = MakeTensor<T, 2>(outputTensorInfo, output);
     result.outputExpected = MakeTensor<T, 2>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
@@ -2613,6 +2632,7 @@
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
     const armnn::TensorInfo& outputTensorInfo,
     unsigned int dimension,
+    bool useSubtensor,
     float qScale,
     int32_t qOffset)
 {
@@ -2683,11 +2703,12 @@
     std::vector<T> output;
     output.resize(outputTensorInfo.GetNumElements());
     Concatenate<T>(workloadFactory, memoryManager,
-        { inputTensorInfo, inputTensorInfo, inputTensorInfo },
-        { input0.data(), input1.data(), input2.data() },
-        outputTensorInfo,
-        output.data(),
-        dimension);
+                   { inputTensorInfo, inputTensorInfo, inputTensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   useSubtensor);
 
     result.output = MakeTensor<T, 3>(outputTensorInfo, output);
     return result;
@@ -2703,7 +2724,7 @@
     armnn::TensorInfo outputTensorInfo({ 6, 3, 2 }, armnn::GetDataType<T>());
 
     LayerTestResult<T, 3> result =
-        Concatenation3dTestImpl<T>(workloadFactory, memoryManager, outputTensorInfo, 0, qScale, qOffset);
+        Concatenation3dTestImpl<T>(workloadFactory, memoryManager, outputTensorInfo, 0, true, qScale, qOffset);
     result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
         // Batch 0, Channel 0
         1.0f, 2.0f,
@@ -2759,6 +2780,7 @@
         // Batch 5, Channel 2
         35.0f, 36.0f
     }));
+
     return result;
 }
 
@@ -2779,7 +2801,8 @@
     armnn::TensorInfo outputTensorInfo({ 2, 9, 2 }, armnn::GetDataType<T>());
 
     LayerTestResult<T, 3> result =
-        Concatenation3dTestImpl<T>(workloadFactory, memoryManager, outputTensorInfo, 1, qScale, qOffset);
+        Concatenation3dTestImpl<T>(workloadFactory, memoryManager, outputTensorInfo, 1, true, qScale, qOffset);
+
     result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
         // Batch 0, Channel 0
         1.0f, 2.0f,
@@ -2850,13 +2873,15 @@
 LayerTestResult<T, 3> Concatenation3dDim2TestImpl(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor,
     float qScale,
     int32_t qOffset)
 {
     armnn::TensorInfo outputTensorInfo({ 2, 3, 6 }, armnn::GetDataType<T>());
 
     LayerTestResult<T, 3> result =
-        Concatenation3dTestImpl<T>(workloadFactory, memoryManager, outputTensorInfo, 2, qScale, qOffset);
+        Concatenation3dTestImpl<T>(workloadFactory, memoryManager, outputTensorInfo, 2, useSubtensor, qScale, qOffset);
+
     result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
         // Batch 0, Channel 0
         1.0f, 2.0f, 7.0f, 8.0f, 13.0f, 14.0f,
@@ -2882,9 +2907,10 @@
 
 LayerTestResult<float, 3> Concatenation3dDim2Test(
     armnn::IWorkloadFactory& workloadFactory,
-    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
 {
-    return Concatenation3dDim2TestImpl<float>(workloadFactory, memoryManager, 0.0f, 0);
+    return Concatenation3dDim2TestImpl<float>(workloadFactory, memoryManager, useSubtensor, 0.0f, 0);
 }
 
 template <typename T>
@@ -2963,11 +2989,12 @@
     std::vector<T> output;
     output.resize(outputTensorInfo.GetNumElements());
     Concatenate<T>(workloadFactory, memoryManager,
-        { input0TensorInfo, input1TensorInfo, input2TensorInfo },
-        { input0.data(), input1.data(), input2.data() },
-        outputTensorInfo,
-        output.data(),
-        0);
+                   { input0TensorInfo, input1TensorInfo, input2TensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   0,
+                   true);
 
     result.output = MakeTensor<T, 3>(outputTensorInfo, output);
     result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
@@ -3106,11 +3133,12 @@
     std::vector<T> output;
     output.resize(outputTensorInfo.GetNumElements());
     Concatenate<T>(workloadFactory, memoryManager,
-        { input0TensorInfo, input1TensorInfo, input2TensorInfo },
-        { input0.data(), input1.data(), input2.data() },
-        outputTensorInfo,
-        output.data(),
-        1);
+                   { input0TensorInfo, input1TensorInfo, input2TensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   1,
+                   true);
 
     result.output = MakeTensor<T, 3>(outputTensorInfo, output);
     result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
@@ -3177,6 +3205,7 @@
 LayerTestResult<T, 3> Concatenation3dDim2DiffInputDimsTestImpl(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor,
     float qScale,
     int32_t qOffset)
 {
@@ -3249,11 +3278,12 @@
     std::vector<T> output;
     output.resize(outputTensorInfo.GetNumElements());
     Concatenate<T>(workloadFactory, memoryManager,
-        { input0TensorInfo, input1TensorInfo, input2TensorInfo },
-        { input0.data(), input1.data(), input2.data() },
-        outputTensorInfo,
-        output.data(),
-        2);
+                   { input0TensorInfo, input1TensorInfo, input2TensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   2,
+                   useSubtensor);
 
     result.output = MakeTensor<T, 3>(outputTensorInfo, output);
     result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
@@ -3281,9 +3311,547 @@
 
 LayerTestResult<float, 3> Concatenation3dDim2DiffInputDimsTest(
     armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concatenation3dDim2DiffInputDimsTestImpl<float>(workloadFactory, memoryManager, useSubtensor, 0.0f, 0);
+}
+
+template <typename T>
+LayerTestResult<T, 4> Concatenation4dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorInfo& outputTensorInfo,
+    unsigned int dimension,
+    bool useSubtensor,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo inputTensorInfo({ 1, 3, 2, 2 }, armnn::GetDataType<T>());
+
+    auto input0 = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f
+    }));
+
+    auto input1 = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f
+    }));
+
+    auto input2 = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+
+    Concatenate<T>(workloadFactory,
+                   memoryManager,
+                   {inputTensorInfo, inputTensorInfo, inputTensorInfo},
+                   {input0.data(), input1.data(), input2.data()},
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   useSubtensor);
+
+    result.output = MakeTensor<T, 4>(outputTensorInfo, output);
+    return result;
+}
+
+template <typename T>
+LayerTestResult<T, 4> Concatenation4dDim0TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 3, 3, 2, 2 }, armnn::GetDataType<T>());
+
+    LayerTestResult<T, 4> result = Concatenation4dTestImpl<T>(workloadFactory, memoryManager, outputTensorInfo, 0,
+                                                              true, qScale, qOffset);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+    return result;
+}
+
+LayerTestResult<float, 4> Concatenation4dDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
 {
-    return Concatenation3dDim2DiffInputDimsTestImpl<float>(workloadFactory, memoryManager, 0.0f, 0);
+    return Concatenation4dDim0TestImpl<float>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+template <typename T>
+LayerTestResult<T, 4> Concatenation4dDim1TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 1, 9, 2, 2 }, armnn::GetDataType<T>());
+
+    LayerTestResult<T, 4> result = Concatenation4dTestImpl<T>(workloadFactory, memoryManager, outputTensorInfo, 1,
+                                                              true, qScale, qOffset);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+
+    return result;
+}
+
+LayerTestResult<float, 4> Concatenation4dDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concatenation4dDim1TestImpl<float>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+template <typename T>
+LayerTestResult<T, 4> Concatenation4dDim2TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 1, 3, 6, 2 }, armnn::GetDataType<T>());
+
+    LayerTestResult<T, 4> result = Concatenation4dTestImpl<T>(workloadFactory, memoryManager, outputTensorInfo, 2,
+                                                              true, qScale, qOffset);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+
+    return result;
+}
+
+LayerTestResult<float, 4> Concatenation4dDim2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concatenation4dDim2TestImpl<float>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+template <typename T>
+LayerTestResult<T, 4> Concatenation4dDim3TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool useSubtensor)
+{
+    armnn::TensorInfo outputTensorInfo({ 1, 3, 2, 6 }, armnn::GetDataType<T>());
+
+    LayerTestResult<T, 4> result = Concatenation4dTestImpl<T>(workloadFactory, memoryManager, outputTensorInfo, 3,
+                                                              useSubtensor, qScale, qOffset);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        11.0f, 12.0f,
+        21.0f, 22.0f,
+        3.0f, 4.0f,
+        13.0f, 14.0f,
+        23.0f, 24.0f,
+
+        5.0f, 6.0f,
+        15.0f, 16.0f,
+        25.0f, 26.0f,
+        7.0f, 8.0f,
+        17.0f, 18.0f,
+        27.0f, 28.0f,
+
+        9.0f, 10.0f,
+        19.0f, 20.0f,
+        29.0f, 30.0f,
+        11.0f, 12.0f,
+        21.0f, 22.0f,
+        31.0f, 32.0f
+    }));
+
+    return result;
+}
+
+LayerTestResult<float, 4> Concatenation4dDim3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concatenation4dDim3TestImpl<float>(workloadFactory, memoryManager, 0.0f, 0, useSubtensor);
+}
+
+template <typename T>
+LayerTestResult<T, 4> Concatenation4dDiffShapeDim0TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    unsigned int dimension = 0;
+    armnn::TensorInfo inputTensorInfo0({ 1, 3, 2, 2 }, armnn::GetDataType<T>());
+
+    auto input0 = MakeTensor<T, 4>(inputTensorInfo0, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f
+    }));
+
+    armnn::TensorInfo inputTensorInfo1({ 2, 3, 2, 2 }, armnn::GetDataType<T>());
+
+    auto input1 = MakeTensor<T, 4>(inputTensorInfo1, QuantizedVector<T>(qScale, qOffset, {
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 3, 3, 2, 2 }, armnn::GetDataType<T>());
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory,
+                   memoryManager,
+                   {inputTensorInfo0, inputTensorInfo1},
+                   {input0.data(), input1.data()},
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   true);
+
+    result.output = MakeTensor<T, 4>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+
+    return result;
+}
+
+LayerTestResult<float, 4> Concatenation4dDiffShapeDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concatenation4dDiffShapeDim0TestImpl<float>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+template <typename T>
+LayerTestResult<T, 4> Concatenation4dDiffShapeDim1TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    unsigned int dimension = 1;
+    armnn::TensorInfo inputTensorInfo0({ 1, 3, 2, 2 }, armnn::GetDataType<T>());
+
+    auto input0 = MakeTensor<T, 4>(inputTensorInfo0, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f
+    }));
+
+    armnn::TensorInfo inputTensorInfo1({ 1, 2, 2, 2 }, armnn::GetDataType<T>());
+
+    auto input1 = MakeTensor<T, 4>(inputTensorInfo1, QuantizedVector<T>(qScale, qOffset, {
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 5, 2, 2 }, armnn::GetDataType<T>());
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory,
+                   memoryManager,
+                   {inputTensorInfo0, inputTensorInfo1},
+                   {input0.data(), input1.data()},
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   true);
+
+    result.output = MakeTensor<T, 4>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f
+    }));
+
+    return result;
+}
+
+LayerTestResult<float, 4> Concatenation4dDiffShapeDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concatenation4dDiffShapeDim1TestImpl<float>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+template <typename T>
+LayerTestResult<T, 4> Concatenation4dDiffShapeDim2TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    unsigned int dimension = 2;
+    armnn::TensorInfo inputTensorInfo0({ 1, 3, 2, 2 }, armnn::GetDataType<T>());
+
+    auto input0 = MakeTensor<T, 4>(inputTensorInfo0, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f
+    }));
+
+    armnn::TensorInfo inputTensorInfo1({ 1, 3, 3, 2 }, armnn::GetDataType<T>());
+
+    auto input1 = MakeTensor<T, 4>(inputTensorInfo1, QuantizedVector<T>(qScale, qOffset, {
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 3, 5, 2 }, armnn::GetDataType<T>());
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory,
+                   memoryManager,
+                   {inputTensorInfo0, inputTensorInfo1},
+                   {input0.data(), input1.data()},
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   true);
+
+    result.output = MakeTensor<T, 4>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f
+    }));
+
+    return result;
+}
+
+LayerTestResult<float, 4> Concatenation4dDiffShapeDim2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concatenation4dDiffShapeDim2TestImpl<float>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+template <typename T>
+LayerTestResult<T, 4> Concatenation4dDiffShapeDim3TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool useSubtensor)
+{
+    unsigned int dimension = 3;
+    armnn::TensorInfo inputTensorInfo0({ 1, 3, 2, 2 }, armnn::GetDataType<T>());
+
+    auto input0 = MakeTensor<T, 4>(inputTensorInfo0, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f
+    }));
+
+    armnn::TensorInfo inputTensorInfo1({ 1, 3, 2, 3 }, armnn::GetDataType<T>());
+
+    auto input1 = MakeTensor<T, 4>(inputTensorInfo1, QuantizedVector<T>(qScale, qOffset, {
+        11.0f, 12.0f, 13.0f,
+        14.0f, 15.0f, 16.0f,
+
+        17.0f, 18.0f, 19.0f,
+        20.0f, 21.0f, 22.0f,
+
+        23.0f, 24.0f, 25.0f,
+        26.0f, 27.0f, 28.0f
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 3, 2, 5 }, armnn::GetDataType<T>());
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory,
+                   memoryManager,
+                   {inputTensorInfo0, inputTensorInfo1},
+                   {input0.data(), input1.data()},
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   useSubtensor);
+
+    result.output = MakeTensor<T, 4>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f, 11.0f, 12.0f, 13.0f,
+        3.0f, 4.0f, 14.0f, 15.0f, 16.0f,
+        5.0f, 6.0f, 17.0f, 18.0f, 19.0f,
+        7.0f, 8.0f, 20.0f, 21.0f, 22.0f,
+        9.0f, 10.0f, 23.0f, 24.0f, 25.0f,
+        11.0f, 12.0f, 26.0f, 27.0f, 28.0f
+    }));
+
+    return result;
+}
+
+LayerTestResult<float, 4> Concatenation4dDiffShapeDim3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concatenation4dDiffShapeDim3TestImpl<float>(workloadFactory, memoryManager, 0.0f, 0, useSubtensor);
 }
 
 LayerTestResult<float, 4> ResizeBilinearNopTest(
@@ -5667,9 +6235,10 @@
 
 LayerTestResult<uint8_t, 3> Concatenation3dDim2Uint8Test(
     armnn::IWorkloadFactory& workloadFactory,
-    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
 {
-    return Concatenation3dDim2TestImpl<uint8_t>(workloadFactory, memoryManager, 0.5f, -1);
+    return Concatenation3dDim2TestImpl<uint8_t>(workloadFactory, memoryManager, useSubtensor, 0.5f, -1);
 }
 
 LayerTestResult<uint8_t, 3> Concatenation3dDim0DiffInputDimsUint8Test(
@@ -5688,9 +6257,67 @@
 
 LayerTestResult<uint8_t, 3> Concatenation3dDim2DiffInputDimsUint8Test(
     armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concatenation3dDim2DiffInputDimsTestImpl<uint8_t>(workloadFactory, memoryManager, useSubtensor, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concatenation4dDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
 {
-    return Concatenation3dDim2DiffInputDimsTestImpl<uint8_t>(workloadFactory, memoryManager, 0.5f, -1);
+    return Concatenation4dDim0TestImpl<uint8_t>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concatenation4dDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concatenation4dDim1TestImpl<uint8_t>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concatenation4dDim2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concatenation4dDim2TestImpl<uint8_t>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concatenation4dDim3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager, bool useSubtensor)
+{
+    return Concatenation4dDim3TestImpl<uint8_t>(workloadFactory, memoryManager, 0.5f, -1, useSubtensor);
+}
+
+LayerTestResult<uint8_t, 4> Concatenation4dDiffShapeDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concatenation4dDiffShapeDim0TestImpl<uint8_t>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concatenation4dDiffShapeDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concatenation4dDiffShapeDim1TestImpl<uint8_t>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concatenation4dDiffShapeDim2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concatenation4dDiffShapeDim2TestImpl<uint8_t>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concatenation4dDiffShapeDim3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concatenation4dDiffShapeDim3TestImpl<uint8_t>(workloadFactory, memoryManager, 0.5f, -1, useSubtensor);
 }
 
 LayerTestResult<float, 4> SimpleMaxPooling2dSize2x2Stride2x2Test(
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index 7734e5e..15d0853 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -366,7 +366,8 @@
 
 LayerTestResult<float, 3> Concatenation3dDim2Test(
     armnn::IWorkloadFactory& workloadFactory,
-    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
 
 LayerTestResult<float, 3> Concatenation3dDim0DiffInputDimsTest(
     armnn::IWorkloadFactory& workloadFactory,
@@ -378,8 +379,77 @@
 
 LayerTestResult<float, 3> Concatenation3dDim2DiffInputDimsTest(
     armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<float, 4> Concatenation4dDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
 
+LayerTestResult<float, 4> Concatenation4dDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concatenation4dDim2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concatenation4dDim3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<float, 4> Concatenation4dDiffShapeDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concatenation4dDiffShapeDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concatenation4dDiffShapeDim2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concatenation4dDiffShapeDim3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<uint8_t, 4> Concatenation4dDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concatenation4dDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concatenation4dDim2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concatenation4dDim3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<uint8_t, 4> Concatenation4dDiffShapeDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concatenation4dDiffShapeDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concatenation4dDiffShapeDim2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concatenation4dDiffShapeDim3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
 LayerTestResult<uint8_t, 4> SimpleSigmoidUint8Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
@@ -781,7 +851,8 @@
 
 LayerTestResult<uint8_t, 3> Concatenation3dDim2Uint8Test(
     armnn::IWorkloadFactory& workloadFactory,
-    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
 
 LayerTestResult<uint8_t, 3> Concatenation3dDim0DiffInputDimsUint8Test(
     armnn::IWorkloadFactory& workloadFactory,
@@ -793,7 +864,8 @@
 
 LayerTestResult<uint8_t, 3> Concatenation3dDim2DiffInputDimsUint8Test(
     armnn::IWorkloadFactory& workloadFactory,
-    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
 
 LayerTestResult<float, 2> FullyConnectedLargeTest(
     armnn::IWorkloadFactory& workloadFactory,
diff --git a/src/backends/cl/test/ClCreateWorkloadTests.cpp b/src/backends/cl/test/ClCreateWorkloadTests.cpp
index b243ca8..a7fa9dc 100644
--- a/src/backends/cl/test/ClCreateWorkloadTests.cpp
+++ b/src/backends/cl/test/ClCreateWorkloadTests.cpp
@@ -803,4 +803,39 @@
     ClMeanWorkloadTest<ClMeanWorkload, armnn::DataType::QuantisedAsymm8>();
 }
 
+template <typename MergerWorkloadType, armnn::DataType DataType>
+static void ClCreateMergerWorkloadTest(std::initializer_list<unsigned int> outputShape,
+                                       unsigned int concatAxis)
+{
+    Graph graph;
+    ClWorkloadFactory factory =
+        ClWorkloadFactoryHelper::GetFactory(ClWorkloadFactoryHelper::GetMemoryManager());
+
+    auto workload = CreateMergerWorkloadTest<MergerWorkloadType, DataType>(factory, graph, outputShape, concatAxis);
+
+    MergerQueueDescriptor queueDescriptor = workload->GetData();
+    auto inputHandle0  = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto inputHandle1  = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[1]);
+    auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
+
+    BOOST_TEST(CompareIClTensorHandleShape(inputHandle0, { 2, 3, 2, 5 }));
+    BOOST_TEST(CompareIClTensorHandleShape(inputHandle1, { 2, 3, 2, 5 }));
+    BOOST_TEST(CompareIClTensorHandleShape(outputHandle, outputShape));
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim0Float32Workload)
+{
+    ClCreateMergerWorkloadTest<ClMergerWorkload, armnn::DataType::Float32>({ 4, 3, 2, 5 }, 0);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim1Float32Workload)
+{
+    ClCreateMergerWorkloadTest<ClMergerWorkload, armnn::DataType::Float32>({ 2, 6, 2, 5 }, 1);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim3Float32Workload)
+{
+    ClCreateMergerWorkloadTest<ClMergerWorkload, armnn::DataType::Float32>({ 2, 3, 2, 10 }, 3);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp
index 2559fb4..06359b7 100755
--- a/src/backends/cl/test/ClLayerTests.cpp
+++ b/src/backends/cl/test/ClLayerTests.cpp
@@ -254,31 +254,25 @@
 
 // Concat
 ARMNN_AUTO_TEST_CASE(Concatenation1d, Concatenation1dTest)
-ARMNN_AUTO_TEST_CASE(Concatenation1dUint8, Concatenation1dUint8Test)
 
 ARMNN_AUTO_TEST_CASE(Concatenation2dDim0, Concatenation2dDim0Test)
-ARMNN_AUTO_TEST_CASE(Concatenation2dDim0Uint8, Concatenation2dDim0Uint8Test)
 ARMNN_AUTO_TEST_CASE(Concatenation2dDim1, Concatenation2dDim1Test)
-ARMNN_AUTO_TEST_CASE(Concatenation2dDim1Uint8, Concatenation2dDim1Uint8Test)
-
 ARMNN_AUTO_TEST_CASE(Concatenation2dDim0DiffInputDims, Concatenation2dDim0DiffInputDimsTest)
-ARMNN_AUTO_TEST_CASE(Concatenation2dDim0DiffInputDimsUint8, Concatenation2dDim0DiffInputDimsUint8Test)
 ARMNN_AUTO_TEST_CASE(Concatenation2dDim1DiffInputDims, Concatenation2dDim1DiffInputDimsTest)
-ARMNN_AUTO_TEST_CASE(Concatenation2dDim1DiffInputDimsUint8, Concatenation2dDim1DiffInputDimsUint8Test)
 
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim0, Concatenation3dDim0Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim0Uint8, Concatenation3dDim0Uint8Test)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim1, Concatenation3dDim1Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim1Uint8, Concatenation3dDim1Uint8Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2, Concatenation3dDim2Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2Uint8, Concatenation3dDim2Uint8Test)
-
+ARMNN_AUTO_TEST_CASE(Concatenation3dDim2, Concatenation3dDim2Test, false)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim0DiffInputDims, Concatenation3dDim0DiffInputDimsTest)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim0DiffInputDimsUint8, Concatenation3dDim0DiffInputDimsUint8Test)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim1DiffInputDims, Concatenation3dDim1DiffInputDimsTest)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim1DiffInputDimsUint8, Concatenation3dDim1DiffInputDimsUint8Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDims, Concatenation3dDim2DiffInputDimsTest)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDimsUint8, Concatenation3dDim2DiffInputDimsUint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDims, Concatenation3dDim2DiffInputDimsTest, false)
+
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim0, Concatenation4dDim0Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim1, Concatenation4dDim1Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim3, Concatenation4dDim3Test, false)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim0, Concatenation4dDiffShapeDim0Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim1, Concatenation4dDiffShapeDim1Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim3, Concatenation4dDiffShapeDim3Test, false)
 
 // Floor
 ARMNN_AUTO_TEST_CASE(SimpleFloor, SimpleFloorTest)
diff --git a/src/backends/neon/test/NeonCreateWorkloadTests.cpp b/src/backends/neon/test/NeonCreateWorkloadTests.cpp
index 61160e2..1fd56ca 100644
--- a/src/backends/neon/test/NeonCreateWorkloadTests.cpp
+++ b/src/backends/neon/test/NeonCreateWorkloadTests.cpp
@@ -616,4 +616,39 @@
     NeonCreateL2NormalizationWorkloadTest<NeonL2NormalizationFloatWorkload, DataType::Float32>(DataLayout::NHWC);
 }
 
+template <typename MergerWorkloadType, armnn::DataType DataType>
+static void NeonCreateMergerWorkloadTest(std::initializer_list<unsigned int> outputShape,
+                                         unsigned int concatAxis)
+{
+    Graph graph;
+    NeonWorkloadFactory factory =
+        NeonWorkloadFactoryHelper::GetFactory(NeonWorkloadFactoryHelper::GetMemoryManager());
+
+    auto workload = CreateMergerWorkloadTest<MergerWorkloadType, DataType>(factory, graph, outputShape, concatAxis);
+
+    MergerQueueDescriptor queueDescriptor = workload->GetData();
+    auto inputHandle0 = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto inputHandle1 = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[1]);
+    auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
+
+    BOOST_TEST(TestNeonTensorHandleInfo(inputHandle0, TensorInfo({ 2, 3, 2, 5 }, DataType)));
+    BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({ 2, 3, 2, 5 }, DataType)));
+    BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo(outputShape, DataType)));
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim0Float32Workload)
+{
+    NeonCreateMergerWorkloadTest<NeonMergerWorkload, armnn::DataType::Float32>({ 4, 3, 2, 5 }, 0);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim1Float32Workload)
+{
+    NeonCreateMergerWorkloadTest<NeonMergerWorkload, armnn::DataType::Float32>({ 2, 6, 2, 5 }, 1);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim3Float32Workload)
+{
+    NeonCreateMergerWorkloadTest<NeonMergerWorkload, armnn::DataType::Float32>({ 2, 3, 2, 10 }, 3);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index f0410f2..9a6c71b 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -356,16 +356,29 @@
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim0Uint8, Concatenation3dDim0Uint8Test)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim1, Concatenation3dDim1Test)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim1Uint8, Concatenation3dDim1Uint8Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2, Concatenation3dDim2Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2Uint8, Concatenation3dDim2Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation3dDim2, Concatenation3dDim2Test, false)
+ARMNN_AUTO_TEST_CASE(Concatenation3dDim2Uint8, Concatenation3dDim2Uint8Test, false)
 
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim0DiffInputDims, Concatenation3dDim0DiffInputDimsTest)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim0DiffInputDimsUint8, Concatenation3dDim0DiffInputDimsUint8Test)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim1DiffInputDims, Concatenation3dDim1DiffInputDimsTest)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim1DiffInputDimsUint8, Concatenation3dDim1DiffInputDimsUint8Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDims, Concatenation3dDim2DiffInputDimsTest)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDimsUint8, Concatenation3dDim2DiffInputDimsUint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDims, Concatenation3dDim2DiffInputDimsTest, false)
+ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDimsUint8, Concatenation3dDim2DiffInputDimsUint8Test, false)
 
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim0, Concatenation4dDim0Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim1, Concatenation4dDim1Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim3, Concatenation4dDim3Test, false)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim0Uint8, Concatenation4dDim0Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim1Uint8, Concatenation4dDim1Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim3Uint8, Concatenation4dDim3Uint8Test, false)
+
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim0, Concatenation4dDiffShapeDim0Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim1, Concatenation4dDiffShapeDim1Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim3, Concatenation4dDiffShapeDim3Test, false)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim0Uint8, Concatenation4dDiffShapeDim0Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim1Uint8, Concatenation4dDiffShapeDim1Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim3Uint8, Concatenation4dDiffShapeDim3Uint8Test, false)
 // L2 Normalization
 ARMNN_AUTO_TEST_CASE(L2Normalization1d, L2Normalization1dTest, armnn::DataLayout::NCHW)
 ARMNN_AUTO_TEST_CASE(L2Normalization2d, L2Normalization2dTest, armnn::DataLayout::NCHW)
diff --git a/src/backends/reference/test/RefCreateWorkloadTests.cpp b/src/backends/reference/test/RefCreateWorkloadTests.cpp
index d03fe5c..47f9d0e 100644
--- a/src/backends/reference/test/RefCreateWorkloadTests.cpp
+++ b/src/backends/reference/test/RefCreateWorkloadTests.cpp
@@ -609,4 +609,58 @@
     RefCreateReshapeWorkloadTest<RefReshapeUint8Workload, armnn::DataType::QuantisedAsymm8>();
 }
 
+template <typename MergerWorkloadType, armnn::DataType DataType>
+static void RefCreateMergerWorkloadTest(const armnn::TensorShape& outputShape,
+                                        unsigned int concatAxis)
+{
+    Graph graph;
+    RefWorkloadFactory factory;
+    auto workload = CreateMergerWorkloadTest<MergerWorkloadType, DataType>(factory, graph, outputShape, concatAxis);
+
+    CheckInputsOutput(std::move(workload),
+                      TensorInfo({ 2, 3, 2, 5 }, DataType),
+                      TensorInfo({ 2, 3, 2, 5 }, DataType),
+                      TensorInfo(outputShape, DataType));
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim0Float32Workload)
+{
+    RefCreateMergerWorkloadTest<RefMergerFloat32Workload, armnn::DataType::Float32>({ 4, 3, 2, 5 }, 0);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim0Uint8Workload)
+{
+    RefCreateMergerWorkloadTest<RefMergerUint8Workload, armnn::DataType::QuantisedAsymm8>({ 4, 3, 2, 5 }, 0);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim1Float32Workload)
+{
+    RefCreateMergerWorkloadTest<RefMergerFloat32Workload, armnn::DataType::Float32>({ 2, 6, 2, 5 }, 1);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim1Uint8Workload)
+{
+    RefCreateMergerWorkloadTest<RefMergerUint8Workload, armnn::DataType::QuantisedAsymm8>({ 2, 6, 2, 5 }, 1);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim2Float32Workload)
+{
+    RefCreateMergerWorkloadTest<RefMergerFloat32Workload, armnn::DataType::Float32>({ 2, 3, 4, 5 }, 2);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim2Uint8Workload)
+{
+    RefCreateMergerWorkloadTest<RefMergerUint8Workload, armnn::DataType::QuantisedAsymm8>({ 2, 3, 4, 5 }, 2);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim3Float32Workload)
+{
+    RefCreateMergerWorkloadTest<RefMergerFloat32Workload, armnn::DataType::Float32>({ 2, 3, 2, 10 }, 3);
+}
+
+BOOST_AUTO_TEST_CASE(CreateMergerDim3Uint8Workload)
+{
+    RefCreateMergerWorkloadTest<RefMergerUint8Workload, armnn::DataType::QuantisedAsymm8>({ 2, 3, 2, 10 }, 3);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 35981ea..aba9f3e 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -289,7 +289,6 @@
 ARMNN_AUTO_TEST_CASE(PadUint83d, PadUint83dTest)
 ARMNN_AUTO_TEST_CASE(PadUint84d, PadUint84dTest)
 
-
 // Constant
 ARMNN_AUTO_TEST_CASE(Constant, ConstantTest)
 ARMNN_AUTO_TEST_CASE(ConstantUint8, ConstantUint8Test)
@@ -312,15 +311,33 @@
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim0Uint8, Concatenation3dDim0Uint8Test)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim1, Concatenation3dDim1Test)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim1Uint8, Concatenation3dDim1Uint8Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2, Concatenation3dDim2Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2Uint8, Concatenation3dDim2Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation3dDim2, Concatenation3dDim2Test, true)
+ARMNN_AUTO_TEST_CASE(Concatenation3dDim2Uint8, Concatenation3dDim2Uint8Test, true)
 
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim0DiffInputDims, Concatenation3dDim0DiffInputDimsTest)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim0DiffInputDimsUint8, Concatenation3dDim0DiffInputDimsUint8Test)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim1DiffInputDims, Concatenation3dDim1DiffInputDimsTest)
 ARMNN_AUTO_TEST_CASE(Concatenation3dDim1DiffInputDimsUint8, Concatenation3dDim1DiffInputDimsUint8Test)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDims, Concatenation3dDim2DiffInputDimsTest)
-ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDimsUint8, Concatenation3dDim2DiffInputDimsUint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDims, Concatenation3dDim2DiffInputDimsTest, true)
+ARMNN_AUTO_TEST_CASE(Concatenation3dDim2DiffInputDimsUint8, Concatenation3dDim2DiffInputDimsUint8Test, true)
+
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim0, Concatenation4dDim0Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim1, Concatenation4dDim1Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim2, Concatenation4dDim2Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim3, Concatenation4dDim3Test, true)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim0Uint8, Concatenation4dDim0Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim1Uint8, Concatenation4dDim1Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim2Uint8, Concatenation4dDim2Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDim3Uint8, Concatenation4dDim3Uint8Test, true)
+
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim0, Concatenation4dDiffShapeDim0Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim1, Concatenation4dDiffShapeDim1Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim2, Concatenation4dDiffShapeDim2Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim3, Concatenation4dDiffShapeDim3Test, true)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim0Uint8, Concatenation4dDiffShapeDim0Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim1Uint8, Concatenation4dDiffShapeDim1Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim2Uint8, Concatenation4dDiffShapeDim2Uint8Test)
+ARMNN_AUTO_TEST_CASE(Concatenation4dDiffShapeDim3Uint8, Concatenation4dDiffShapeDim3Uint8Test, true)
 
 // Floor
 ARMNN_AUTO_TEST_CASE(SimpleFloor, SimpleFloorTest)