IVGCVSW-3381 Break up LayerTests.hpp into more manageable files

Signed-off-by: Aron Virginas-Tar <Aron.Virginas-Tar@arm.com>
Change-Id: Icf39434f09fd340ad664cb3b97b8bee6d9da4838
diff --git a/src/backends/backendsCommon/test/layerTests/ActivationTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ActivationTestImpl.cpp
new file mode 100644
index 0000000..c05dfd6
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ActivationTestImpl.cpp
@@ -0,0 +1,1084 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ActivationTestImpl.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/ActivationFixture.hpp>
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <boost/multi_array.hpp>
+
+#include <algorithm>
+#include <cmath>
+
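+// Runs a single BoundedReLu activation workload over the given input and returns the
+// actual output alongside the expected output supplied by the caller.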
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BoundedReLuTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float upperBound,
+    float lowerBound,
+    float inputScale,
+    int32_t inputOffset,
+    float outputScale,
+    int32_t outputOffset,
+    const std::vector<T>& inputData,
+    const std::vector<T>& outputExpectedData,
+    unsigned int inputWidth,
+    unsigned int inputHeight,
+    unsigned int inputChannels,
+    unsigned int inputBatchSize)
+{
+    unsigned int outputWidth = inputWidth;
+    unsigned int outputHeight = inputHeight;
+    unsigned int outputChannels = inputChannels;
+    unsigned int outputBatchSize = inputBatchSize;
+
+    armnn::TensorInfo inputTensorInfo({ inputBatchSize, inputChannels, inputHeight, inputWidth }, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo({ outputBatchSize, outputChannels, outputHeight, outputWidth }, ArmnnType);
+
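+    // For quantized types, stored values map onto real values as: real = scale * (quantized - offset).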
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(inputScale);
+        inputTensorInfo.SetQuantizationOffset(inputOffset);
+
+        outputTensorInfo.SetQuantizationScale(outputScale);
+        outputTensorInfo.SetQuantizationOffset(outputOffset);
+    }
+
+    LayerTestResult<T, 4> result(inputTensorInfo);
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    // Set up bounded ReLu.
+    armnn::ActivationQueueDescriptor descriptor;
+    armnn::WorkloadInfo workloadInfo;
+    AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, workloadInfo, outputTensorInfo, outputHandle.get());
+
+    descriptor.m_Parameters.m_Function = armnn::ActivationFunction::BoundedReLu;
+    descriptor.m_Parameters.m_A = upperBound;
+    descriptor.m_Parameters.m_B = lowerBound;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateActivation(descriptor, workloadInfo);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputExpectedData);
+
+    return result;
+}
+
+LayerTestResult<float, 4> BoundedReLuUpperAndLowerBoundTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    unsigned int inputWidth = 4u;
+    unsigned int inputHeight = 5u;
+    unsigned int inputChannels = 1u;
+    unsigned int inputBatchSize = 1;
+
+    std::vector<float> input = std::vector<float>{
+      -2.0f,       0.1f,     0.5f,     1.25f,
+     0.786f,    0.9875f,    -1.5f,    0.384f,
+    1.0001f,       3.5f,     7.5f,    0.896f,
+     2.126f,       2.0f,     0.3f,     0.15f,
+     0.999f,       1.2f,    0.89f,      6.1f,
+    };
+
+    // Calculated manually.
+    std::vector<float> output = std::vector<float>{
+      -1.0f,       0.1f,     0.5f,      1.0f,
+     0.786f,    0.9875f,    -1.0f,    0.384f,
+       1.0f,       1.0f,     1.0f,    0.896f,
+       1.0f,       1.0f,     0.3f,     0.15f,
+     0.999f,       1.0f,    0.89f,      1.0f,
+    };
+
+    return BoundedReLuTestCommon<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 1.0f, -1.0f, 1.0f, 0, 1.0f, 0, input, output,
+        inputWidth, inputHeight, inputChannels, inputBatchSize);
+}
+
+LayerTestResult<float, 4> BoundedReLuUpperBoundOnlyTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    unsigned int inputWidth = 4u;
+    unsigned int inputHeight = 5u;
+    unsigned int inputChannels = 1u;
+    unsigned int inputBatchSize = 1;
+
+    std::vector<float> input = std::vector<float>{
+      -1.0f,       0.1f,     0.5f,      6.25f,
+     0.786f,    5.9875f,    -0.5f,     0.384f,
+    6.0001f,       3.5f,     7.5f,     0.896f,
+     2.126f,      12.0f,     0.3f,      0.15f,
+     0.999f,       1.2f,    0.89f,       6.1f,
+    };
+
+    // Calculated manually.
+    std::vector<float> output = std::vector<float>{
+       0.0f,       0.1f,     0.5f,       6.0f,
+     0.786f,    5.9875f,     0.0f,     0.384f,
+       6.0f,       3.5f,     6.0f,     0.896f,
+     2.126f,       6.0f,     0.3f,      0.15f,
+     0.999f,       1.2f,    0.89f,       6.0f,
+    };
+
+    return BoundedReLuTestCommon<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 6.0f, 0.0f, 1.0f, 0, 1.0f, 0, input, output,
+        inputWidth, inputHeight, inputChannels, inputBatchSize);
+}
+
+LayerTestResult<uint8_t, 4> BoundedReLuUint8UpperBoundOnlyTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    unsigned int inputWidth     = 3u;
+    unsigned int inputHeight    = 2u;
+    unsigned int inputChannels  = 1u;
+    unsigned int inputBatchSize = 1;
+
+    std::vector<uint8_t> input = std::vector<uint8_t>{
+         51, 124, 28,
+        251,   8, 92
+    };
+
+    // Calculated manually.
+    std::vector<uint8_t> output = std::vector<uint8_t>{
+          0, 122,  0,
+        255,   0, 58
+    };
+
+    float inputScale     = 12.0f / 255.0f;
+    int32_t inputOffset  = 63;
+    float outputScale    = 6.0f / 255.0f;
+    int32_t outputOffset = 0;
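+
+    // Sanity check on the quantization arithmetic (real = scale * (quantized - offset)):
+    // input 51 dequantizes to (12/255) * (51 - 63) ~= -0.56, which the bounded ReLu
+    // clamps to 0.0, i.e. quantized output 0.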
+
+    return BoundedReLuTestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 6.0f, 0.0f,
+        inputScale, inputOffset, outputScale, outputOffset,
+        input, output, inputWidth, inputHeight, inputChannels, inputBatchSize);
+}
+
+LayerTestResult<uint8_t, 4> BoundedReLuUint8UpperAndLowerBoundTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    unsigned int inputWidth     = 3u;
+    unsigned int inputHeight    = 2u;
+    unsigned int inputChannels  = 1u;
+    unsigned int inputBatchSize = 1;
+
+    std::vector<uint8_t> input = std::vector<uint8_t>{
+         51, 230, 28,
+        251,   8, 92
+    };
+
+    // Calculated manually.
+    std::vector<uint8_t> output = std::vector<uint8_t>{
+         51, 192, 32,
+        192,  32, 92
+    };
+
+    int32_t inputOffset = 112;
+    float inputScale    = 0.0125f;
+
+    return BoundedReLuTestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 1.0f, -1.0f,
+        inputScale, inputOffset, inputScale, inputOffset, // Input/output scale & offset same.
+        input, output, inputWidth, inputHeight, inputChannels, inputBatchSize);
+}
+
+namespace
+{
+
+struct BoundedReLuRandomInputTestTraits
+{
+    constexpr static unsigned int inputHeight = 31u;
+    constexpr static unsigned int inputWidth = 19u;
+    constexpr static unsigned int inputChannels = 4u;
+    constexpr static unsigned int inputBatchSize = 2;
+
+    constexpr static unsigned int outputHeight = inputHeight;
+    constexpr static unsigned int outputWidth = inputWidth;
+    constexpr static unsigned int outputChannels = inputChannels;
+    constexpr static unsigned int outputBatchSize = inputBatchSize;
+
+    static armnn::TensorInfo GetInputTensorInfo()
+    {
+        return armnn::TensorInfo({ inputBatchSize, inputChannels, inputHeight, inputWidth },
+            armnn::DataType::Float32);
+    }
+
+    static armnn::TensorInfo GetOutputTensorInfo()
+    {
+        return armnn::TensorInfo({ outputBatchSize, outputChannels, outputHeight, outputWidth },
+            armnn::DataType::Float32);
+    }
+};
+
+boost::multi_array<float, 4> BoundedReLuRandomInputTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float lowerBound,
+    float upperBound,
+    const armnn::ActivationDescriptor& activationDescriptor)
+{
+    const armnn::TensorInfo inputTensorInfo = BoundedReLuRandomInputTestTraits::GetInputTensorInfo();
+    const armnn::TensorInfo outputTensorInfo = BoundedReLuRandomInputTestTraits::GetOutputTensorInfo();
+
+    boost::multi_array<float, 4> output(GetTensorShapeAsArray<4>(outputTensorInfo));
+
+    // Min/max random values passed to MakeRandomTensor are purposely outside of the ReLu
+    // range [lowerBound, upperBound].
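+    // The fixed seed keeps the generated input, and hence the test, deterministic.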
+    auto input = MakeRandomTensor<float, 4>(inputTensorInfo, 4605828, lowerBound - 5.0f, upperBound * 2.0f);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    // Set up bounded ReLu.
+    armnn::ActivationQueueDescriptor descriptor;
+    armnn::WorkloadInfo workloadInfo;
+    AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, workloadInfo, outputTensorInfo, outputHandle.get());
+    descriptor.m_Parameters = activationDescriptor;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateActivation(descriptor, workloadInfo);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&output[0][0][0][0], outputHandle.get());
+
+    return output;
+}
+
+} // namespace
+
+LayerTestResult<float, 4> CompareBoundedReLuTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    float upperBound,
+    float lowerBound)
+{
+    LayerTestResult<float, 4> result(BoundedReLuRandomInputTestTraits::GetOutputTensorInfo());
+
+    armnn::ActivationDescriptor activationDescriptor;
+    activationDescriptor.m_Function = armnn::ActivationFunction::BoundedReLu;
+    activationDescriptor.m_A = upperBound;
+    activationDescriptor.m_B = lowerBound;
+
+    result.output = BoundedReLuRandomInputTest(
+        workloadFactory, memoryManager, 0.0f, upperBound, activationDescriptor);
+    result.outputExpected = BoundedReLuRandomInputTest(
+        refWorkloadFactory, nullptr, 0.0f, upperBound, activationDescriptor);
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T,4> ConstantLinearActivationTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 0.0f,
+    int32_t qOffset = 0)
+{
+    unsigned int inputHeight    = 20;
+    unsigned int inputWidth     = 17;
+    unsigned int inputChannels  = 3;
+    unsigned int batchSize      = 5;
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int shape[]  = {batchSize, inputChannels, inputHeight, inputWidth};
+
+    inputTensorInfo = armnn::TensorInfo(4, shape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, shape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    // Do linear activation that should leave the tensor unchanged.
+    armnn::ActivationQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Parameters.m_A = 1.0f;
+    data.m_Parameters.m_B = 0.0f;
+    data.m_Parameters.m_Function = armnn::ActivationFunction::Linear;
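+    // With m_A = 1 and m_B = 0, the linear activation y = m_A * x + m_B reduces to the identity.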
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateActivation(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    boost::multi_array<T, 4> input = MakeRandomTensor<T, 4>(inputTensorInfo, 7123561);
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    // Ensure output equals input.
+    ret.outputExpected = input;
+
+    return ret;
+}
+
+LayerTestResult<float, 4> ConstantLinearActivationTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ConstantLinearActivationTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> ConstantLinearActivationUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ConstantLinearActivationTestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 4.0f, 3);
+}
+
+LayerTestResult<int16_t, 4> ConstantLinearActivationInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ConstantLinearActivationTestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager, 0.1f, 0);
+}
+
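+// Generic helper used by the activation tests below: runs a single activation workload
+// on a fixed 1x1x1x16 (NCHW) input and compares it against the supplied expected data.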
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleActivationTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::ActivationFunction activationFunction,
+    float activationParameterA,
+    float activationParameterB,
+    float scale,
+    int32_t offset,
+    const std::vector<float>& inputData,
+    float outScale,
+    int32_t outOffset,
+    const std::vector<float>& outputExpectedData)
+{
+    constexpr static unsigned int inputWidth = 16u;
+    constexpr static unsigned int inputHeight = 1u;
+    constexpr static unsigned int inputChannels = 1u;
+    constexpr static unsigned int inputBatchSize = 1u;
+
+    constexpr static unsigned int outputWidth = inputWidth;
+    constexpr static unsigned int outputHeight = inputHeight;
+    constexpr static unsigned int outputChannels = inputChannels;
+    constexpr static unsigned int outputBatchSize = inputBatchSize;
+
+    armnn::TensorInfo inputTensorInfo({ inputBatchSize, inputChannels, inputHeight, inputWidth }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ outputBatchSize, outputChannels, outputHeight, outputWidth }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(scale);
+        inputTensorInfo.SetQuantizationOffset(offset);
+        outputTensorInfo.SetQuantizationScale(outScale);
+        outputTensorInfo.SetQuantizationOffset(outOffset);
+    }
+
+    LayerTestResult<T, 4> result(inputTensorInfo);
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(scale, offset, inputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    // Set up the activation descriptor.
+    armnn::ActivationQueueDescriptor descriptor;
+    armnn::WorkloadInfo workloadInfo;
+    AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, workloadInfo, outputTensorInfo, outputHandle.get());
+
+    descriptor.m_Parameters.m_Function = activationFunction;
+    descriptor.m_Parameters.m_A = activationParameterA;
+    descriptor.m_Parameters.m_B = activationParameterB;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateActivation(descriptor, workloadInfo);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+
+    // Quantize the caller-supplied expected output data.
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(outScale, outOffset,
+                                                                                  outputExpectedData));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleSigmoidTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    std::vector<float> inputData =
+    {
+        -0.1f, -0.2f, -0.3f, -0.4f,
+        0.1f,  0.2f,  0.3f,  0.4f,
+        -1.0f, -2.0f, -3.0f, -4.0f,
+        1.0f,  2.0f,  3.0f,  4.0f
+    };
+
+    // Calculate output values for input.
+    auto f = [](float value)
+    {
+        return 1.0f / (1.0f + std::exp(-value));
+    };
+    std::vector<float> outputExpectedData(inputData.size());
+    std::transform(inputData.begin(), inputData.end(), outputExpectedData.begin(), f);
+
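+    // Sigmoid outputs lie in (0, 1), so a fixed output scale of 1/256 with zero offset
+    // covers the full output range when the requested type is quantized.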
+    return SimpleActivationTest<ArmnnType>(workloadFactory,
+                                           memoryManager,
+                                           armnn::ActivationFunction::Sigmoid,
+                                           0.f,
+                                           0.f,
+                                           qScale,
+                                           qOffset,
+                                           inputData,
+                                           1.f / 256.f,
+                                           0,
+                                           outputExpectedData);
+}
+
+LayerTestResult<float, 4> SimpleSigmoidTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SimpleSigmoidTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<uint8_t, 4> SimpleSigmoidUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SimpleSigmoidTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.1f, 50);
+}
+
+LayerTestResult<int16_t, 4> SimpleSigmoidInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SimpleSigmoidTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ReLuTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float qScale,
+        int32_t qOffset)
+{
+    std::vector<float> inputData = {
+            -0.1f, -0.2f, -0.3f, -0.4f,
+            0.1f,  0.2f,  0.3f,  0.4f,
+            -1.0f, -2.0f, -3.0f, -4.0f,
+            1.0f,  2.0f,  3.0f,  4.0f
+    };
+
+    // Calculate output values for input.
+    auto f = [](float value)
+    {
+        return std::fmax(0.0f, value);
+    };
+    std::vector<float> outputExpectedData(inputData.size());
+    std::transform(inputData.begin(), inputData.end(), outputExpectedData.begin(), f);
+
+    return SimpleActivationTest<ArmnnType>(workloadFactory,
+                                           memoryManager,
+                                           armnn::ActivationFunction::ReLu,
+                                           0.f,
+                                           0.f,
+                                           qScale,
+                                           qOffset,
+                                           inputData,
+                                           qScale,
+                                           qOffset,
+                                           outputExpectedData);
+}
+
+LayerTestResult<int16_t, 4> ReLuInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ReLuTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+LayerTestResult<uint8_t, 4> ReLuUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ReLuTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+LayerTestResult<float, 4> ReLuTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ReLuTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BoundedReLuTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float qScale,
+        int32_t qOffset)
+{
+    std::vector<float> inputData = {
+            -0.1f, -0.2f, -0.3f, -0.4f,
+            0.1f,  0.2f,  0.3f,  0.4f,
+            -1.0f, -2.0f, -3.0f, -4.0f,
+            1.0f,  2.0f,  3.0f,  4.0f
+    };
+    const float a = 1.0f;
+    const float b = -1.0f;
+    // Calculate output values for input.
+    auto f = [a, b](float value)
+    {
+        return std::min(a, std::max(b, value));
+    };
+    std::vector<float> outputExpectedData(inputData.size());
+    std::transform(inputData.begin(), inputData.end(), outputExpectedData.begin(), f);
+
+    return SimpleActivationTest<ArmnnType>(workloadFactory,
+                                           memoryManager,
+                                           armnn::ActivationFunction::BoundedReLu,
+                                           a,
+                                           b,
+                                           qScale,
+                                           qOffset,
+                                           inputData,
+                                           qScale,
+                                           qOffset,
+                                           outputExpectedData);
+}
+
+LayerTestResult<int16_t, 4> BoundedReLuInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return BoundedReLuTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SoftReLuTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float qScale,
+        int32_t qOffset)
+{
+    std::vector<float> inputData = {
+            -0.1f, -0.2f, -0.3f, -0.4f,
+            0.1f,  0.2f,  0.3f,  0.4f,
+            -1.0f, -2.0f, -3.0f, -4.0f,
+            1.0f,  2.0f,  3.0f,  4.0f
+    };
+
+    // Calculate output values for input.
+    auto f = [](float value)
+    {
+        return std::log(1.0f + std::exp(value));
+    };
+    std::vector<float> outputExpectedData(inputData.size());
+    std::transform(inputData.begin(), inputData.end(), outputExpectedData.begin(), f);
+
+    return SimpleActivationTest<ArmnnType>(workloadFactory,
+                                           memoryManager,
+                                           armnn::ActivationFunction::SoftReLu,
+                                           0.f,
+                                           0.f,
+                                           qScale,
+                                           qOffset,
+                                           inputData,
+                                           qScale,
+                                           qOffset,
+                                           outputExpectedData);
+}
+
+LayerTestResult<float, 4> SoftReLuTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SoftReLuTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+LayerTestResult<uint8_t, 4> SoftReLuUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SoftReLuTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.0625f, 64);
+}
+
+LayerTestResult<int16_t, 4> SoftReLuInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SoftReLuTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> LeakyReLuTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float qScale,
+        int32_t qOffset)
+{
+    std::vector<float> inputData = {
+            -0.1f, -0.2f, -0.3f, -0.4f,
+            0.1f,  0.2f,  0.3f,  0.4f,
+            -1.0f, -2.0f, -3.0f, -4.0f,
+            1.0f,  2.0f,  3.0f,  4.0f
+    };
+
+    const float a = 0.01f;
+    // Calculate output values for input.
+    auto f = [a](float value)
+    {
+        return value > 0.0f ? value : (value * a);
+    };
+    std::vector<float> outputExpectedData(inputData.size());
+    std::transform(inputData.begin(), inputData.end(), outputExpectedData.begin(), f);
+
+    return SimpleActivationTest<ArmnnType>(workloadFactory,
+                                           memoryManager,
+                                           armnn::ActivationFunction::LeakyReLu,
+                                           a,
+                                           0.f,
+                                           qScale,
+                                           qOffset,
+                                           inputData,
+                                           qScale,
+                                           qOffset,
+                                           outputExpectedData);
+}
+
+LayerTestResult<float, 4> LeakyReLuTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return LeakyReLuTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+LayerTestResult<uint8_t, 4> LeakyReLuUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return LeakyReLuTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.0625f, 64);
+}
+
+LayerTestResult<int16_t, 4> LeakyReLuInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return LeakyReLuTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> AbsTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float qScale,
+        int32_t qOffset)
+{
+    std::vector<float> inputData = {
+            -0.1f, -0.2f, -0.3f, -0.4f,
+            0.1f,  0.2f,  0.3f,  0.4f,
+            -1.0f, -2.0f, -3.0f, -4.0f,
+            1.0f,  2.0f,  3.0f,  4.0f
+    };
+
+    // Calculate output values for input.
+    auto f = [](float value)
+    {
+        return std::abs(value);
+    };
+    std::vector<float> outputExpectedData(inputData.size());
+    std::transform(inputData.begin(), inputData.end(), outputExpectedData.begin(), f);
+
+    return SimpleActivationTest<ArmnnType>(workloadFactory,
+                                           memoryManager,
+                                           armnn::ActivationFunction::Abs,
+                                           0.f,
+                                           0.f,
+                                           qScale,
+                                           qOffset,
+                                           inputData,
+                                           qScale,
+                                           qOffset,
+                                           outputExpectedData);
+}
+
+LayerTestResult<float, 4> AbsTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return AbsTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+LayerTestResult<uint8_t, 4> AbsUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return AbsTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.0625f, 64);
+}
+
+LayerTestResult<int16_t, 4> AbsInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return AbsTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SqrtTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float qScale,
+        int32_t qOffset)
+{
+    std::vector<float> inputData = {
+            0.1f,  0.2f,  0.3f,  0.4f,
+            0.1f,  0.2f,  0.3f,  0.4f,
+            1.0f,  2.0f,  3.0f,  4.0f,
+            1.0f,  2.0f,  3.0f,  4.0f
+    };
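+    // Note that all inputs are non-negative, since Sqrt is undefined for negative values.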
+
+    // Calculate output values for input.
+    auto f = [](float value)
+    {
+        return std::sqrt(value);
+    };
+    std::vector<float> outputExpectedData(inputData.size());
+    std::transform(inputData.begin(), inputData.end(), outputExpectedData.begin(), f);
+
+    return SimpleActivationTest<ArmnnType>(workloadFactory,
+                                           memoryManager,
+                                           armnn::ActivationFunction::Sqrt,
+                                           0.f,
+                                           0.f,
+                                           qScale,
+                                           qOffset,
+                                           inputData,
+                                           qScale,
+                                           qOffset,
+                                           outputExpectedData);
+}
+
+LayerTestResult<float, 4> SqrtTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SqrtTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+LayerTestResult<uint8_t, 4> SqrtUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SqrtTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.0625f, 64);
+}
+
+LayerTestResult<int16_t, 4> SqrtInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SqrtTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SquareTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float qScale,
+        int32_t qOffset)
+{
+    std::vector<float> inputData = {
+            -0.1f, -0.2f, -0.3f, -0.4f,
+            0.1f,  0.2f,  0.3f,  0.4f,
+            -1.0f, -2.0f, -3.0f, -4.0f,
+            1.0f,  2.0f,  3.0f,  4.0f
+    };
+
+    // Calculate output values for input.
+    auto f = [](float value)
+    {
+        return std::pow(value, 2);
+    };
+    std::vector<float> outputExpectedData(inputData.size());
+    std::transform(inputData.begin(), inputData.end(), outputExpectedData.begin(), f);
+
+    return SimpleActivationTest<ArmnnType>(workloadFactory,
+                                           memoryManager,
+                                           armnn::ActivationFunction::Square,
+                                           0.f,
+                                           0.f,
+                                           qScale,
+                                           qOffset,
+                                           inputData,
+                                           qScale,
+                                           qOffset,
+                                           outputExpectedData);
+}
+
+LayerTestResult<float, 4> SquareTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SquareTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+LayerTestResult<uint8_t, 4> SquareUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SquareTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.0625f, 64);
+}
+
+LayerTestResult<int16_t, 4> SquareInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SquareTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> TanhTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float qScale,
+        int32_t qOffset)
+{
+    std::vector<float> inputData = {
+            -0.1f, -0.2f, -0.3f, -0.4f,
+            0.1f,  0.2f,  0.3f,  0.4f,
+            -1.0f, -2.0f, -3.0f, -4.0f,
+            1.0f,  2.0f,  3.0f,  4.0f
+    };
+
+    const float a = 2.0f;
+    const float b = 3.0f;
+    // Calculate output values for input.
+    auto f = [a, b](float value)
+    {
+        return a * std::tanh(b * value);
+    };
+    std::vector<float> outputExpectedData(inputData.size());
+    std::transform(inputData.begin(), inputData.end(), outputExpectedData.begin(), f);
+
+    return SimpleActivationTest<ArmnnType>(workloadFactory,
+                                           memoryManager,
+                                           armnn::ActivationFunction::TanH,
+                                           a,
+                                           b,
+                                           qScale,
+                                           qOffset,
+                                           inputData,
+                                           qScale,
+                                           qOffset,
+                                           outputExpectedData);
+}
+
+LayerTestResult<float, 4> TanhTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return TanhTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
+LayerTestResult<uint8_t, 4> TanhUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return TanhTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.1f, 64);
+}
+
+LayerTestResult<int16_t, 4> TanhInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return TanhTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 0.1f, 0);
+}
+
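+// Runs the same activation workload, with random input, on both the backend under test
+// and a reference backend; the reference output becomes the expected result.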
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T,4> CompareActivationTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::ActivationFunction f,
+    unsigned int batchSize = 5,
+    float qScale = 0.0f,
+    int32_t qOffset = 0)
+{
+    unsigned int width     = 17;
+    unsigned int height    = 29;
+    unsigned int channels  = 2;
+
+    float a = 0.234f;
+    float b = -12.345f;
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int shape[] = {batchSize, channels, height, width};
+
+    inputTensorInfo = armnn::TensorInfo(4, shape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, shape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    float minVal = -10.f;
+    if (f == armnn::ActivationFunction::Sqrt)
+    {
+        minVal = 0.f;
+    }
+
+    boost::multi_array<T, 4> input = MakeRandomTensor<T, 4>(inputTensorInfo, 21453, minVal, 10.f);
+
+    LayerTestResult<T,4> ret(outputTensorInfo);
+    auto boostArrayExtents = boost::extents
+        [boost::numeric_cast<boost::multi_array_types::extent_gen::index>(batchSize)]
+        [boost::numeric_cast<boost::multi_array_types::extent_gen::index>(channels)]
+        [boost::numeric_cast<boost::multi_array_types::extent_gen::index>(height)]
+        [boost::numeric_cast<boost::multi_array_types::extent_gen::index>(width)];
+    ret.output.resize(boostArrayExtents);
+    ret.outputExpected.resize(boostArrayExtents);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandleRef = refWorkloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandleRef = refWorkloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ActivationQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Parameters.m_A        = a;
+    data.m_Parameters.m_B        = b;
+    data.m_Parameters.m_Function = f;
+
+    armnn::ActivationQueueDescriptor refData = data;
+    armnn::WorkloadInfo refInfo = info;
+    SetWorkloadInput(refData, refInfo, 0, inputTensorInfo, inputHandleRef.get());
+    SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, outputHandleRef.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateActivation(data, info);
+    BOOST_ASSERT(workload != nullptr);
+    std::unique_ptr<armnn::IWorkload> workloadRef = refWorkloadFactory.CreateActivation(refData, refInfo);
+    BOOST_ASSERT(workloadRef != nullptr);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    inputHandleRef->Allocate();
+    outputHandleRef->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+    CopyDataToITensorHandle(inputHandleRef.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+    workloadRef->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+    CopyDataFromITensorHandle(&ret.outputExpected[0][0][0][0], outputHandleRef.get());
+
+    return ret;
+}
+
+LayerTestResult<float,4> CompareActivationTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::ActivationFunction f,
+    unsigned int batchSize)
+{
+    return CompareActivationTestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, refWorkloadFactory, f, batchSize);
+}
+
+LayerTestResult<uint8_t,4> CompareActivationUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::ActivationFunction f)
+{
+    return CompareActivationTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, refWorkloadFactory, f, 5, 0.1f, 50);
+}
+
+LayerTestResult<int16_t,4> CompareActivationInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        armnn::IWorkloadFactory& refWorkloadFactory,
+        armnn::ActivationFunction f)
+{
+    return CompareActivationTestImpl<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager, refWorkloadFactory, f, 5, 0.1f, 0);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/ActivationTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ActivationTestImpl.hpp
new file mode 100644
index 0000000..fc69cfb
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ActivationTestImpl.hpp
@@ -0,0 +1,220 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+//
+// Sigmoid
+//
+
+LayerTestResult<float, 4> SimpleSigmoidTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SimpleSigmoidUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SimpleSigmoidInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+//
+// TanH
+//
+
+LayerTestResult<float, 4> TanhTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> TanhUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> TanhInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+//
+// Linear
+//
+
+LayerTestResult<float, 4> ConstantLinearActivationTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> ConstantLinearActivationUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> ConstantLinearActivationInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+//
+// ReLu
+//
+
+LayerTestResult<float, 4> ReLuTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> ReLuUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> ReLuInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+//
+// BoundedReLu
+//
+
+LayerTestResult<uint8_t, 4> BoundedReLuUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float upperBound);
+
+LayerTestResult<uint8_t, 4> BoundedReLuUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float upperBound,
+    float lowerBound);
+
+LayerTestResult<int16_t, 4> BoundedReLuInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> BoundedReLuUpperAndLowerBoundTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> BoundedReLuUint8UpperAndLowerBoundTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> BoundedReLuUpperBoundOnlyTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> BoundedReLuUint8UpperBoundOnlyTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> CompareBoundedReLuTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    float upperBound,
+    float lowerBound);
+
+//
+// SoftReLu
+//
+
+LayerTestResult<float, 4> SoftReLuTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SoftReLuUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SoftReLuInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+//
+// LeakyReLu
+//
+
+LayerTestResult<float, 4> LeakyReLuTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> LeakyReLuUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> LeakyReLuInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+//
+// Abs
+//
+
+LayerTestResult<float, 4> AbsTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> AbsUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> AbsInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+//
+// Sqrt
+//
+
+LayerTestResult<float, 4> SqrtTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SqrtUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SqrtInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+//
+// Square
+//
+
+LayerTestResult<float, 4> SquareTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SquareUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SquareInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+//
+// Other
+//
+
+LayerTestResult<float, 4> CompareActivationTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::ActivationFunction f,
+    unsigned int batchSize);
+
+LayerTestResult<uint8_t, 4> CompareActivationUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::ActivationFunction f);
+
+LayerTestResult<int16_t, 4> CompareActivationInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        armnn::IWorkloadFactory& refWorkloadFactory,
+        armnn::ActivationFunction f);
diff --git a/src/backends/backendsCommon/test/layerTests/BatchNormalizationTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/BatchNormalizationTestImpl.cpp
new file mode 100644
index 0000000..d8f87e1
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/BatchNormalizationTestImpl.cpp
@@ -0,0 +1,566 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "BatchNormalizationTestImpl.hpp"
+
+#include <DataLayoutIndexed.hpp>
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchNormTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorShape& inputOutputTensorShape,
+    const std::vector<float>& inputValues,
+    const std::vector<float>& expectedOutputValues,
+    float qScale,
+    int32_t qOffset,
+    armnn::DataLayout dataLayout)
+{
+    armnn::TensorInfo inputTensorInfo(inputOutputTensorShape, ArmnnType);
+    armnn::TensorInfo outputTensorInfo(inputOutputTensorShape, ArmnnType);
+
+    armnnUtils::DataLayoutIndexed dataLayoutIndexed(dataLayout);
+
+    armnn::TensorInfo tensorInfo({ inputOutputTensorShape[dataLayoutIndexed.GetChannelsIndex()] },
+                                 ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+        tensorInfo.SetQuantizationScale(qScale);
+        tensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto inputTensor = MakeTensor<T, 4>(inputTensorInfo,
+                                        QuantizedVector<T>(qScale, qOffset, inputValues));
+
+    // These values are per-channel of the input.
+    auto mean     = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {3, -2}));
+    auto variance = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {4,  9}));
+    auto beta     = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {3,  2}));
+    auto gamma    = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {2,  1}));
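+
+    // Batch normalization computes, per channel:
+    //     out = gamma * (in - mean) / sqrt(variance + eps) + beta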
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+                                             QuantizedVector<T>(qScale, qOffset, expectedOutputValues));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ScopedCpuTensorHandle meanTensor(tensorInfo);
+    armnn::ScopedCpuTensorHandle varianceTensor(tensorInfo);
+    armnn::ScopedCpuTensorHandle betaTensor(tensorInfo);
+    armnn::ScopedCpuTensorHandle gammaTensor(tensorInfo);
+
+    armnn::BatchNormalizationQueueDescriptor descriptor;
+    descriptor.m_Mean                    = &meanTensor;
+    descriptor.m_Variance                = &varianceTensor;
+    descriptor.m_Beta                    = &betaTensor;
+    descriptor.m_Gamma                   = &gammaTensor;
+    descriptor.m_Parameters.m_Eps        = 0.0f;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+    armnn::WorkloadInfo info;
+
+    AllocateAndCopyDataToITensorHandle(&meanTensor, &mean[0]);
+    AllocateAndCopyDataToITensorHandle(&varianceTensor, &variance[0]);
+    AllocateAndCopyDataToITensorHandle(&betaTensor, &beta[0]);
+    AllocateAndCopyDataToITensorHandle(&gammaTensor, &gamma[0]);
+
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateBatchNormalization(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T,4> BatchNormTestNhwcImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    const unsigned int width    = 2;
+    const unsigned int height   = 3;
+    const unsigned int channels = 2;
+    const unsigned int num      = 1;
+
+    armnn::TensorInfo inputTensorInfo({num, height, width, channels}, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({num, height, width, channels}, ArmnnType);
+    armnn::TensorInfo tensorInfo({channels}, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+        tensorInfo.SetQuantizationScale(qScale);
+        tensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset,
+        {
+            1.f, 1.f, 4.f, 1.f,
+            4.f, 4.f, 2.f, 1.f,
+            1.f, -2.f, 6.f, 4.f
+        }));
+    // These values are per-channel of the input.
+    auto mean     = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {3, -2}));
+    auto variance = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {4, 9}));
+    auto beta     = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {3, 2}));
+    auto gamma    = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {2, 1}));
+    LayerTestResult<T,4> ret(outputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::BatchNormalizationQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    armnn::ScopedCpuTensorHandle meanTensor(tensorInfo);
+    armnn::ScopedCpuTensorHandle varianceTensor(tensorInfo);
+    armnn::ScopedCpuTensorHandle betaTensor(tensorInfo);
+    armnn::ScopedCpuTensorHandle gammaTensor(tensorInfo);
+
+    AllocateAndCopyDataToITensorHandle(&meanTensor, &mean[0]);
+    AllocateAndCopyDataToITensorHandle(&varianceTensor, &variance[0]);
+    AllocateAndCopyDataToITensorHandle(&betaTensor, &beta[0]);
+    AllocateAndCopyDataToITensorHandle(&gammaTensor, &gamma[0]);
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Mean             = &meanTensor;
+    data.m_Variance         = &varianceTensor;
+    data.m_Beta             = &betaTensor;
+    data.m_Gamma            = &gammaTensor;
+    data.m_Parameters.m_Eps = 0.0f;
+    data.m_Parameters.m_DataLayout = armnn::DataLayout::NHWC;
+
+    // For each channel:
+    // subtract mean, divide by standard deviation (with an epsilon to avoid division by 0),
+    // multiply by gamma and add beta.
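+    // Worked example for channel 1 (mean -2, variance 9, beta 2, gamma 1):
+    // an input of 1.f maps to 1 * (1 - (-2)) / sqrt(9) + 2 = 3.f, and an input
+    // of -2.f maps to 1 * (-2 - (-2)) / 3 + 2 = 2.f, matching the values below.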
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset,
+        {
+            1.f, 3.f, 4.f, 3.f,
+            4.f, 4.f, 2.f, 3.f,
+            1.f, 2.f, 6.f, 4.f
+        }));
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateBatchNormalization(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+} // anonymous namespace
+
+LayerTestResult<float, 4> BatchNormFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    // BatchSize: 1
+    // Channels: 2
+    // Height: 3
+    // Width: 2
+
+    const armnn::TensorShape inputOutputShape{ 1, 2, 3, 2 };
+    std::vector<float> inputValues
+    {
+        // Batch 0, Channel 0, Height (3) x Width (2)
+         1.f, 4.f,
+         4.f, 2.f,
+         1.f, 6.f,
+
+        // Batch 0, Channel 1, Height (3) x Width (2)
+         1.f, 1.f,
+         4.f, 1.f,
+        -2.f, 4.f
+    };
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Channel 0, Height (3) x Width (2)
+        1.f, 4.f,
+        4.f, 2.f,
+        1.f, 6.f,
+
+        // Batch 0, Channel 1, Height (3) x Width (2)
+        3.f, 3.f,
+        4.f, 3.f,
+        2.f, 4.f
+    };
+
+    return BatchNormTestImpl<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        inputValues,
+        expectedOutputValues,
+        0.f,
+        0,
+        armnn::DataLayout::NCHW);
+}
+
+LayerTestResult<float, 4> BatchNormFloatNhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    // BatchSize: 1
+    // Height: 3
+    // Width: 2
+    // Channels: 2
+
+    const armnn::TensorShape inputOutputShape{ 1, 3, 2, 2 };
+    std::vector<float> inputValues
+    {
+        // Batch 0, Height 0, Width (2) x Channel (2)
+        1.f,  1.f,
+        4.f,  1.f,
+
+        // Batch 0, Height 1, Width (2) x Channel (2)
+        4.f,  4.f,
+        2.f,  1.f,
+
+        // Batch 0, Height 2, Width (2) x Channel (2)
+        1.f, -2.f,
+        6.f,  4.f
+    };
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Height 0, Width (2) x Channel (2)
+        1.f, 3.f,
+        4.f, 3.f,
+
+        // Batch 0, Height 1, Width (2) x Channel (2)
+        4.f, 4.f,
+        2.f, 3.f,
+
+        // Batch 0, Height 2, Width (2) x Channel (2)
+        1.f, 2.f,
+        6.f, 4.f
+    };
+
+    return BatchNormTestImpl<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        inputValues,
+        expectedOutputValues,
+        0.f,
+        0,
+        armnn::DataLayout::NHWC);
+}
+
+LayerTestResult<uint8_t, 4> BatchNormUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    // BatchSize: 1
+    // Channels: 2
+    // Height: 3
+    // Width: 2
+
+    const armnn::TensorShape inputOutputShape{ 1, 2, 3, 2 };
+    std::vector<float> inputValues
+    {
+        // Batch 0, Channel 0, Height (3) x Width (2)
+         1.f, 4.f,
+         4.f, 2.f,
+         1.f, 6.f,
+
+        // Batch 0, Channel 1, Height (3) x Width (2)
+         1.f, 1.f,
+         4.f, 1.f,
+        -2.f, 4.f
+    };
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Channel 0, Height (3) x Width (2)
+        1.f, 4.f,
+        4.f, 2.f,
+        1.f, 6.f,
+
+        // Batch 0, Channel 1, Height (3) x Width (2)
+        3.f, 3.f,
+        4.f, 3.f,
+        2.f, 4.f
+    };
+
+    return BatchNormTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        inputValues,
+        expectedOutputValues,
+        1.f / 20.f,
+        50,
+        armnn::DataLayout::NCHW);
+}
+
+LayerTestResult<uint8_t, 4> BatchNormUint8NhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    // BatchSize: 1
+    // Height: 3
+    // Width: 2
+    // Channels: 2
+
+    const armnn::TensorShape inputOutputShape{ 1, 3, 2, 2 };
+    std::vector<float> inputValues
+    {
+        // Batch 0, Height 0, Width (2) x Channel (2)
+        1.f,  1.f,
+        4.f,  1.f,
+
+        // Batch 0, Height 1, Width (2) x Channel (2)
+        4.f,  4.f,
+        2.f,  1.f,
+
+        // Batch 0, Height 2, Width (2) x Channel (2)
+        1.f, -2.f,
+        6.f,  4.f
+    };
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Height 0, Width (2) x Channel (2)
+        1.f, 3.f,
+        4.f, 3.f,
+
+        // Batch 0, Height 1, Width (2) x Channel (2)
+        4.f, 4.f,
+        2.f, 3.f,
+
+        // Batch 0, Height 2, Width (2) x Channel (2)
+        1.f, 2.f,
+        6.f, 4.f
+    };
+
+    return BatchNormTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        inputValues,
+        expectedOutputValues,
+        1.f / 20.f,
+        50,
+        armnn::DataLayout::NHWC);
+}
+
+LayerTestResult<int16_t, 4> BatchNormInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    // BatchSize: 1
+    // Channels: 2
+    // Height: 3
+    // Width: 2
+
+    const armnn::TensorShape inputOutputShape{ 1, 2, 3, 2 };
+    std::vector<float> inputValues
+    {
+        // Batch 0, Channel 0, Height (3) x Width (2)
+         1.f, 4.f,
+         4.f, 2.f,
+         1.f, 6.f,
+
+        // Batch 0, Channel 1, Height (3) x Width (2)
+         1.f, 1.f,
+         4.f, 1.f,
+        -2.f, 4.f
+    };
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Channel 0, Height (3) x Width (2)
+        1.f, 4.f,
+        4.f, 2.f,
+        1.f, 6.f,
+
+        // Batch 0, Channel 1, Height (3) x Width (2)
+        3.f, 3.f,
+        4.f, 3.f,
+        2.f, 4.f
+    };
+
+    return BatchNormTestImpl<armnn::DataType::QuantisedSymm16>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        inputValues,
+        expectedOutputValues,
+        1.f / 20.f,
+        50,
+        armnn::DataLayout::NCHW);
+}
+
+LayerTestResult<int16_t, 4> BatchNormInt16NhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    // BatchSize: 1
+    // Height: 3
+    // Width: 2
+    // Channels: 2
+
+    const armnn::TensorShape inputOutputShape{ 1, 3, 2, 2 };
+    std::vector<float> inputValues
+    {
+        // Batch 0, Height 0, Width (2) x Channel (2)
+        1.f,  1.f,
+        4.f,  1.f,
+
+        // Batch 0, Height 1, Width (2) x Channel (2)
+        4.f,  4.f,
+        2.f,  1.f,
+
+        // Batch 0, Height 2, Width (2) x Channel (2)
+        1.f, -2.f,
+        6.f,  4.f
+    };
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Height 0, Width (2) x Channel (2)
+        1.f, 3.f,
+        4.f, 3.f,
+
+        // Batch 0, Height 1, Width (2) x Channel (2)
+        4.f, 4.f,
+        2.f, 3.f,
+
+        // Batch 0, Height 2, Width (2) x Channel (2)
+        1.f, 2.f,
+        6.f, 4.f
+    };
+
+    return BatchNormTestImpl<armnn::DataType::QuantisedSymm16>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        inputValues,
+        expectedOutputValues,
+        1.f / 20.f,
+        50,
+        armnn::DataLayout::NHWC);
+}
+
+LayerTestResult<float,4> CompareBatchNormTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory)
+{
+    const unsigned int width     = 2;
+    const unsigned int height    = 3;
+    const unsigned int channels  = 5;
+    const unsigned int batchSize = 3;
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+    armnn::TensorInfo tensorInfo;
+
+    constexpr unsigned int shape[]       = {batchSize, channels, height, width};
+    constexpr unsigned int tensorShape[] = {channels};
+
+    inputTensorInfo = armnn::TensorInfo(4, shape, armnn::DataType::Float32);
+    outputTensorInfo = armnn::TensorInfo(4, shape, armnn::DataType::Float32);
+    tensorInfo = armnn::TensorInfo(1, tensorShape, armnn::DataType::Float32);
+
+    auto input = MakeRandomTensor<float, 4>(inputTensorInfo, 21312);
+
+    auto mean     = MakeRandomTensor<float, 1>(tensorInfo, 123);
+    auto variance = MakeRandomTensor<float, 1>(tensorInfo, 234, 0.0f);
+    auto beta     = MakeRandomTensor<float, 1>(tensorInfo, 123);
+    auto gamma    = MakeRandomTensor<float, 1>(tensorInfo, 345);
+
+    LayerTestResult<float,4> ret(outputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle  = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandleRef  = refWorkloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandleRef = refWorkloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::BatchNormalizationQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    armnn::ScopedCpuTensorHandle meanTensor(tensorInfo);
+    armnn::ScopedCpuTensorHandle varianceTensor(tensorInfo);
+    armnn::ScopedCpuTensorHandle betaTensor(tensorInfo);
+    armnn::ScopedCpuTensorHandle gammaTensor(tensorInfo);
+
+    AllocateAndCopyDataToITensorHandle(&meanTensor, &mean[0]);
+    AllocateAndCopyDataToITensorHandle(&varianceTensor, &variance[0]);
+    AllocateAndCopyDataToITensorHandle(&betaTensor, &beta[0]);
+    AllocateAndCopyDataToITensorHandle(&gammaTensor, &gamma[0]);
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Mean             = &meanTensor;
+    data.m_Variance         = &varianceTensor;
+    data.m_Beta             = &betaTensor;
+    data.m_Gamma            = &gammaTensor;
+    data.m_Parameters.m_Eps = 0.01f;
+
+    armnn::BatchNormalizationQueueDescriptor refData = data;
+    armnn::WorkloadInfo refInfo = info;
+    SetWorkloadInput(refData, refInfo, 0, inputTensorInfo, inputHandleRef.get());
+    SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, outputHandleRef.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateBatchNormalization(data, info);
+    std::unique_ptr<armnn::IWorkload> workloadRef = refWorkloadFactory.CreateBatchNormalization(refData, refInfo);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    inputHandleRef->Allocate();
+    outputHandleRef->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+    CopyDataToITensorHandle(inputHandleRef.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+    workloadRef->PostAllocationConfigure();
+    workloadRef->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+    CopyDataFromITensorHandle(&ret.outputExpected[0][0][0][0], outputHandleRef.get());
+
+    return ret;
+}
diff --git a/src/backends/backendsCommon/test/layerTests/BatchNormalizationTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/BatchNormalizationTestImpl.hpp
new file mode 100644
index 0000000..200e5d8
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/BatchNormalizationTestImpl.hpp
@@ -0,0 +1,40 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 4> BatchNormFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> BatchNormFloatNhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> BatchNormUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> BatchNormUint8NhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> BatchNormInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> BatchNormInt16NhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> CompareBatchNormTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory);
diff --git a/src/backends/backendsCommon/test/layerTests/BatchToSpaceNdTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/BatchToSpaceNdTestImpl.hpp
new file mode 100644
index 0000000..67e7cc5
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/BatchToSpaceNdTestImpl.hpp
@@ -0,0 +1,473 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <backendsCommon/test/DataTypeUtils.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+template<armnn::DataType ArmnnType,
+        std::size_t InputDim,
+        std::size_t OutputDim,
+        typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, OutputDim> BatchToSpaceNdHelper(
+        armnn::IWorkloadFactory &workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout& dataLayout,
+        const unsigned int *inputShape,
+        const std::vector<float> &inputData,
+        const std::vector<unsigned int> &blockShape,
+        const std::vector<std::pair<unsigned int, unsigned int>> &crops,
+        const unsigned int *outputShape,
+        const std::vector<float> &outputData,
+        float scale = 1.0f,
+        int32_t offset = 0)
+{
+    armnn::TensorInfo inputTensorInfo(InputDim, inputShape, ArmnnType);
+    armnn::TensorInfo outputTensorInfo(OutputDim, outputShape, ArmnnType);
+
+    inputTensorInfo.SetQuantizationScale(scale);
+    inputTensorInfo.SetQuantizationOffset(offset);
+
+    outputTensorInfo.SetQuantizationScale(scale);
+    outputTensorInfo.SetQuantizationOffset(offset);
+
+    auto input = MakeTensor<T, InputDim>(inputTensorInfo, ConvertToDataType<ArmnnType>(inputData, inputTensorInfo));
+
+    LayerTestResult<T, OutputDim> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, OutputDim>(outputTensorInfo,
+                                                     ConvertToDataType<ArmnnType>(outputData, outputTensorInfo));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::BatchToSpaceNdQueueDescriptor data;
+    data.m_Parameters.m_DataLayout = dataLayout;
+    data.m_Parameters.m_BlockShape = blockShape;
+    data.m_Parameters.m_Crops = crops;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateBatchToSpaceNd(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.origin());
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    // Use origin() so this copy remains valid for any OutputDim.
+    CopyDataFromITensorHandle(result.output.origin(), outputHandle.get());
+
+    return result;
+}
+
+} // anonymous namespace
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNhwcTest1(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 2, 2, 1};
+    const unsigned int outputShape[] = {1, 4, 4, 1};
+
+    std::vector<float> input({
+                                     // Batch 0, Height 0, Width (2) x Channel (1)
+                                     1.0f, 3.0f,
+                                     // Batch 0, Height 1, Width (2) x Channel (1)
+                                     9.0f, 11.0f,
+
+                                     // Batch 1, Height 0, Width (2) x Channel (1)
+                                     2.0f, 4.0f,
+                                     // Batch 1, Height 1, Width (2) x Channel (1)
+                                     10.0f, 12.0f,
+
+                                     // Batch 2, Height 0, Width (2) x Channel (1)
+                                     5.0f, 7.0f,
+                                     // Batch 2, Height 1, Width (2) x Channel (1)
+                                     13.0f, 15.0f,
+
+                                     // Batch 3, Height 0, Width (2) x Channel (1)
+                                     6.0f, 8.0f,
+                                     // Batch 3, Height 1, Width (2) x Channel (1)
+                                     14.0f, 16.0f
+                             });
+
+    std::vector<float> expectedOutput({
+                                              1.0f,   2.0f,  3.0f,  4.0f,
+                                              5.0f,   6.0f,  7.0f,  8.0f,
+                                              9.0f,  10.0f, 11.0f, 12.0f,
+                                              13.0f, 14.0f, 15.0f, 16.0f
+                                      });
+
+    std::vector<unsigned int> blockShape {2, 2};
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
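+    // With blockShape {2, 2} and no cropping, BatchToSpaceNd interleaves the four
+    // 2x2 input batches into one 4x4 plane: output[h][w] is read from input batch
+    // (h % 2) * 2 + (w % 2) at position [h / 2][w / 2].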
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NHWC, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNhwcTest2(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 1, 1, 1};
+    const unsigned int outputShape[] = {1, 2, 2, 1};
+
+    std::vector<float> input({
+                                     // Batches 0-3, each of shape 1 x 1 x 1
+                                     1.0f, 2.0f, 3.0f, 4.0f
+                             });
+
+    std::vector<float> expectedOutput({1.0f, 2.0f, 3.0f, 4.0f});
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NHWC, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNhwcTest3(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 1, 1, 3};
+    const unsigned int outputShape[] = {1, 2, 2, 3};
+
+    std::vector<float> input({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f});
+
+    std::vector<float> expectedOutput({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f});
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NHWC, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNhwcTest4(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {8, 1, 3, 1};
+    const unsigned int outputShape[] = {2, 2, 4, 1};
+
+    std::vector<float> input({
+                                     0.0f, 1.0f, 3.0f,
+                                     0.0f, 9.0f, 11.0f,
+                                     0.0f, 2.0f, 4.0f,
+                                     0.0f, 10.0f, 12.0f,
+                                     0.0f, 5.0f, 7.0f,
+                                     0.0f, 13.0f, 15.0f,
+                                     0.0f, 6.0f, 8.0f,
+                                     0.0f, 14.0f, 16.0f
+                             });
+
+    std::vector<float> expectedOutput({
+                                              1.0f, 2.0f, 3.0f, 4.0f,
+                                              5.0f, 6.0f, 7.0f, 8.0f,
+                                              9.0f, 10.0f, 11.0f, 12.0f,
+                                              13.0f, 14.0f, 15.0f, 16.0f
+                                      });
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {2, 0}};
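+    // Here the cropping matters: 8 batches / (2 * 2) = 2 output batches, with an
+    // uncropped spatial size of (1 * 2) x (3 * 2) = 2 x 6; the width crop {2, 0}
+    // then removes the two leading (zero) columns, giving the 2 x 4 output above.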
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NHWC, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNhwcTest5(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 2, 2, 1};
+    const unsigned int outputShape[] = {1, 4, 4, 1};
+
+    std::vector<float> input({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16});
+    std::vector<float> expectedOutput({1, 5, 2, 6, 9, 13, 10, 14, 3, 7, 4, 8, 11, 15, 12, 16});
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager, armnn::DataLayout::NHWC, inputShape,
+                                                 input, blockShape, crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNhwcTest6(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 1, 1, 1};
+    const unsigned int outputShape[] = {1, 2, 2, 1};
+
+    std::vector<float> input({
+                                     // Batches 0-3, each of shape 1 x 1 x 1
+                                     1, 2, 3, 4
+                             });
+
+    std::vector<float> expectedOutput({1, 2, 3, 4});
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NHWC, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNhwcTest7(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 1, 1, 3};
+    const unsigned int outputShape[] = {1, 2, 2, 3};
+
+    std::vector<float> input({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+    std::vector<float> expectedOutput({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NHWC, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNchwTest1(
+        armnn::IWorkloadFactory &workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 3, 1, 1};
+    const unsigned int outputShape[] = {1, 3, 2, 2};
+
+    std::vector<float> input({1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f});
+
+    std::vector<float> expectedOutput({
+                                              // Batch 0, Channel 0, Height (2) x Width (2)
+                                              1.0f,  4.0f,
+                                              7.0f, 10.0f,
+
+                                              // Batch 0, Channel 1, Height (2) x Width (2)
+                                              2.0f,  5.0f,
+                                              8.0f, 11.0f,
+
+                                              // Batch 0, Channel 2, Height (2) x Width (2)
+                                              3.0f,  6.0f,
+                                              9.0f, 12.0f,
+                                      });
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NCHW, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNchwTest2(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 1, 1, 1};
+    const unsigned int outputShape[] = {1, 1, 2, 2};
+
+    std::vector<float> input({
+                                     // Batches 0-3, each of shape 1 x 1 x 1
+                                     1.0f, 2.0f, 3.0f, 4.0f
+                             });
+
+    std::vector<float> expectedOutput({1.0f, 2.0f, 3.0f, 4.0f});
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NCHW, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNchwTest3(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 3, 1, 1};
+    const unsigned int outputShape[] = {1, 3, 2, 2};
+
+    std::vector<float> input({1.0f, 3.0f, 5.0f, 7.0f, 9.0f, 11.0f, 2.0f, 4.0f, 6.0f, 8.0f, 10.0f, 12.0f});
+
+    std::vector<float> expectedOutput({
+                                              // Batch 0, Channel 0, Height (2) x Width (2)
+                                              1.0f,  7.0f,
+                                              2.0f,  8.0f,
+
+                                              // Batch 0, Channel 1, Height (2) x Width (2)
+                                              3.0f,  9.0f,
+                                              4.0f, 10.0f,
+
+                                              // Batch 0, Channel 2, Height (2) x Width (2)
+                                              5.0f, 11.0f,
+                                              6.0f, 12.0f,
+                                      });
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NCHW, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNchwTest4(
+        armnn::IWorkloadFactory &workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 3, 1, 1};
+    const unsigned int outputShape[] = {1, 3, 2, 2};
+
+    std::vector<float> input({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
+
+    std::vector<float> expectedOutput({
+                                              // Batch 0, Channel 0, Height (2) x Width (2)
+                                              1,  4,
+                                              7, 10,
+
+                                              // Batch 0, Channel 1, Height (2) x Width (2)
+                                              2,  5,
+                                              8, 11,
+
+                                              // Batch 0, Channel 2, Height (2) x Width (2)
+                                              3,  6,
+                                              9, 12,
+                                      });
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NCHW, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNchwTest5(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 1, 1, 1};
+    const unsigned int outputShape[] = {1, 1, 2, 2};
+
+    std::vector<float> input({
+                                     // Batches 0-3, each of shape 1 x 1 x 1
+                                     1, 2, 3, 4
+                             });
+
+    std::vector<float> expectedOutput({1, 2, 3, 4});
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NCHW, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNchwTest6(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {4, 3, 1, 1};
+    const unsigned int outputShape[] = {1, 3, 2, 2};
+
+    std::vector<float> input({1, 3, 5, 7, 9, 11, 2, 4, 6, 8, 10, 12});
+
+    std::vector<float> expectedOutput({
+                                              // Batch 0, Channel 0, Height (2) x Width (2)
+                                              1,  7,
+                                              2,  8,
+
+                                              // Batch 0, Channel 1, Height (2) x Width (2)
+                                              3,  9,
+                                              4, 10,
+
+                                              // Batch 0, Channel 2, Height (2) x Width (2)
+                                              5, 11,
+                                              6, 12,
+                                      });
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NCHW, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> BatchToSpaceNdNchwTest7(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = {8, 1, 1, 3};
+    const unsigned int outputShape[] = {2, 1, 2, 4};
+
+    std::vector<float> input({
+                                     0, 1, 3, 0,  9, 11,
+                                     0, 2, 4, 0, 10, 12,
+                                     0, 5, 7, 0, 13, 15,
+                                     0, 6, 8, 0, 14, 16
+                             });
+
+    std::vector<float> expectedOutput({
+                                              1,  2,  3,  4,
+                                              5,  6,  7,  8,
+                                              9, 10, 11, 12,
+                                              13, 14, 15, 16
+                                      });
+
+    std::vector<unsigned int> blockShape({2, 2});
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}, {2, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 4, 4>(workloadFactory, memoryManager,
+                                                 armnn::DataLayout::NCHW, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/ConcatTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ConcatTestImpl.cpp
new file mode 100644
index 0000000..3cfbca8
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConcatTestImpl.cpp
@@ -0,0 +1,2786 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ConcatTestImpl.hpp"
+
+#include <Permute.hpp>
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+//
+// Helper functions and templates
+//
+
+armnn::OriginsDescriptor CreateDescriptorForConcat(
+    const std::vector<armnn::TensorInfo> & inputTensorInfos,
+    unsigned int concatDim)
+{
+    std::vector<armnn::TensorShape> shapes;
+    shapes.reserve(inputTensorInfos.size());
+    for (const armnn::TensorInfo& it: inputTensorInfos)
+    {
+        shapes.push_back(it.GetShape());
+    }
+
+    return armnn::CreateDescriptorForConcatenation(shapes.begin(), shapes.end(), concatDim);
+}
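+
+// As an example of the descriptor this produces: concatenating two 2x3 tensors
+// along dimension 0 should yield view origins (0, 0) and (2, 0) within the
+// 4x3 output tensor.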
+
+//
+// Concat is only supported for the N and C dimensions of NCHW tensors and for the
+// innermost dimension. For tensors with fewer than 4 dimensions we need to make
+// sure that the concat dimension is either the innermost one or at least the 3rd
+// slowest iterating one.
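+//
+// For example: tensors with fewer than 3 dimensions always take the permute path;
+// with exactly 3 dimensions only a concatenation along dimension 1 requires a
+// permute; 4D tensors never do. NeedPermuteForConcat below encodes this rule.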
+//
+
+bool NeedPermuteForConcat(
+    const std::vector<armnn::TensorInfo> & inputTensorInfos,
+    unsigned int concatDim)
+{
+    // See note above. Additionally we expect the input shapes to have the
+    // same number of dimensions.
+    unsigned int nDimensions = 0;
+
+    // Determine the number of dimensions and sanity check them
+    // against test implementation issues.
+    for (auto && tensorInfo : inputTensorInfos)
+    {
+        if (!nDimensions)
+        {
+            nDimensions = tensorInfo.GetShape().GetNumDimensions();
+        }
+        else
+        {
+            BOOST_ASSERT_MSG(nDimensions == tensorInfo.GetShape().GetNumDimensions(),
+                "Input shapes must have the same number of dimensions");
+        }
+    }
+
+    return (nDimensions < 3 || (nDimensions == 3 && (nDimensions-concatDim) < 3 && (nDimensions-concatDim) != 1));
+}
+
+armnn::TensorShape ExpandTensorShapeTo3dForPermute(const armnn::TensorShape & inputShape)
+{
+    unsigned int numDims = inputShape.GetNumDimensions();
+    if (numDims >= 3)
+    {
+        // Nothing to do if the inputShape has at least 3 dimensions.
+        return inputShape;
+    }
+
+    std::vector<unsigned int> newDims(size_t(3), 1u);
+    unsigned int expandedBy = 3 - numDims;
+    for (unsigned int i=0; i<numDims; ++i)
+    {
+        newDims[expandedBy+i] = inputShape[i];
+    }
+    return armnn::TensorShape(3u, &newDims[0]);
+}
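+
+// For example, a 1D shape {6} becomes {1, 1, 6} and a 2D shape {2, 3} becomes
+// {1, 2, 3}; shapes that already have 3 or more dimensions are returned unchanged.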
+
+void Generate3dPermuteVectorForConcat(
+    unsigned int numDimensions,
+    unsigned int & concatDim,
+    std::pair<armnn::PermutationVector, armnn::PermutationVector> & permutations)
+{
+    BOOST_ASSERT_MSG(numDimensions <= 3,
+       "Only dimensions 1,2 and 3 are supported by this helper");
+    unsigned int expandedBy = 3 - numDimensions;
+    unsigned int expandedConcatAxis = concatDim + expandedBy;
+
+    if (expandedConcatAxis == 2)
+    {
+        concatDim = 0;
+        armnn::PermutationVector forwardPermutation({1, 2, 0});
+        armnn::PermutationVector reversePermutation({2, 0, 1});
+        permutations = std::make_pair(forwardPermutation, reversePermutation);
+    }
+    else if (expandedConcatAxis == 1)
+    {
+        concatDim = 0;
+        armnn::PermutationVector forwardPermutation({2, 0, 1});
+        armnn::PermutationVector reversePermutation({1, 2, 0});
+        permutations = std::make_pair(forwardPermutation, reversePermutation);
+    }
+    else
+    {
+        BOOST_ASSERT(expandedConcatAxis == 0);
+        concatDim = 0;
+    }
+}
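+
+// For example, for 2D inputs concatenated along dimension 1: expandedConcatAxis
+// is 2, so the data is permuted with the forward vector {1, 2, 0}, the
+// concatenation then runs along dimension 0, and the reverse vector {2, 0, 1}
+// restores the original layout afterwards.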
+
+template<typename T> void PermuteTensorData(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::PermutationVector& mappings,
+    armnn::TensorInfo & inputTensorInfo,
+    const T * inputData,
+    std::vector<T>& outputData)
+{
+    BOOST_ASSERT_MSG(inputData != nullptr, "inputData must not be null");
+    if (inputData == nullptr)
+    {
+        // Nullptr is an error in the test. By returning without doing the permutation
+        // I expect the caller to fail the test. It still makes sense to report this as
+        // an assert for Debug builds.
+        return;
+    }
+
+    armnn::TensorInfo outputTensorInfo = armnnUtils::Permuted(inputTensorInfo, mappings);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::PermuteQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters = armnn::PermuteDescriptor{mappings};
+    armnn::WorkloadInfo workloadInfo;
+    AddInputToWorkload(queueDescriptor, workloadInfo, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(queueDescriptor, workloadInfo, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreatePermute(queueDescriptor, workloadInfo);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), inputData);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    outputData.resize(outputTensorInfo.GetNumElements());
+    CopyDataFromITensorHandle(&outputData[0], outputHandle.get());
+    inputTensorInfo = outputTensorInfo;
+}
+
+//
+// Permutes the input tensors so that a supported concatenation can be performed.
+// Tensors with fewer than 3 dimensions are treated as 3D by adding dummy
+// dimensions of size 1 at the front. Finally, this function reports what the
+// output shape of the permuted, concatenated tensor is going to be.
+//
+template<typename T> void PermuteInputsForConcat(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    std::vector<armnn::TensorInfo> & inputTensorInfos,
+    std::vector<T *> & inputData,
+    std::vector<std::vector<T>> & inputDataStorage,
+    armnn::PermutationVector & permuteVector,
+    unsigned int & concatDim,
+    armnn::TensorInfo & outputTensorInfo)
+{
+    BOOST_ASSERT_MSG(inputTensorInfos.size() > 1,
+        "Expecting more than one tensor to be concatenated here");
+
+    unsigned int numDims = 0;
+    unsigned int nthInput = 0;
+    const armnn::PermutationVector identity({0, 1, 2});
+
+    std::pair<armnn::PermutationVector, armnn::PermutationVector> permutations =
+        std::make_pair(identity, identity);
+
+    inputDataStorage.resize(inputData.size());
+
+    for (auto && tensorInfo : inputTensorInfos)
+    {
+        if (numDims == 0)
+        {
+            numDims = tensorInfo.GetShape().GetNumDimensions();
+            Generate3dPermuteVectorForConcat(numDims, concatDim, permutations);
+
+            // Store the reverse permutation.
+            permuteVector = permutations.second;
+            BOOST_ASSERT_MSG(!permuteVector.IsEqual(identity),
+                "Test logic error, we don't need permutation, so we shouldn't arrive here");
+        }
+        else
+        {
+            BOOST_ASSERT_MSG(numDims == tensorInfo.GetShape().GetNumDimensions(),
+                "All inputs must have the same number of dimensions");
+        }
+
+        armnn::TensorInfo newTensorInfo = tensorInfo;
+        newTensorInfo.SetShape(ExpandTensorShapeTo3dForPermute(tensorInfo.GetShape()));
+
+        PermuteTensorData<T>(workloadFactory,
+                             memoryManager,
+                             permutations.first,
+                             newTensorInfo,
+                             inputData[nthInput],
+                             inputDataStorage[nthInput]);
+
+        inputData[nthInput] = inputDataStorage[nthInput].data();
+        inputTensorInfos[nthInput] = newTensorInfo;
+
+        ++nthInput;
+    }
+
+    outputTensorInfo.SetShape(
+        armnnUtils::Permuted(
+            ExpandTensorShapeTo3dForPermute(outputTensorInfo.GetShape()),
+            permutations.first));
+}
+
+//
+// This is the counterpart of PermuteInputsForConcat(...): it permutes the
+// output of the concatenation back so that we can check it against an
+// expected output.
+//
+template <typename T> void PermuteOutputForConcat(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorInfo & tensorInfo,
+    const armnn::PermutationVector & permuteVector,
+    std::unique_ptr<armnn::ITensorHandle> && inputDataHandle,
+    T * data)
+{
+    BOOST_ASSERT_MSG(data != nullptr, "data must not be null");
+    if (data == nullptr)
+    {
+        // Nullptr is an error in the test. By returning without doing the permutation
+        // I expect the caller to fail the test. It still makes sense to report this as
+        // an assert for Debug builds.
+        return;
+    }
+
+    armnn::TensorInfo resultTensorInfo = tensorInfo;
+    std::vector<T> inputData(tensorInfo.GetNumElements());
+    std::vector<T> outputData;
+
+    CopyDataFromITensorHandle(&inputData[0], inputDataHandle.get());
+
+    PermuteTensorData<T>(workloadFactory,
+                         memoryManager,
+                         permuteVector,
+                         resultTensorInfo,
+                         &inputData[0],
+                         outputData);
+
+    ::memcpy(data, &outputData[0], sizeof(T)*outputData.size());
+}
+
+template<typename T> void Concatenate(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    std::initializer_list<const armnn::TensorInfo> inputTensorInfosOrig,
+    std::initializer_list<T *> inputsOrig,
+    const armnn::TensorInfo& outputTensorInfoOrig,
+    T * output,
+    unsigned int concatDim,
+    bool useSubtensor)
+{
+    BOOST_ASSERT_MSG(output != nullptr, "output must not be null");
+    if (output == nullptr)
+    {
+        // Nullptr is an error in the test. By returning without doing the concatenation
+        // I expect the caller to fail the test. It still makes sense to report this as
+        // an assert for Debug builds.
+        return;
+    }
+
+    // Saves a copy of the parameters which we might need to change.
+    std::vector<armnn::TensorInfo> inputTensorInfos(inputTensorInfosOrig.begin(), inputTensorInfosOrig.end());
+    std::vector<T *> inputs            = inputsOrig;
+    armnn::TensorInfo outputTensorInfo = outputTensorInfoOrig;
+
+    armnn::PermutationVector permuteVector{0, 1, 2};
+
+    // Holds and automatically releases memory for the reshaped input data.
+    std::vector<std::vector<T>> tmpInputDataStorage;
+
+    const size_t inputCount = inputTensorInfos.size();
+
+    bool needPermuteForConcat = NeedPermuteForConcat(inputTensorInfos, concatDim);
+
+    if (needPermuteForConcat)
+    {
+        //
+        // We need to permute the inputs, because concatenation along
+        // the requested axis is not supported.
+        //
+        PermuteInputsForConcat<T>(workloadFactory,
+                                  memoryManager,
+                                  inputTensorInfos,
+                                  inputs,
+                                  tmpInputDataStorage,
+                                  permuteVector,
+                                  concatDim,
+                                  outputTensorInfo);
+    }
+
+    armnn::WorkloadInfo workloadInfo;
+
+    std::vector<std::unique_ptr<armnn::ITensorHandle>> inputHandles;
+    inputHandles.reserve(inputCount);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ConcatQueueDescriptor queueDescriptor;
+    armnn::OriginsDescriptor viewsDescriptor = CreateDescriptorForConcat(inputTensorInfos, concatDim);
+    queueDescriptor.m_Parameters = viewsDescriptor;
+
+    if (useSubtensor)
+    {
+        queueDescriptor.m_ViewOrigins.reserve(viewsDescriptor.GetNumViews());
+        for (unsigned int i = 0; i < viewsDescriptor.GetNumViews(); ++i)
+        {
+            queueDescriptor.m_ViewOrigins.emplace_back(std::vector<unsigned int>(viewsDescriptor.GetViewOrigin(i),
+                viewsDescriptor.GetViewOrigin(i) + viewsDescriptor.GetNumDimensions()));
+        }
+
+        outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+        const bool subTensorsSupported = workloadFactory.SupportsSubTensors();
+        for (unsigned int i = 0; i < inputCount; ++i)
+        {
+            const armnn::TensorInfo& inputTensorInfo = inputTensorInfos[i];
+            std::unique_ptr<armnn::ITensorHandle> inputHandle =
+                subTensorsSupported ?
+                    workloadFactory.CreateSubTensorHandle(*outputHandle,
+                                                          inputTensorInfo.GetShape(),
+                                                          queueDescriptor.m_ViewOrigins[i].m_Origin.data()) :
+                    workloadFactory.CreateTensorHandle(inputTensorInfo);
+
+            inputHandles.emplace_back(std::move(inputHandle));
+        }
+
+    }
+    else
+    {
+        for (unsigned int i = 0; i < inputCount; ++i)
+        {
+            std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfos[i]);
+            inputHandles.emplace_back(std::move(inputHandle));
+        }
+    }
+
+    for (unsigned int i = 0; i < inputCount; ++i)
+    {
+        AddInputToWorkload(queueDescriptor, workloadInfo, inputTensorInfos[i], inputHandles[i].get());
+    }
+
+    AddOutputToWorkload(queueDescriptor, workloadInfo, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConcat(queueDescriptor, workloadInfo);
+
+    for (auto& inputHandle : inputHandles)
+    {
+        inputHandle->Allocate();
+    }
+
+    outputHandle->Allocate();
+
+    unsigned int nextInputId = 0;
+    for (auto& inputHandle : inputHandles)
+    {
+        CopyDataToITensorHandle(inputHandle.get(), inputs[nextInputId]);
+        ++nextInputId;
+    }
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    if (needPermuteForConcat)
+    {
+        PermuteOutputForConcat<T>(workloadFactory,
+                                  memoryManager,
+                                  outputTensorInfo,
+                                  permuteVector,
+                                  std::move(outputHandle),
+                                  output);
+    }
+    else
+    {
+        CopyDataFromITensorHandle(output, outputHandle.get());
+    }
+}
+
+//
+// Implementation templates
+//
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 1> Concat1dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo inputTensorInfo({ 3 }, ArmnnType, qScale, qOffset);
+
+    auto input0 = MakeTensor<T, 1>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, { 1.0f, 2.0f, 3.0f }));
+    auto input1 = MakeTensor<T, 1>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, { 4.0f, 5.0f, 6.0f }));
+    auto input2 = MakeTensor<T, 1>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, { 7.0f, 8.0f, 9.0f }));
+
+    armnn::TensorInfo outputTensorInfo({ 9 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 1> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory, memoryManager,
+                   { inputTensorInfo, inputTensorInfo, inputTensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   0,
+                   true);
+
+    result.output = MakeTensor<T, 1>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 1>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> Concat2dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorInfo& outputTensorInfo,
+    unsigned int dimension,
+    const float qScale,
+    const int32_t qOffset)
+{
+    armnn::TensorInfo inputTensorInfo({ 2, 3 }, ArmnnType, qScale, qOffset);
+
+    auto input0 = MakeTensor<T, 2>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        1.0f, 2.0f, 3.0f,
+
+        // Batch 1
+        10.0f, 11.0f, 12.0f,
+    }));
+
+    auto input1 = MakeTensor<T, 2>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        4.0f, 5.0f, 6.0f,
+
+        // Batch 1
+        13.0f, 14.0f, 15.0f,
+    }));
+
+    auto input2 = MakeTensor<T, 2>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        7.0f, 8.0f, 9.0f,
+
+        // Batch 1
+        16.0f, 17.0f, 18.0f,
+    }));
+
+    LayerTestResult<T, 2> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory, memoryManager,
+                   { inputTensorInfo, inputTensorInfo, inputTensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   true);
+
+    result.output = MakeTensor<T, 2>(outputTensorInfo, output);
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> Concat2dDim0TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 6, 3 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 2> result = Concat2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, outputTensorInfo, 0, qScale, qOffset);
+
+    result.outputExpected = MakeTensor<T, 2>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        1.0f, 2.0f, 3.0f,
+
+        // Batch 1
+        10.0f, 11.0f, 12.0f,
+
+        // Batch 2
+        4.0f, 5.0f, 6.0f,
+
+        // Batch 3
+        13.0f, 14.0f, 15.0f,
+
+        // Batch 4
+        7.0f, 8.0f, 9.0f,
+
+        // Batch 5
+        16.0f, 17.0f, 18.0f,
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> Concat2dDim1TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 2, 9 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 2> result = Concat2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, outputTensorInfo, 1, qScale, qOffset);
+
+    result.outputExpected = MakeTensor<T, 2>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+
+        // Batch 1
+        10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> Concat2dDim0DiffInputDimsTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo input0TensorInfo({ 2, 3 }, ArmnnType, qScale, qOffset);
+    auto input0 = MakeTensor<T, 2>(input0TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        1.0f, 2.0f, 3.0f,
+
+        // Batch 1
+        10.0f, 11.0f, 12.0f,
+    }));
+
+    armnn::TensorInfo input1TensorInfo({ 3, 3 }, ArmnnType, qScale, qOffset);
+    auto input1 = MakeTensor<T, 2>(input1TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        4.0f, 5.0f, 6.0f,
+
+        // Batch 1
+        13.0f, 14.0f, 15.0f,
+
+        // Batch 2
+        7.0f, 8.0f, 9.0f,
+    }));
+
+    armnn::TensorInfo input2TensorInfo({ 1, 3 }, ArmnnType, qScale, qOffset);
+    auto input2 = MakeTensor<T, 2>(input2TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        16.0f, 17.0f, 18.0f,
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 6, 3 }, ArmnnType, qScale, qOffset);
+    LayerTestResult<T, 2> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory, memoryManager,
+                   { input0TensorInfo, input1TensorInfo, input2TensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   0,
+                   true);
+
+    result.output = MakeTensor<T, 2>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 2>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        1.0f, 2.0f, 3.0f,
+
+        // Batch 1
+        10.0f, 11.0f, 12.0f,
+
+        // Batch 2
+        4.0f, 5.0f, 6.0f,
+
+        // Batch 3
+        13.0f, 14.0f, 15.0f,
+
+        // Batch 4
+        7.0f, 8.0f, 9.0f,
+
+        // Batch 5
+        16.0f, 17.0f, 18.0f,
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> Concat2dDim1DiffInputDimsTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo input0TensorInfo({ 2, 3 }, ArmnnType, qScale, qOffset);
+    auto input0 = MakeTensor<T, 2>(input0TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        1.0f, 2.0f, 3.0f,
+
+        // Batch 1
+        10.0f, 11.0f, 12.0f,
+    }));
+
+    armnn::TensorInfo input1TensorInfo({ 2, 5 }, ArmnnType, qScale, qOffset);
+    auto input1 = MakeTensor<T, 2>(input1TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
+
+        // Batch 1
+        13.0f, 14.0f, 15.0f, 16.0f, 17.0f,
+    }));
+
+    armnn::TensorInfo input2TensorInfo({ 2, 1 }, ArmnnType, qScale, qOffset);
+    auto input2 = MakeTensor<T, 2>(input2TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        9.0f,
+
+        // Batch 1
+        18.0f
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 2, 9 }, ArmnnType, qScale, qOffset);
+    LayerTestResult<T, 2> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory, memoryManager,
+                   { input0TensorInfo, input1TensorInfo, input2TensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   1,
+                   true);
+
+    result.output = MakeTensor<T, 2>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 2>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+
+        // Batch 1
+        10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f,
+    }));
+
+    return result;
+}
+
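+// Shared 3D helper: concatenates three 2x3x2 input tensors along the given
+// dimension. useSubtensor controls whether sub-tensor views may be used.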
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Concat3dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorInfo& outputTensorInfo,
+    unsigned int dimension,
+    bool useSubtensor,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo inputTensorInfo({ 2, 3, 2 }, ArmnnType, qScale, qOffset);
+
+    auto input0 = MakeTensor<T, 3>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        1.0f, 2.0f,
+
+        // Batch 0, Channel 1
+        3.0f, 4.0f,
+
+        // Batch 0, Channel 2
+        5.0f, 6.0f,
+
+        // Batch 1, Channel 0
+        19.0f, 20.0f,
+
+        // Batch 1, Channel 1
+        21.0f, 22.0f,
+
+        // Batch 1, Channel 2
+        23.0f, 24.0f
+    }));
+
+    auto input1 = MakeTensor<T, 3>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        7.0f, 8.0f,
+
+        // Batch 0, Channel 1
+        9.0f, 10.0f,
+
+        // Batch 0, Channel 2
+        11.0f, 12.0f,
+
+        // Batch 1, Channel 0
+        25.0f, 26.0f,
+
+        // Batch 1, Channel 1
+        27.0f, 28.0f,
+
+        // Batch 1, Channel 2
+        29.0f, 30.0f
+    }));
+
+    auto input2 = MakeTensor<T, 3>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        13.0f, 14.0f,
+
+        // Batch 0, Channel 1
+        15.0f, 16.0f,
+
+        // Batch 0, Channel 2
+        17.0f, 18.0f,
+
+        // Batch 1, Channel 0
+        31.0f, 32.0f,
+
+        // Batch 1, Channel 1
+        33.0f, 34.0f,
+
+        // Batch 1, Channel 2
+        35.0f, 36.0f
+    }));
+
+    LayerTestResult<T, 3> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory, memoryManager,
+                   { inputTensorInfo, inputTensorInfo, inputTensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   useSubtensor);
+
+    result.output = MakeTensor<T, 3>(outputTensorInfo, output);
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Concat3dDim0TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 6, 3, 2 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 3> result = Concat3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, outputTensorInfo, 0, true, qScale, qOffset);
+
+    result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        1.0f, 2.0f,
+
+        // Batch 0, Channel 1
+        3.0f, 4.0f,
+
+        // Batch 0, Channel 2
+        5.0f, 6.0f,
+
+        // Batch 1, Channel 0
+        19.0f, 20.0f,
+
+        // Batch 1, Channel 1
+        21.0f, 22.0f,
+
+        // Batch 1, Channel 2
+        23.0f, 24.0f,
+
+        // Batch 2, Channel 0
+        7.0f, 8.0f,
+
+        // Batch 2, Channel 1
+        9.0f, 10.0f,
+
+        // Batch 2, Channel 2
+        11.0f, 12.0f,
+
+        // Batch 3, Channel 0
+        25.0f, 26.0f,
+
+        // Batch 3, Channel 1
+        27.0f, 28.0f,
+
+        // Batch 3, Channel 2
+        29.0f, 30.0f,
+
+        // Batch 4, Channel 0
+        13.0f, 14.0f,
+
+        // Batch 4, Channel 1
+        15.0f, 16.0f,
+
+        // Batch 4, Channel 2
+        17.0f, 18.0f,
+
+        // Batch 5, Channel 0
+        31.0f, 32.0f,
+
+        // Batch 5, Channel 1
+        33.0f, 34.0f,
+
+        // Batch 5, Channel 2
+        35.0f, 36.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Concat3dDim1TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 2, 9, 2 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 3> result = Concat3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, outputTensorInfo, 1, true, qScale, qOffset);
+
+    result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        1.0f, 2.0f,
+
+        // Batch 0, Channel 1
+        3.0f, 4.0f,
+
+        // Batch 0, Channel 2
+        5.0f, 6.0f,
+
+        // Batch 0, Channel 3
+        7.0f, 8.0f,
+
+        // Batch 0, Channel 4
+        9.0f, 10.0f,
+
+        // Batch 0, Channel 5
+        11.0f, 12.0f,
+
+        // Batch 0, Channel 6
+        13.0f, 14.0f,
+
+        // Batch 0, Channel 7
+        15.0f, 16.0f,
+
+        // Batch 0, Channel 8
+        17.0f, 18.0f,
+
+        // Batch 1, Channel 0
+        19.0f, 20.0f,
+
+        // Batch 1, Channel 1
+        21.0f, 22.0f,
+
+        // Batch 1, Channel 2
+        23.0f, 24.0f,
+
+        // Batch 1, Channel 3
+        25.0f, 26.0f,
+
+        // Batch 1, Channel 4
+        27.0f, 28.0f,
+
+        // Batch 1, Channel 5
+        29.0f, 30.0f,
+
+        // Batch 1, Channel 6
+        31.0f, 32.0f,
+
+        // Batch 1, Channel 7
+        33.0f, 34.0f,
+
+        // Batch 1, Channel 8
+        35.0f, 36.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Concat3dDim2TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 2, 3, 6 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 3> result = Concat3dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, outputTensorInfo, 2, useSubtensor, qScale, qOffset);
+
+    result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        1.0f, 2.0f, 7.0f, 8.0f, 13.0f, 14.0f,
+
+        // Batch 0, Channel 1
+        3.0f, 4.0f, 9.0f, 10.0f, 15.0f, 16.0f,
+
+        // Batch 0, Channel 2
+        5.0f, 6.0f, 11.0f, 12.0f, 17.0f, 18.0f,
+
+        // Batch 1, Channel 0
+        19.0f, 20.0f, 25.0f, 26.0f, 31.0f, 32.0f,
+
+        // Batch 1, Channel 1
+        21.0f, 22.0f, 27.0f, 28.0f, 33.0f, 34.0f,
+
+        // Batch 1, Channel 2
+        23.0f, 24.0f, 29.0f, 30.0f, 35.0f, 36.0f,
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Concat3dDim0DiffInputDimsTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo input0TensorInfo({ 2, 3, 2 }, ArmnnType, qScale, qOffset);
+    auto input0 = MakeTensor<T, 3>(input0TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        1.0f, 2.0f,
+
+        // Batch 0, Channel 1
+        3.0f, 4.0f,
+
+        // Batch 0, Channel 2
+        5.0f, 6.0f,
+
+        // Batch 1, Channel 0
+        19.0f, 20.0f,
+
+        // Batch 1, Channel 1
+        21.0f, 22.0f,
+
+        // Batch 1, Channel 2
+        23.0f, 24.0f
+    }));
+
+    armnn::TensorInfo input1TensorInfo({ 1, 3, 2 }, ArmnnType, qScale, qOffset);
+    auto input1 = MakeTensor<T, 3>(input1TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        7.0f, 8.0f,
+
+        // Batch 0, Channel 1
+        9.0f, 10.0f,
+
+        // Batch 0, Channel 2
+        11.0f, 12.0f
+    }));
+
+    armnn::TensorInfo input2TensorInfo({ 3, 3, 2 }, ArmnnType, qScale, qOffset);
+    auto input2 = MakeTensor<T, 3>(input2TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        25.0f, 26.0f,
+
+        // Batch 0, Channel 1
+        27.0f, 28.0f,
+
+        // Batch 0, Channel 2
+        29.0f, 30.0f,
+
+        // Batch 1, Channel 0
+        13.0f, 14.0f,
+
+        // Batch 1, Channel 1
+        15.0f, 16.0f,
+
+        // Batch 1, Channel 2
+        17.0f, 18.0f,
+
+        // Batch 2, Channel 0
+        31.0f, 32.0f,
+
+        // Batch 2, Channel 1
+        33.0f, 34.0f,
+
+        // Batch 2, Channel 2
+        35.0f, 36.0f
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 6, 3, 2 }, ArmnnType, qScale, qOffset);
+    LayerTestResult<T, 3> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory, memoryManager,
+                   { input0TensorInfo, input1TensorInfo, input2TensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   0,
+                   true);
+
+    result.output = MakeTensor<T, 3>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        1.0f, 2.0f,
+
+        // Batch 0, Channel 1
+        3.0f, 4.0f,
+
+        // Batch 0, Channel 2
+        5.0f, 6.0f,
+
+        // Batch 1, Channel 0
+        19.0f, 20.0f,
+
+        // Batch 1, Channel 1
+        21.0f, 22.0f,
+
+        // Batch 1, Channel 2
+        23.0f, 24.0f,
+
+        // Batch 2, Channel 0
+        7.0f, 8.0f,
+
+        // Batch 2, Channel 1
+        9.0f, 10.0f,
+
+        // Batch 2, Channel 2
+        11.0f, 12.0f,
+
+        // Batch 3, Channel 0
+        25.0f, 26.0f,
+
+        // Batch 3, Channel 1
+        27.0f, 28.0f,
+
+        // Batch 3, Channel 2
+        29.0f, 30.0f,
+
+        // Batch 4, Channel 0
+        13.0f, 14.0f,
+
+        // Batch 4, Channel 1
+        15.0f, 16.0f,
+
+        // Batch 4, Channel 2
+        17.0f, 18.0f,
+
+        // Batch 5, Channel 0
+        31.0f, 32.0f,
+
+        // Batch 5, Channel 1
+        33.0f, 34.0f,
+
+        // Batch 5, Channel 2
+        35.0f, 36.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Concat3dDim1DiffInputDimsTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo input0TensorInfo({ 2, 3, 2 }, ArmnnType, qScale, qOffset);
+    auto input0 = MakeTensor<T, 3>(input0TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        1.0f, 2.0f,
+
+        // Batch 0, Channel 1
+        3.0f, 4.0f,
+
+        // Batch 0, Channel 2
+        5.0f, 6.0f,
+
+        // Batch 1, Channel 0
+        19.0f, 20.0f,
+
+        // Batch 1, Channel 1
+        21.0f, 22.0f,
+
+        // Batch 1, Channel 2
+        23.0f, 24.0f
+    }));
+
+    armnn::TensorInfo input1TensorInfo({ 2, 4, 2 }, ArmnnType, qScale, qOffset);
+    auto input1 = MakeTensor<T, 3>(input1TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        7.0f, 8.0f,
+
+        // Batch 0, Channel 1
+        9.0f, 10.0f,
+
+        // Batch 0, Channel 2
+        11.0f, 12.0f,
+
+        // Batch 0, Channel 3
+        25.0f, 26.0f,
+
+        // Batch 1, Channel 0
+        27.0f, 28.0f,
+
+        // Batch 1, Channel 1
+        29.0f, 30.0f,
+
+        // Batch 1, Channel 2
+        13.0f, 14.0f,
+
+        // Batch 1, Channel 3
+        15.0f, 16.0f,
+    }));
+
+    armnn::TensorInfo input2TensorInfo({ 2, 1, 2 }, ArmnnType, qScale, qOffset);
+    auto input2 = MakeTensor<T, 3>(input2TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        17.0f, 18.0f,
+
+        // Batch 1, Channel 0
+        31.0f, 32.0f,
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 2, 8, 2 }, ArmnnType, qScale, qOffset);
+    LayerTestResult<T, 3> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory, memoryManager,
+                   { input0TensorInfo, input1TensorInfo, input2TensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   1,
+                   true);
+
+    result.output = MakeTensor<T, 3>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        1.0f, 2.0f,
+
+        // Batch 0, Channel 1
+        3.0f, 4.0f,
+
+        // Batch 0, Channel 2
+        5.0f, 6.0f,
+
+        // Batch 0, Channel 3
+        7.0f, 8.0f,
+
+        // Batch 0, Channel 4
+        9.0f, 10.0f,
+
+        // Batch 0, Channel 5
+        11.0f, 12.0f,
+
+        // Batch 0, Channel 6
+        25.0f, 26.0f,
+
+        // Batch 0, Channel 7
+        17.0f, 18.0f,
+
+        // Batch 1, Channel 0
+        19.0f, 20.0f,
+
+        // Batch 1, Channel 1
+        21.0f, 22.0f,
+
+        // Batch 1, Channel 2
+        23.0f, 24.0f,
+
+        // Batch 1, Channel 3
+        27.0f, 28.0f,
+
+        // Batch 1, Channel 4
+        29.0f, 30.0f,
+
+        // Batch 1, Channel 5
+        13.0f, 14.0f,
+
+        // Batch 1, Channel 6
+        15.0f, 16.0f,
+
+        // Batch 1, Channel 7
+        31.0f, 32.0f,
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Concat3dDim2DiffInputDimsTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo input0TensorInfo({ 2, 3, 2 }, ArmnnType, qScale, qOffset);
+    auto input0 = MakeTensor<T, 3>(input0TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        1.0f, 2.0f,
+
+        // Batch 0, Channel 1
+        3.0f, 4.0f,
+
+        // Batch 0, Channel 2
+        5.0f, 6.0f,
+
+        // Batch 1, Channel 0
+        19.0f, 20.0f,
+
+        // Batch 1, Channel 1
+        21.0f, 22.0f,
+
+        // Batch 1, Channel 2
+        23.0f, 24.0f
+    }));
+
+    armnn::TensorInfo input1TensorInfo({ 2, 3, 1 }, ArmnnType, qScale, qOffset);
+    auto input1 = MakeTensor<T, 3>(input1TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        7.0f,
+
+        // Batch 0, Channel 1
+        9.0f,
+
+        // Batch 0, Channel 2
+        11.0f,
+
+        // Batch 1, Channel 0
+        25.0f,
+
+        // Batch 1, Channel 1
+        27.0f,
+
+        // Batch 1, Channel 2
+        29.0f
+    }));
+
+    armnn::TensorInfo input2TensorInfo({ 2, 3, 3 }, ArmnnType, qScale, qOffset);
+    auto input2 = MakeTensor<T, 3>(input2TensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        13.0f, 14.0f, 50.0f,
+
+        // Batch 0, Channel 1
+        15.0f, 16.0f, 51.0f,
+
+        // Batch 0, Channel 2
+        17.0f, 18.0f, 52.0f,
+
+        // Batch 1, Channel 0
+        31.0f, 32.0f, 53.0f,
+
+        // Batch 1, Channel 1
+        33.0f, 34.0f, 54.0f,
+
+        // Batch 1, Channel 2
+        35.0f, 36.0f, 55.0f,
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 2, 3, 6 }, ArmnnType, qScale, qOffset);
+    LayerTestResult<T, 3> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory, memoryManager,
+                   { input0TensorInfo, input1TensorInfo, input2TensorInfo },
+                   { input0.data(), input1.data(), input2.data() },
+                   outputTensorInfo,
+                   output.data(),
+                   2,
+                   useSubtensor);
+
+    result.output = MakeTensor<T, 3>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        1.0f, 2.0f, 7.0f, 13.0f, 14.0f, 50.0f,
+
+        // Batch 0, Channel 1
+        3.0f, 4.0f, 9.0f, 15.0f, 16.0f, 51.0f,
+
+        // Batch 0, Channel 2
+        5.0f, 6.0f, 11.0f, 17.0f, 18.0f, 52.0f,
+
+        // Batch 1, Channel 0
+        19.0f, 20.0f, 25.0f, 31.0f, 32.0f, 53.0f,
+
+        // Batch 1, Channel 1
+        21.0f, 22.0f, 27.0f, 33.0f, 34.0f, 54.0f,
+
+        // Batch 1, Channel 2
+        23.0f, 24.0f, 29.0f, 35.0f, 36.0f, 55.0f,
+    }));
+
+    return result;
+}
+
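+// Shared 4D helper: concatenates three 1x3x2x2 input tensors along the given dimension.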
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Concat4dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorInfo& outputTensorInfo,
+    unsigned int dimension,
+    bool useSubtensor,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo inputTensorInfo({ 1, 3, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    auto input0 = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f
+    }));
+
+    auto input1 = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f
+    }));
+
+    auto input2 = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+
+    Concatenate<T>(workloadFactory,
+                   memoryManager,
+                   {inputTensorInfo, inputTensorInfo, inputTensorInfo},
+                   {input0.data(), input1.data(), input2.data()},
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   useSubtensor);
+
+    result.output = MakeTensor<T, 4>(outputTensorInfo, output);
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Concat4dDim0TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 3, 3, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 4> result = Concat4dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, outputTensorInfo, 0, true, qScale, qOffset);
+
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Concat4dDim1TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 1, 9, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 4> result = Concat4dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, outputTensorInfo, 1, true, qScale, qOffset);
+
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Concat4dDim2TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    armnn::TensorInfo outputTensorInfo({ 1, 3, 6, 2 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 4> result = Concat4dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, outputTensorInfo, 2, true, qScale, qOffset);
+
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Concat4dDim3TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool useSubtensor)
+{
+    armnn::TensorInfo outputTensorInfo({ 1, 3, 2, 6 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 4> result = Concat4dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, outputTensorInfo, 3, useSubtensor, qScale, qOffset);
+
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        11.0f, 12.0f,
+        21.0f, 22.0f,
+        3.0f, 4.0f,
+        13.0f, 14.0f,
+        23.0f, 24.0f,
+
+        5.0f, 6.0f,
+        15.0f, 16.0f,
+        25.0f, 26.0f,
+        7.0f, 8.0f,
+        17.0f, 18.0f,
+        27.0f, 28.0f,
+
+        9.0f, 10.0f,
+        19.0f, 20.0f,
+        29.0f, 30.0f,
+        11.0f, 12.0f,
+        21.0f, 22.0f,
+        31.0f, 32.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Concat4dDiffShapeDim0TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    unsigned int dimension = 0;
+    armnn::TensorInfo inputTensorInfo0({ 1, 3, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    auto input0 = MakeTensor<T, 4>(inputTensorInfo0, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f
+    }));
+
+    armnn::TensorInfo inputTensorInfo1({ 2, 3, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    auto input1 = MakeTensor<T, 4>(inputTensorInfo1, QuantizedVector<T>(qScale, qOffset, {
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 3, 3, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory,
+                   memoryManager,
+                   {inputTensorInfo0, inputTensorInfo1},
+                   {input0.data(), input1.data()},
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   true);
+
+    result.output = MakeTensor<T, 4>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f,
+        29.0f, 30.0f,
+        31.0f, 32.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Concat4dDiffShapeDim1TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    unsigned int dimension = 1;
+    armnn::TensorInfo inputTensorInfo0({ 1, 3, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    auto input0 = MakeTensor<T, 4>(inputTensorInfo0, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f
+    }));
+
+    armnn::TensorInfo inputTensorInfo1({ 1, 2, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    auto input1 = MakeTensor<T, 4>(inputTensorInfo1, QuantizedVector<T>(qScale, qOffset, {
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 5, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory,
+                   memoryManager,
+                   {inputTensorInfo0, inputTensorInfo1},
+                   {input0.data(), input1.data()},
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   true);
+
+    result.output = MakeTensor<T, 4>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Concat4dDiffShapeDim2TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    unsigned int dimension = 2;
+    armnn::TensorInfo inputTensorInfo0({ 1, 3, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    auto input0 = MakeTensor<T, 4>(inputTensorInfo0, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f
+    }));
+
+    armnn::TensorInfo inputTensorInfo1({ 1, 3, 3, 2 }, ArmnnType, qScale, qOffset);
+
+    auto input1 = MakeTensor<T, 4>(inputTensorInfo1, QuantizedVector<T>(qScale, qOffset, {
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 3, 5, 2 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory,
+                   memoryManager,
+                   {inputTensorInfo0, inputTensorInfo1},
+                   {input0.data(), input1.data()},
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   true);
+
+    result.output = MakeTensor<T, 4>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        11.0f, 12.0f,
+        13.0f, 14.0f,
+        15.0f, 16.0f,
+
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        17.0f, 18.0f,
+        19.0f, 20.0f,
+        21.0f, 22.0f,
+
+        9.0f, 10.0f,
+        11.0f, 12.0f,
+        23.0f, 24.0f,
+        25.0f, 26.0f,
+        27.0f, 28.0f
+    }));
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Concat4dDiffShapeDim3TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool useSubtensor)
+{
+    unsigned int dimension = 3;
+    armnn::TensorInfo inputTensorInfo0({ 1, 3, 2, 2 }, ArmnnType, qScale, qOffset);
+
+    auto input0 = MakeTensor<T, 4>(inputTensorInfo0, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        5.0f, 6.0f,
+        7.0f, 8.0f,
+        9.0f, 10.0f,
+        11.0f, 12.0f
+    }));
+
+    armnn::TensorInfo inputTensorInfo1({ 1, 3, 2, 3 }, ArmnnType, qScale, qOffset);
+
+    auto input1 = MakeTensor<T, 4>(inputTensorInfo1, QuantizedVector<T>(qScale, qOffset, {
+        11.0f, 12.0f, 13.0f,
+        14.0f, 15.0f, 16.0f,
+
+        17.0f, 18.0f, 19.0f,
+        20.0f, 21.0f, 22.0f,
+
+        23.0f, 24.0f, 25.0f,
+        26.0f, 27.0f, 28.0f
+    }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 3, 2, 5 }, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::vector<T> output;
+    output.resize(outputTensorInfo.GetNumElements());
+    Concatenate<T>(workloadFactory,
+                   memoryManager,
+                   {inputTensorInfo0, inputTensorInfo1},
+                   {input0.data(), input1.data()},
+                   outputTensorInfo,
+                   output.data(),
+                   dimension,
+                   useSubtensor);
+
+    result.output = MakeTensor<T, 4>(outputTensorInfo, output);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, {
+        1.0f, 2.0f, 11.0f, 12.0f, 13.0f,
+        3.0f, 4.0f, 14.0f, 15.0f, 16.0f,
+        5.0f, 6.0f, 17.0f, 18.0f, 19.0f,
+        7.0f, 8.0f, 20.0f, 21.0f, 22.0f,
+        9.0f, 10.0f, 23.0f, 24.0f, 25.0f,
+        11.0f, 12.0f, 26.0f, 27.0f, 28.0f
+    }));
+
+    return result;
+}
+
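+// Exercises Concat where the inputs and the output use different quantization
+// parameters, so the workload must requantize rather than simply copy values.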
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 3> ConcatDifferentInputOutputQParamTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    // Defines the tensor descriptors.
+    armnn::TensorInfo outputTensorInfo({ 3, 6, 3 }, ArmnnType);
+    armnn::TensorInfo inputTensorInfo1({ 3, 6, 2 }, ArmnnType);
+    armnn::TensorInfo inputTensorInfo2({ 3, 6, 1 }, ArmnnType);
+
+    std::vector<armnn::TensorShape> inputTensorShapes({inputTensorInfo1.GetShape(), inputTensorInfo2.GetShape()});
+
+    // Quantized input1 tensor.
+    const float inputScale1 = 0.5f;
+    const int32_t inputOffset1 = 5;
+
+    auto input1 = MakeTensor<T, 3>(inputTensorInfo1, std::vector<T>(
+    {
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9,
+        10, 11, 12,
+        13, 14, 15,
+        16, 17, 18,
+
+        19, 20, 21,
+        22, 23, 24,
+        25, 26, 27,
+        28, 29, 30,
+        31, 32, 33,
+        34, 35, 36
+    }));
+
+    // Quantized input2 tensor.
+    const float inputScale2 = 0.2f;
+    const int32_t inputOffset2 = 10;
+
+    auto input2 = MakeTensor<T, 3>(inputTensorInfo2, std::vector<T>(
+    {
+        37, 38, 39,
+        40, 41, 42,
+        43, 44, 45,
+        46, 47, 48,
+        49, 50, 51,
+        52, 53, 54
+    }));
+
+    // Quantized output tensor.
+    const float outputScale = 0.1f;
+    const int32_t outputOffset = 20;
+
+    LayerTestResult<T, 3> ret(outputTensorInfo);
+
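+    // The expected values follow from affine requantization, real = scale * (q - offset),
+    // using the scales/offsets defined above. For example:
+    //   input1 q=1  -> real = 0.5f * (1 - 5)   = -2.0f -> output q = -2.0f / 0.1f + 20 = 0
+    //   input2 q=37 -> real = 0.2f * (37 - 10) =  5.4f -> output q =  5.4f / 0.1f + 20 = 74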
+    ret.outputExpected = MakeTensor<T, 3>(outputTensorInfo, std::vector<T>(
+    {
+        0,   5,  74,
+        10,  15,  76,
+        20,  25,  78,
+        30,  35,  80,
+        40,  45,  82,
+        50,  55,  84,
+
+        60,  65,  86,
+        70,  75,  88,
+        80,  85,  90,
+        90,  95,  92,
+        100, 105,  94,
+        110, 115,  96,
+
+        120, 125,  98,
+        130, 135, 100,
+        140, 145, 102,
+        150, 155, 104,
+        160, 165, 106,
+        170, 175, 108
+    }));
+
+    outputTensorInfo.SetQuantizationScale(outputScale);
+    outputTensorInfo.SetQuantizationOffset(outputOffset);
+    inputTensorInfo1.SetQuantizationScale(inputScale1);
+    inputTensorInfo1.SetQuantizationOffset(inputOffset1);
+    inputTensorInfo2.SetQuantizationScale(inputScale2);
+    inputTensorInfo2.SetQuantizationOffset(inputOffset2);
+
+    std::vector<unsigned int> wOrigin1 = { 0, 0, 0 }; //Extent of the window is defined by size of input[0].
+    armnn::ConcatQueueDescriptor::ViewOrigin window1(wOrigin1);
+
+    std::vector<unsigned int> wOrigin2 = { 0, 0, 2 }; //Extent of the window is defined by size of input[1].
+    armnn::ConcatQueueDescriptor::ViewOrigin window2(wOrigin2);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
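+    // Where the backend supports it, create the inputs as sub-tensors of the output
+    // so the concatenation can be written in place; otherwise use standalone handles.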
+    bool subTensorsSupported = useSubtensor && workloadFactory.SupportsSubTensors();
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle1 =
+            subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo1.GetShape(), wOrigin1.data()) :
+            workloadFactory.CreateTensorHandle(inputTensorInfo1);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle2 =
+            subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo2.GetShape(), wOrigin2.data()) :
+            workloadFactory.CreateTensorHandle(inputTensorInfo2);
+
+    armnn::ConcatQueueDescriptor data;
+    armnn::OriginsDescriptor desc = armnn::CreateDescriptorForConcatenation(
+            inputTensorShapes.begin(), inputTensorShapes.end(), 2);
+    data.m_Parameters = desc;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo1, inputHandle1.get());
+    AddInputToWorkload(data, info, inputTensorInfo2, inputHandle2.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    data.m_ViewOrigins.push_back(window1);
+    data.m_ViewOrigins.push_back(window2);
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConcat(data, info);
+
+    inputHandle1->Allocate();
+    inputHandle2->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle1.get(), &input1[0][0][0]);
+    CopyDataToITensorHandle(inputHandle2.get(), &input2[0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+//
+// Explicit template instantiations
+//
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 3>
+ConcatDifferentInputOutputQParamTest<armnn::DataType::QuantisedAsymm8>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 3>
+ConcatDifferentInputOutputQParamTest<armnn::DataType::QuantisedSymm16>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+//
+// Implementation functions
+//
+
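+// Concatenates a 2-channel and a 1-channel float tensor along dimension 0,
+// wiring the views up manually through explicit view origins.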
+LayerTestResult<float,3> ConcatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    unsigned int outputWidth = 3;
+    unsigned int outputHeight = 6;
+    unsigned int outputChannels = 3;
+
+    unsigned int inputWidth1 = 3;
+    unsigned int inputHeight1 = 6;
+    unsigned int inputChannels1 = 2;
+
+    unsigned int inputWidth2 = 3;
+    unsigned int inputHeight2 = 6;
+    unsigned int inputChannels2 = 1;
+
+    // Define the tensor descriptors.
+    armnn::TensorInfo outputTensorInfo({ outputChannels, outputHeight, outputWidth }, armnn::DataType::Float32);
+    armnn::TensorInfo inputTensorInfo1({ inputChannels1, inputHeight1, inputWidth1 }, armnn::DataType::Float32);
+    armnn::TensorInfo inputTensorInfo2({ inputChannels2, inputHeight2, inputWidth2 }, armnn::DataType::Float32);
+
+    LayerTestResult<float,3> ret(outputTensorInfo);
+
+    ret.outputExpected = MakeTensor<float, 3>(outputTensorInfo, std::vector<float>(
+    {
+            1.0f, 2.0f, 3.0f,
+            4.0f, 5.0f, 6.0f,
+            7.0f, 8.0f, 9.0f,
+            10.0f, 11.0f, 12.0f,
+            13.0f, 14.0f, 15.0f,
+            16.0f, 17.0f, 18.0f,
+
+            19.0f, 20.0f, 21.0f,
+            22.0f, 23.0f, 24.0f,
+            25.0f, 26.0f, 27.0f,
+            28.0f, 29.0f, 30.0f,
+            31.0f, 32.0f, 33.0f,
+            34.0f, 35.0f, 36.0f,
+
+            37.0f, 38.0f, 39.0f,
+            40.0f, 41.0f, 42.0f,
+            43.0f, 44.0f, 45.0f,
+            46.0f, 47.0f, 48.0f,
+            49.0f, 50.0f, 51.0f,
+            52.0f, 53.0f, 54.0f,
+        })
+    );
+
+    auto input1 = MakeTensor<float, 3>(inputTensorInfo1, std::vector<float>(
+        {
+            1.0f, 2.0f, 3.0f,
+            4.0f, 5.0f, 6.0f,
+            7.0f, 8.0f, 9.0f,
+            10.0f, 11.0f, 12.0f,
+            13.0f, 14.0f, 15.0f,
+            16.0f, 17.0f, 18.0f,
+
+            19.0f, 20.0f, 21.0f,
+            22.0f, 23.0f, 24.0f,
+            25.0f, 26.0f, 27.0f,
+            28.0f, 29.0f, 30.0f,
+            31.0f, 32.0f, 33.0f,
+            34.0f, 35.0f, 36.0f,
+        })
+    );
+
+    auto input2 = MakeTensor<float, 3>(inputTensorInfo2, std::vector<float>(
+        {
+            37.0f, 38.0f, 39.0f,
+            40.0f, 41.0f, 42.0f,
+            43.0f, 44.0f, 45.0f,
+            46.0f, 47.0f, 48.0f,
+            49.0f, 50.0f, 51.0f,
+            52.0f, 53.0f, 54.0f,
+        })
+    );
+
+    std::vector<unsigned int> wOrigin1 = {0, 0, 0}; //Extent of the window is defined by size of input[0].
+    armnn::ConcatQueueDescriptor::ViewOrigin window1(wOrigin1);
+
+    std::vector<unsigned int> wOrigin2 = {2, 0, 0}; //Extent of the window is defined by size of input[1].
+    armnn::ConcatQueueDescriptor::ViewOrigin window2(wOrigin2);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    bool subTensorsSupported = workloadFactory.SupportsSubTensors();
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle1 =
+        subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo1.GetShape(), wOrigin1.data()) :
+            workloadFactory.CreateTensorHandle(inputTensorInfo1);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle2  =
+        subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo2.GetShape(), wOrigin2.data()) :
+            workloadFactory.CreateTensorHandle(inputTensorInfo2);
+
+    armnn::ConcatQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo1, inputHandle1.get());
+    AddInputToWorkload(data, info, inputTensorInfo2, inputHandle2.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    data.m_ViewOrigins.push_back(window1);
+    data.m_ViewOrigins.push_back(window2);
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConcat(data, info);
+
+    inputHandle1->Allocate();
+    inputHandle2->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle1.get(), &input1[0][0][0]);
+    CopyDataToITensorHandle(inputHandle2.get(), &input2[0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+LayerTestResult<float, 1> Concat1dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat1dTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 2> Concat2dDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat2dDim0TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 2> Concat2dDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat2dDim1TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 2> Concat2dDim0DiffInputDimsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat2dDim0DiffInputDimsTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 2> Concat2dDim1DiffInputDimsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat2dDim1DiffInputDimsTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 3> Concat3dDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat3dDim0TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 3> Concat3dDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat3dDim1TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 3> Concat3dDim2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concat3dDim2TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, useSubtensor, 0.0f, 0);
+}
+
+LayerTestResult<float, 3> Concat3dDim0DiffInputDimsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat3dDim0DiffInputDimsTestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 3> Concat3dDim1DiffInputDimsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat3dDim1DiffInputDimsTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 3> Concat3dDim2DiffInputDimsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concat3dDim2DiffInputDimsTestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, useSubtensor, 0.0f, 0);
+}
+
+LayerTestResult<float, 4> Concat4dDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDim0TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 4> Concat4dDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDim1TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 4> Concat4dDim2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDim2TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 4> Concat4dDim3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concat4dDim3TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0, useSubtensor);
+}
+
+LayerTestResult<float, 4> Concat4dDiffShapeDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDiffShapeDim0TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 4> Concat4dDiffShapeDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDiffShapeDim1TestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 4> Concat4dDiffShapeDim2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDiffShapeDim2TestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 4> Concat4dDiffShapeDim3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concat4dDiffShapeDim3TestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 0.0f, 0, useSubtensor);
+}
+
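+// Uint8 Concat where the two inputs have different quantization parameters; the
+// output reuses input1's parameters, so only input2's values are requantized.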
+LayerTestResult<uint8_t, 3> ConcatUint8DifferentQParamsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    unsigned int outputWidth = 3;
+    unsigned int outputHeight = 6;
+    unsigned int outputChannels = 3;
+
+    unsigned int inputWidth1 = 3;
+    unsigned int inputHeight1 = 6;
+    unsigned int inputChannels1 = 2;
+
+    unsigned int inputWidth2 = 3;
+    unsigned int inputHeight2 = 6;
+    unsigned int inputChannels2 = 1;
+
+    // Defines the tensor descriptors.
+    armnn::TensorInfo outputTensorInfo({ outputChannels, outputHeight, outputWidth }, armnn::DataType::QuantisedAsymm8);
+    armnn::TensorInfo inputTensorInfo1({ inputChannels1, inputHeight1, inputWidth1 }, armnn::DataType::QuantisedAsymm8);
+    armnn::TensorInfo inputTensorInfo2({ inputChannels2, inputHeight2, inputWidth2 }, armnn::DataType::QuantisedAsymm8);
+
+    // Quantized input1 tensor. Range [-3, 1]
+    const float inputScale1 = 0.015686f;
+    const int32_t inputOffset1 = 192;
+
+    auto input1 = MakeTensor<uint8_t, 3>(inputTensorInfo1, std::vector<uint8_t>(
+    {
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9,
+        10, 11, 12,
+        13, 14, 15,
+        16, 17, 18,
+
+        19, 20, 21,
+        22, 23, 24,
+        25, 26, 27,
+        28, 29, 30,
+        31, 32, 33,
+        34, 35, 36,
+    })
+    );
+
+    // Quantized input2 tensor. Range [-1, 4]
+    const float inputScale2 = 0.019608f;
+    const int32_t inputOffset2 = 50;
+
+    auto input2 = MakeTensor<uint8_t, 3>(inputTensorInfo2, std::vector<uint8_t>(
+    {
+        37, 38, 39,
+        40, 41, 42,
+        43, 44, 45,
+        46, 47, 48,
+        49, 50, 51,
+        52, 53, 54,
+    })
+    );
+
+    // Output has the same quantization parameters as input1,
+    // so only the requantization of input2 is required.
+    const float outputScale = 0.015686f;
+    const int32_t outputOffset = 192;
+
+    LayerTestResult<uint8_t, 3> ret(outputTensorInfo);
+
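+    // input1 shares the output's quantization parameters, so its values pass through
+    // unchanged. input2 is requantized: e.g. q=37 -> real = 0.019608f * (37 - 50)
+    // = -0.2549f -> output q = -0.2549f / 0.015686f + 192 = 175.75 -> 176.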
+    ret.outputExpected = MakeTensor<uint8_t, 3>(outputTensorInfo, std::vector<uint8_t>(
+    {
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9,
+        10, 11, 12,
+        13, 14, 15,
+        16, 17, 18,
+
+        19, 20, 21,
+        22, 23, 24,
+        25, 26, 27,
+        28, 29, 30,
+        31, 32, 33,
+        34, 35, 36,
+
+        176, 177, 178,
+        179, 181, 182,
+        183, 184, 186,
+        187, 188, 189,
+        191, 192, 193,
+        195, 196, 197,
+    })
+    );
+
+    outputTensorInfo.SetQuantizationScale(outputScale);
+    outputTensorInfo.SetQuantizationOffset(outputOffset);
+    inputTensorInfo1.SetQuantizationScale(inputScale1);
+    inputTensorInfo1.SetQuantizationOffset(inputOffset1);
+    inputTensorInfo2.SetQuantizationScale(inputScale2);
+    inputTensorInfo2.SetQuantizationOffset(inputOffset2);
+
+    std::vector<unsigned int> wOrigin1 = { 0, 0, 0 }; //Extent of the window is defined by size of input[0].
+    armnn::ConcatQueueDescriptor::ViewOrigin window1(wOrigin1);
+
+    std::vector<unsigned int> wOrigin2 = { 2, 0, 0 }; //Extent of the window is defined by size of input[1].
+    armnn::ConcatQueueDescriptor::ViewOrigin window2(wOrigin2);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    bool subTensorsSupported = workloadFactory.SupportsSubTensors();
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle1 =
+            subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo1.GetShape(), wOrigin1.data()) :
+            workloadFactory.CreateTensorHandle(inputTensorInfo1);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle2 =
+            subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo2.GetShape(), wOrigin2.data()) :
+            workloadFactory.CreateTensorHandle(inputTensorInfo2);
+
+    armnn::ConcatQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo1, inputHandle1.get());
+    AddInputToWorkload(data, info, inputTensorInfo2, inputHandle2.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    data.m_ViewOrigins.push_back(window1);
+    data.m_ViewOrigins.push_back(window2);
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConcat(data, info);
+
+    inputHandle1->Allocate();
+    inputHandle2->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle1.get(), &input1[0][0][0]);
+    CopyDataToITensorHandle(inputHandle2.get(), &input2[0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+LayerTestResult<uint8_t, 3> ConcatUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    unsigned int outputWidth = 3;
+    unsigned int outputHeight = 6;
+    unsigned int outputChannels = 3;
+
+    unsigned int inputWidth1 = 3;
+    unsigned int inputHeight1 = 6;
+    unsigned int inputChannels1 = 2;
+
+    unsigned int inputWidth2 = 3;
+    unsigned int inputHeight2 = 6;
+    unsigned int inputChannels2 = 1;
+
+    // Defines the tensor descriptors.
+    armnn::TensorInfo outputTensorInfo({ outputChannels, outputHeight, outputWidth }, armnn::DataType::QuantisedAsymm8);
+    armnn::TensorInfo inputTensorInfo1({ inputChannels1, inputHeight1, inputWidth1 }, armnn::DataType::QuantisedAsymm8);
+    armnn::TensorInfo inputTensorInfo2({ inputChannels2, inputHeight2, inputWidth2 }, armnn::DataType::QuantisedAsymm8);
+
+    // Arbitrary scale and offset. They don't really matter, as the Concat operator copies the
+    // quantized values verbatim and never dequantizes/requantizes them.
+    const float scale = 0.13497836f;
+    const int32_t offset = -7;
+
+    outputTensorInfo.SetQuantizationScale(scale);
+    outputTensorInfo.SetQuantizationOffset(offset);
+    inputTensorInfo1.SetQuantizationScale(scale);
+    inputTensorInfo1.SetQuantizationOffset(offset);
+    inputTensorInfo2.SetQuantizationScale(scale);
+    inputTensorInfo2.SetQuantizationOffset(offset);
+
+    LayerTestResult<uint8_t, 3> ret(outputTensorInfo);
+
+    ret.outputExpected = MakeTensor<uint8_t, 3>(outputTensorInfo, std::vector<uint8_t>(
+        {
+            1, 2, 3,
+            4, 5, 6,
+            7, 8, 9,
+            10, 11, 12,
+            13, 14, 15,
+            16, 17, 18,
+
+            19, 20, 21,
+            22, 23, 24,
+            25, 26, 27,
+            28, 29, 30,
+            31, 32, 33,
+            34, 35, 36,
+
+            37, 38, 39,
+            40, 41, 42,
+            43, 44, 45,
+            46, 47, 48,
+            49, 50, 51,
+            52, 53, 54,
+        })
+    );
+
+    auto input1 = MakeTensor<uint8_t, 3>(inputTensorInfo1, std::vector<uint8_t>(
+    {
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9,
+        10, 11, 12,
+        13, 14, 15,
+        16, 17, 18,
+
+        19, 20, 21,
+        22, 23, 24,
+        25, 26, 27,
+        28, 29, 30,
+        31, 32, 33,
+        34, 35, 36,
+    })
+    );
+
+    auto input2 = MakeTensor<uint8_t, 3>(inputTensorInfo2, std::vector<uint8_t>(
+    {
+        37, 38, 39,
+        40, 41, 42,
+        43, 44, 45,
+        46, 47, 48,
+        49, 50, 51,
+        52, 53, 54,
+    })
+    );
+
+    std::vector<unsigned int> wOrigin1 = { 0, 0, 0 }; // Extent of the window is defined by the size of input[0].
+    armnn::ConcatQueueDescriptor::ViewOrigin window1(wOrigin1);
+
+    // Input[1] starts at channel offset 2, directly after input[0]'s two channels;
+    // the extent of the window is defined by the size of input[1].
+    std::vector<unsigned int> wOrigin2 = { 2, 0, 0 };
+    armnn::ConcatQueueDescriptor::ViewOrigin window2(wOrigin2);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    bool subTensorsSupported = workloadFactory.SupportsSubTensors();
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle1 =
+        subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo1.GetShape(), wOrigin1.data()) :
+            workloadFactory.CreateTensorHandle(inputTensorInfo1);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle2 =
+        subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo2.GetShape(), wOrigin2.data()) :
+            workloadFactory.CreateTensorHandle(inputTensorInfo2);
+
+    armnn::ConcatQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo1, inputHandle1.get());
+    AddInputToWorkload(data, info, inputTensorInfo2, inputHandle2.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    data.m_ViewOrigins.push_back(window1);
+    data.m_ViewOrigins.push_back(window2);
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConcat(data, info);
+
+    inputHandle1->Allocate();
+    inputHandle2->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle1.get(), &input1[0][0][0]);
+    CopyDataToITensorHandle(inputHandle2.get(), &input2[0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+LayerTestResult<uint16_t, 3> ConcatUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    unsigned int outputWidth = 3;
+    unsigned int outputHeight = 6;
+    unsigned int outputChannels = 3;
+
+    unsigned int inputWidth1 = 3;
+    unsigned int inputHeight1 = 6;
+    unsigned int inputChannels1 = 2;
+
+    unsigned int inputWidth2 = 3;
+    unsigned int inputHeight2 = 6;
+    unsigned int inputChannels2 = 1;
+
+    // Defines the tensor descriptors.
+    armnn::TensorInfo outputTensorInfo({ outputChannels, outputHeight, outputWidth }, armnn::DataType::QuantisedSymm16);
+    armnn::TensorInfo inputTensorInfo1({ inputChannels1, inputHeight1, inputWidth1 }, armnn::DataType::QuantisedSymm16);
+    armnn::TensorInfo inputTensorInfo2({ inputChannels2, inputHeight2, inputWidth2 }, armnn::DataType::QuantisedSymm16);
+
+    // Arbitrary scale and offset. They don't really matter, as the Concat operator copies the
+    // quantized values verbatim and never dequantizes/requantizes them.
+    const float scale = 0.13497836f;
+    const int32_t offset = -7;
+
+    outputTensorInfo.SetQuantizationScale(scale);
+    outputTensorInfo.SetQuantizationOffset(offset);
+    inputTensorInfo1.SetQuantizationScale(scale);
+    inputTensorInfo1.SetQuantizationOffset(offset);
+    inputTensorInfo2.SetQuantizationScale(scale);
+    inputTensorInfo2.SetQuantizationOffset(offset);
+
+    LayerTestResult<uint16_t, 3> ret(outputTensorInfo);
+
+    ret.outputExpected = MakeTensor<uint16_t, 3>(outputTensorInfo, std::vector<uint16_t>(
+    {
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9,
+        10, 11, 12,
+        13, 14, 15,
+        16, 17, 18,
+
+        19, 20, 21,
+        22, 23, 24,
+        25, 26, 27,
+        28, 29, 30,
+        31, 32, 33,
+        34, 35, 36,
+
+        37, 38, 39,
+        40, 41, 42,
+        43, 44, 45,
+        46, 47, 48,
+        49, 50, 51,
+        52, 53, 54,
+    }));
+
+    auto input1 = MakeTensor<uint16_t, 3>(inputTensorInfo1, std::vector<uint16_t>(
+    {
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9,
+        10, 11, 12,
+        13, 14, 15,
+        16, 17, 18,
+
+        19, 20, 21,
+        22, 23, 24,
+        25, 26, 27,
+        28, 29, 30,
+        31, 32, 33,
+        34, 35, 36,
+    }));
+
+    auto input2 = MakeTensor<uint16_t, 3>(inputTensorInfo2, std::vector<uint16_t>(
+    {
+        37, 38, 39,
+        40, 41, 42,
+        43, 44, 45,
+        46, 47, 48,
+        49, 50, 51,
+        52, 53, 54,
+    }));
+
+    std::vector<unsigned int> wOrigin1 = { 0, 0, 0 }; // Extent of the window is defined by the size of input[0].
+    armnn::ConcatQueueDescriptor::ViewOrigin window1(wOrigin1);
+
+    // Input[1] starts at channel offset 2, directly after input[0]'s two channels;
+    // the extent of the window is defined by the size of input[1].
+    std::vector<unsigned int> wOrigin2 = { 2, 0, 0 };
+    armnn::ConcatQueueDescriptor::ViewOrigin window2(wOrigin2);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    bool subTensorsSupported = workloadFactory.SupportsSubTensors();
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle1 =
+            subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo1.GetShape(), wOrigin1.data()) :
+            workloadFactory.CreateTensorHandle(inputTensorInfo1);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle2 =
+            subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle, inputTensorInfo2.GetShape(), wOrigin2.data()) :
+            workloadFactory.CreateTensorHandle(inputTensorInfo2);
+
+    armnn::ConcatQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo1, inputHandle1.get());
+    AddInputToWorkload(data, info, inputTensorInfo2, inputHandle2.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    data.m_ViewOrigins.push_back(window1);
+    data.m_ViewOrigins.push_back(window2);
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConcat(data, info);
+
+    inputHandle1->Allocate();
+    inputHandle2->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle1.get(), &input1[0][0][0]);
+    CopyDataToITensorHandle(inputHandle2.get(), &input2[0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+LayerTestResult<uint8_t, 1> Concat1dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat1dTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 2> Concat2dDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat2dDim0TestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 2> Concat2dDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat2dDim1TestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 2> Concat2dDim0DiffInputDimsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat2dDim0DiffInputDimsTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 2> Concat2dDim1DiffInputDimsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat2dDim1DiffInputDimsTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 3> Concat3dDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat3dDim0TestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 3> Concat3dDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat3dDim1TestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 3> Concat3dDim2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concat3dDim2TestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, useSubtensor, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 3> Concat3dDim0DiffInputDimsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat3dDim0TestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 3> Concat3dDim1DiffInputDimsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat3dDim1DiffInputDimsTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 3> Concat3dDim2DiffInputDimsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concat3dDim2DiffInputDimsTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, useSubtensor, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concat4dDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDim0TestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concat4dDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDim1TestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concat4dDim2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDim2TestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concat4dDim3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager, bool useSubtensor)
+{
+    return Concat4dDim3TestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 0.5f, -1, useSubtensor);
+}
+
+LayerTestResult<uint8_t, 4> Concat4dDiffShapeDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDiffShapeDim0TestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concat4dDiffShapeDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDiffShapeDim1TestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concat4dDiffShapeDim2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Concat4dDiffShapeDim2TestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<uint8_t, 4> Concat4dDiffShapeDim3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor)
+{
+    return Concat4dDiffShapeDim3TestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 0.5f, -1, useSubtensor);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/ConcatTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ConcatTestImpl.hpp
new file mode 100644
index 0000000..421d03a
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConcatTestImpl.hpp
@@ -0,0 +1,205 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> ConcatDifferentInputOutputQParamTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<float, 3> ConcatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> ConcatUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint16_t, 3> ConcatUint16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
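+
+// These functions are intended to be invoked from backend-specific test
+// suites. A minimal sketch of such a registration (the macro follows the
+// convention used by the backend unit tests; the test names are illustrative):
+//
+//     ARMNN_AUTO_TEST_CASE(SimpleConcat, ConcatTest)
+//     ARMNN_AUTO_TEST_CASE(ConcatUint8, ConcatUint8Test)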
+
+LayerTestResult<uint8_t, 3> ConcatUint8DifferentQParamsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 1> Concat1dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> Concat2dDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> Concat2dDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> Concat2dDim0DiffInputDimsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> Concat2dDim1DiffInputDimsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 3> Concat3dDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 3> Concat3dDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 3> Concat3dDim2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<float, 3> Concat3dDim0DiffInputDimsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 3> Concat3dDim1DiffInputDimsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 3> Concat3dDim2DiffInputDimsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<float, 4> Concat4dDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concat4dDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concat4dDim2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concat4dDim3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<float, 4> Concat4dDiffShapeDim0Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concat4dDiffShapeDim1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concat4dDiffShapeDim2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Concat4dDiffShapeDim3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<uint8_t, 4> Concat4dDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concat4dDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concat4dDim2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concat4dDim3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<uint8_t, 4> Concat4dDiffShapeDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concat4dDiffShapeDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concat4dDiffShapeDim2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Concat4dDiffShapeDim3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<uint8_t, 1> Concat1dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 2> Concat2dDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 2> Concat2dDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 2> Concat2dDim0DiffInputDimsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 2> Concat2dDim1DiffInputDimsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> Concat3dDim0Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> Concat3dDim1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> Concat3dDim2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<uint8_t, 3> Concat3dDim0DiffInputDimsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> Concat3dDim1DiffInputDimsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> Concat3dDim2DiffInputDimsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<uint8_t, 3> ConcatDifferentInputOutputQParamUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
+
+LayerTestResult<int16_t, 3> ConcatDifferentInputOutputQParamInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool useSubtensor);
diff --git a/src/backends/backendsCommon/test/layerTests/ConstantTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ConstantTestImpl.cpp
new file mode 100644
index 0000000..c3cacd5
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConstantTestImpl.cpp
@@ -0,0 +1,154 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ConstantTestImpl.hpp"
+
+#include <Permute.hpp>
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ConstantTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    constexpr unsigned int inputWidth = 3;
+    constexpr unsigned int inputHeight = 4;
+    constexpr unsigned int inputChannels = 3;
+    constexpr unsigned int inputBatchSize = 2;
+
+    constexpr unsigned int outputWidth = inputWidth;
+    constexpr unsigned int outputHeight = inputHeight;
+    constexpr unsigned int outputChannels = inputChannels;
+    constexpr unsigned int outputBatchSize = inputBatchSize;
+
+    armnn::TensorInfo inputTensorInfo({ inputBatchSize, inputChannels, inputHeight, inputWidth },
+                                        ArmnnType, qScale, qOffset);
+
+    armnn::TensorInfo outputTensorInfo({ outputBatchSize, outputChannels, outputHeight, outputWidth },
+                                         ArmnnType, qScale, qOffset);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+        // Batch 0, Channel 0
+        235.0f,  46.0f, 178.0f,
+        100.0f, 123.0f,  19.0f,
+        172.0f,  74.0f, 250.0f,
+          6.0f, 195.0f,  80.0f,
+
+        // Batch 0, Channel 1
+        113.0f,  95.0f, 202.0f,
+         77.0f, 114.0f,  71.0f,
+        122.0f, 246.0f, 166.0f,
+         82.0f,  28.0f,  37.0f,
+
+        // Batch 0, Channel 2
+         56.0f, 170.0f, 162.0f,
+        194.0f,  89.0f, 254.0f,
+         12.0f, 209.0f, 200.0f,
+          1.0f,  64.0f,  54.0f,
+
+        // Batch 1, Channel 0
+         67.0f,  90.0f,  49.0f,
+          7.0f, 163.0f,  18.0f,
+         25.0f, 117.0f, 103.0f,
+        247.0f,  59.0f, 189.0f,
+
+        // Batch 1, Channel 1
+        239.0f, 104.0f, 199.0f,
+         17.0f, 124.0f, 153.0f,
+        222.0f, 217.0f,  75.0f,
+         32.0f, 126.0f,  21.0f,
+
+        // Batch 1, Channel 2
+         97.0f, 145.0f, 215.0f,
+        115.0f, 116.0f, 238.0f,
+        226.0f,  16.0f, 132.0f,
+         92.0f, 125.0f,  88.0f,
+    })));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = input;
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ScopedCpuTensorHandle constantTensor(inputTensorInfo);
+    AllocateAndCopyDataToITensorHandle(&constantTensor, &input[0][0][0][0]);
+
+    armnn::ConstantQueueDescriptor descriptor;
+    descriptor.m_LayerOutput = &constantTensor;
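+    // The Constant workload takes no inputs; it simply copies m_LayerOutput
+    // into the output tensor, so the expected output equals the input verbatim.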
+
+    armnn::WorkloadInfo info;
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConstant(descriptor, info);
+
+    outputHandle->Allocate();
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
+
+} // anonymous namespace
+
+LayerTestResult<float, 4> ConstantTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ConstantTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<int16_t, 4> ConstantInt16SimpleQuantizationScaleNoOffsetTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ConstantTestImpl<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 1.0f, 0);
+}
+
+LayerTestResult<uint8_t, 4> ConstantUint8SimpleQuantizationScaleNoOffsetTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ConstantTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 1.0f, 0);
+}
+
+LayerTestResult<uint8_t, 4> ConstantUint8CustomQuantizationScaleAndOffsetTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ConstantTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 2e-6f, 1);
+}
+
+LayerTestResult<int16_t, 4> ConstantInt16CustomQuantizationScaleAndOffsetTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return ConstantTestImpl<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 2e-6f, 1);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/ConstantTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ConstantTestImpl.hpp
new file mode 100644
index 0000000..fa3e228
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConstantTestImpl.hpp
@@ -0,0 +1,31 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 4> ConstantTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> ConstantUint8SimpleQuantizationScaleNoOffsetTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> ConstantInt16SimpleQuantizationScaleNoOffsetTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> ConstantUint8CustomQuantizationScaleAndOffsetTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> ConstantInt16CustomQuantizationScaleAndOffsetTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp
new file mode 100644
index 0000000..01c1b18
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.cpp
@@ -0,0 +1,3145 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "Conv2dTestImpl.hpp"
+
+#include <DataLayoutIndexed.hpp>
+#include <Permute.hpp>
+#include <TensorUtils.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <boost/numeric/conversion/cast.hpp>
+
+#include <string>
+
+//
+// Static data
+//
+
+// 2-channel bias used by a number of Conv2d tests.
+static std::vector<float> Bias2({0, 2});
+
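+// 4-channel bias used by Conv2d tests with four output channels.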
+static std::vector<float> Bias4({1, 2, 3, 4});
+
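+// 8-channel bias used by Conv2d tests with eight output channels.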
+static std::vector<float> Bias8({1, 2, 3, 4, 1, 2, 3, 4});
+
+// 3-channel 16x8 image used as common input data for a number of Conv2d tests.
+static std::vector<float> ConvInput3x8x16({
+    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
+    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
+});
+
+//
+// Helper templates
+//
+
+// Helper template that returns either Bias2 or an empty vector depending on whether bias is enabled.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+boost::multi_array<T, 1> GetBias2(bool biasEnabled, float qScale)
+{
+    if(biasEnabled)
+    {
+        armnn::TensorInfo biasDesc({static_cast<unsigned int>(Bias2.size())}, ArmnnType);
+        boost::multi_array<T, 1> bias = MakeTensor<T, 1>(biasDesc, QuantizedVector<T>(qScale, 0.0f, Bias2));
+        return bias;
+    }
+    else
+    {
+        return boost::multi_array<T, 1>();
+    }
+}
+
+// Helper template that returns either Bias4 or an empty vector depending on whether bias is enabled.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+boost::multi_array<T, 1> GetBias4(bool biasEnabled, float qScale)
+{
+    if(biasEnabled)
+    {
+        armnn::TensorInfo biasDesc({static_cast<unsigned int>(Bias4.size())}, ArmnnType);
+        boost::multi_array<T, 1> bias = MakeTensor<T, 1>(biasDesc, QuantizedVector<T>(qScale, 0.0f, Bias4));
+        return bias;
+    }
+    else
+    {
+        return boost::multi_array<T, 1>();
+    }
+}
+
+// Helper template that returns either Bias8 or an empty vector depending on whether bias is enabled.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+boost::multi_array<T, 1> GetBias8(bool biasEnabled, float qScale)
+{
+    if(biasEnabled)
+    {
+        armnn::TensorInfo biasDesc({static_cast<unsigned int>(Bias8.size())}, ArmnnType);
+        boost::multi_array<T, 1> bias = MakeTensor<T, 1>(biasDesc, QuantizedVector<T>(qScale, 0.0f, Bias8));
+        return bias;
+    }
+    else
+    {
+        return boost::multi_array<T, 1>();
+    }
+}
+
+// Helper template that returns Bias2, Bias4 or Bias8 (chosen by the number of output
+// channels) or an empty vector depending on whether bias is enabled.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+boost::multi_array<T, 1> GetBias(bool biasEnabled, float qScale, armnn::TensorInfo outputInfo, armnn::DataLayout layout)
+{
+    const armnnUtils::DataLayoutIndexed dataLayoutIndexed(layout);
+    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
+    const unsigned int outputChannels = outputInfo.GetShape()[channelsIndex];
+
+    switch (outputChannels)
+    {
+        case 2:
+        default:
+        {
+            return GetBias2<ArmnnType>(biasEnabled, qScale);
+        }
+        case 4:
+        {
+            return GetBias4<ArmnnType>(biasEnabled, qScale);
+        }
+        case 8:
+        {
+            return GetBias8<ArmnnType>(biasEnabled, qScale);
+        }
+    }
+}
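+
+// For example, with an output whose channel dimension is 4,
+// GetBias<armnn::DataType::Float32>(true, 1.0f, outputInfo, layout) returns
+// Bias4; channel counts other than 4 and 8 fall back to Bias2 via the
+// default case above.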
+
+//
+// Implementation templates
+//
+
+// Mapping from input type to bias type for fully connected layers.
+// float => float, uint8_t => int32_t
+template<typename T>
+struct FullyConnectedBiasTypeForInputType;
+
+template<>
+struct FullyConnectedBiasTypeForInputType<float>
+{
+    using Type = float;
+};
+
+template<>
+struct FullyConnectedBiasTypeForInputType<uint8_t>
+{
+    using Type = int32_t;
+};
+
+// Modifies a std::vector in-place using a specified bias.
+template<typename T, typename B>
+void ApplyBias(std::vector<T>& v, float vScale, int32_t vOffset,
+    const std::vector<B>& bias, float bScale, int32_t bOffset, uint32_t w, uint32_t h)
+{
+    BOOST_ASSERT_MSG((armnn::IsQuantizedType<T>() && vScale != 0.0f) || (!armnn::IsQuantizedType<T>()),
+                     "Invalid type and parameter combination.");
+    BOOST_ASSERT_MSG((armnn::IsQuantizedType<B>() && bScale != 0.0f) || (!armnn::IsQuantizedType<B>()),
+                     "Invalid type and parameter combination.");
+
+    // Note we need to dequantize and re-quantize the image value and the bias.
+    for (uint32_t i = 0; i < bias.size(); ++i)
+    {
+        float dBias = SelectiveDequantize(bias[i], bScale, bOffset);
+        for (uint32_t y = 0; y < h; ++y)
+        {
+            for (uint32_t x = 0; x < w; ++x)
+            {
+                uint32_t offset = (i * h + y) * w + x;
+                BOOST_ASSERT(offset < v.size());
+                T& outRef = v[offset];
+                float dOutput = SelectiveDequantize(outRef, vScale, vOffset);
+                outRef = SelectiveQuantize<T>(dOutput + dBias, vScale, vOffset);
+            }
+        }
+    }
+}
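+
+// For quantized types the loop above computes out = Q(D(out) + D(bias)),
+// where D(q) = scale * (q - offset) and Q(x) = round(x / scale) + offset;
+// for float, SelectiveDequantize/SelectiveQuantize are pass-throughs, so the
+// update reduces to out += bias.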
+
+//
+// Convolution2d implementations
+//
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType,
+         typename T = armnn::ResolveType<ArmnnType>, typename B = armnn::ResolveType<ArmnnBType>>
+LayerTestResult<T, 4> SimpleConvolution2dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const boost::multi_array<T, 4>& originalInput,
+    const boost::multi_array<T, 4>& originalKernel,
+    const boost::multi_array<B, 1>& bias,
+    const boost::multi_array<T, 4>& originalOutputExpected,
+    float qScale,
+    int32_t qOffset,
+    const armnn::DataLayout layout = armnn::DataLayout::NCHW,
+    uint32_t padLeft = 0,
+    uint32_t padTop = 0,
+    uint32_t padRight = 0,
+    uint32_t padBottom = 0,
+    uint32_t strideX = 1,
+    uint32_t strideY = 1,
+    uint32_t dilationX = 1,
+    uint32_t dilationY = 1)
+{
+    unsigned int inputHeight   = boost::numeric_cast<unsigned int>(originalInput.shape()[2]);
+    unsigned int inputWidth    = boost::numeric_cast<unsigned int>(originalInput.shape()[3]);
+    unsigned int inputChannels = boost::numeric_cast<unsigned int>(originalInput.shape()[1]);
+    unsigned int inputNum      = boost::numeric_cast<unsigned int>(originalInput.shape()[0]);
+
+    unsigned int outputHeight   = boost::numeric_cast<unsigned int>(originalOutputExpected.shape()[2]);
+    unsigned int outputWidth    = boost::numeric_cast<unsigned int>(originalOutputExpected.shape()[3]);
+    unsigned int outputChannels = boost::numeric_cast<unsigned int>(originalOutputExpected.shape()[1]);
+    unsigned int outputNum      = boost::numeric_cast<unsigned int>(originalOutputExpected.shape()[0]);
+
+    unsigned int kernelHeight = boost::numeric_cast<unsigned int>(originalKernel.shape()[2]);
+    unsigned int kernelWidth = boost::numeric_cast<unsigned int>(originalKernel.shape()[3]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(originalKernel.shape()[1]);
+    unsigned int kernelDepthMul = boost::numeric_cast<unsigned int>(originalKernel.shape()[0]);
+
+    bool biasEnabled = bias.size() > 0;
+
+    // This function currently assumes 1 batch of input/output (and duplicates this into 2 batches).
+    BOOST_ASSERT(inputNum == 1);
+    BOOST_ASSERT(outputNum == 1);
+
+    // If a bias is used, its size must equal the number of output channels.
+    BOOST_ASSERT(!biasEnabled || bias.size() == outputChannels);
+
+    // Note these tensors will use two (identical) batches.
+    armnn::TensorInfo inputTensorInfo =
+            armnnUtils::GetTensorInfo(2*inputNum, inputChannels, inputHeight, inputWidth, layout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo =
+            armnnUtils::GetTensorInfo(2*outputNum, outputChannels, outputHeight, outputWidth, layout, ArmnnType);
+    armnn::TensorInfo kernelDesc =
+            armnnUtils::GetTensorInfo(kernelDepthMul, kernelChannels, kernelHeight, kernelWidth, layout, ArmnnType);
+    armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, ArmnnBType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+        kernelDesc.SetQuantizationScale(qScale);
+        kernelDesc.SetQuantizationOffset(qOffset);
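+        // The bias follows the usual convention: scale = inputScale * weightScale
+        // (both equal qScale here) and a zero quantization offset.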
+        biasDesc.SetQuantizationScale(qScale*qScale);
+        biasDesc.SetQuantizationOffset(0);
+    }
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+
+    // Construct input data - two batches of the same input image.
+    std::vector<T> inputImage;
+    inputImage.assign(originalInput.data(), originalInput.data() + 1*inputChannels*inputHeight*inputWidth);
+    std::vector<T> inputData;
+    inputData.insert(inputData.end(), inputImage.begin(), inputImage.end());
+    inputData.insert(inputData.end(), inputImage.begin(), inputImage.end());
+
+    // At this point, permute the input data into NHWC order if that layout was requested.
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
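+    // Note: armnn::PermutationVector gives, for each source dimension, its index
+    // in the destination; { 0, 3, 1, 2 } sends N->0, C->3, H->1, W->2, i.e. it
+    // rearranges NCHW data into NHWC order.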
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(T));
+        inputData = tmp;
+    }
+
+    auto batchedInput = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    std::vector<T> outputImage;
+    outputImage.assign(originalOutputExpected.data(),
+            originalOutputExpected.data() + outputChannels*outputHeight*outputWidth);
+
+    // Apply bias to output image if it is enabled.
+    if(biasEnabled)
+    {
+        std::vector<T> biasV;
+        biasV.assign(bias.data(), bias.data() + outputChannels);
+        ApplyBias(outputImage, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(),
+            biasV, biasDesc.GetQuantizationScale(), biasDesc.GetQuantizationOffset(),
+            outputWidth, outputHeight);
+    }
+
+    // Construct expected output data - two identical images.
+    std::vector<T> outputData;
+    outputData.insert(outputData.end(), outputImage.begin(), outputImage.end());
+    outputData.insert(outputData.end(), outputImage.begin(), outputImage.end());
+
+    // Likewise, permute the expected output into NHWC order if requested.
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp.data(), sizeof(T));
+        outputData = tmp;
+    }
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::Convolution2dQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
+    armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
+    // Permute the kernel if necessary
+    boost::multi_array<T, 4> kernel = boost::multi_array<T, 4>(originalKernel);
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, originalKernel.data(), kernel.data(), sizeof(T));
+    }
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
+
+    if(biasEnabled)
+    {
+        AllocateAndCopyDataToITensorHandle(&biasTensor, &bias[0]);
+    }
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    data.m_Weight = &weightsTensor;
+    data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - can be a source of bugs.
+    data.m_Parameters.m_StrideX = strideX;
+    data.m_Parameters.m_StrideY = strideY;
+    data.m_Parameters.m_PadLeft = padLeft;
+    data.m_Parameters.m_PadRight = padRight;
+    data.m_Parameters.m_PadTop = padTop;
+    data.m_Parameters.m_PadBottom = padBottom;
+    data.m_Parameters.m_BiasEnabled = biasEnabled;
+    data.m_Parameters.m_DataLayout = layout;
+    data.m_Parameters.m_DilationX = dilationX;
+    data.m_Parameters.m_DilationY = dilationY;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvolution2d(data, info);
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &batchedInput[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType,
+         typename T = armnn::ResolveType<ArmnnType>, typename B = armnn::ResolveType<ArmnnBType>>
+LayerTestResult<T, 4> SimpleConvolution2dNhwcTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const boost::multi_array<T, 4>& input,
+    const boost::multi_array<T, 4>& kernel,
+    const boost::multi_array<B, 1>& bias,
+    const boost::multi_array<T, 4>& outputExpected,
+    const armnn::DataLayout dataLayout,
+    float qScale,
+    int32_t qOffset,
+    uint32_t padLeft = 1,
+    uint32_t padTop = 1,
+    uint32_t padRight = 1,
+    uint32_t padBottom = 1,
+    uint32_t strideX  = 1,
+    uint32_t strideY  = 1)
+{
+    unsigned int inputNum       = boost::numeric_cast<unsigned int>(input.shape()[0]);
+    unsigned int inputChannels  = boost::numeric_cast<unsigned int>(input.shape()[3]);
+    unsigned int inputHeight    = boost::numeric_cast<unsigned int>(input.shape()[1]);
+    unsigned int inputWidth     = boost::numeric_cast<unsigned int>(input.shape()[2]);
+
+    unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(kernel.shape()[0]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
+    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
+    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+
+    unsigned int outputNum      = boost::numeric_cast<unsigned int>(outputExpected.shape()[0]);
+    unsigned int outputChannels = boost::numeric_cast<unsigned int>(outputExpected.shape()[3]);
+    unsigned int outputHeight   = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
+    unsigned int outputWidth    = boost::numeric_cast<unsigned int>(outputExpected.shape()[2]);
+
+    bool biasEnabled = bias.size() > 0;
+
+    // Creates the tensors.
+    armnn::TensorInfo inputTensorInfo({inputNum, inputHeight, inputWidth, inputChannels}, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({outputNum, outputHeight, outputWidth, outputChannels},
+                                       ArmnnType);
+    armnn::TensorInfo kernelDesc({kernelChanMul, kernelHeight, kernelWidth, kernelChannels}, ArmnnType);
+    armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, ArmnnBType);
+
+    // Construct the input data.
+    std::vector<T> inputData;
+    inputData.assign(input.data(), input.data() + inputHeight*inputWidth*inputChannels);
+    auto batchedInput = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    // Construct the output data, with bias applied, as appropriate.
+    std::vector<T> outputData;
+    outputData.assign(outputExpected.data(), outputExpected.data() + outputHeight*outputWidth*outputChannels);
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
+
+    armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
+
+    armnn::Convolution2dQueueDescriptor data;
+
+    data.m_Weight = &weightsTensor;
+    data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - can be a source of bugs.
+    data.m_Parameters.m_StrideX = strideX;
+    data.m_Parameters.m_StrideY = strideY;
+    data.m_Parameters.m_PadLeft = padLeft;
+    data.m_Parameters.m_PadRight = padRight;
+    data.m_Parameters.m_PadTop = padTop;
+    data.m_Parameters.m_PadBottom = padBottom;
+    data.m_Parameters.m_BiasEnabled = biasEnabled;
+    data.m_Parameters.m_DataLayout = dataLayout;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvolution2d(data, info);
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &batchedInput[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T,4> Convolution1dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool biasEnabled)
+{
+    using B = armnn::ResolveType<ArmnnBType>;
+    // Until we have a specialist 1D convolution layer, we can fake one using
+    // 2D convolution with the final dimension set to 1.
+    // I don't anticipate this being particularly slow, given that convolution is implemented
+    // as a matrix multiplication, at which point dimension doesn't matter.
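+    //
+    // The shape mapping used below is:
+    //   input  [N, C, L]     -> [N, C, L, 1]
+    //   kernel [O, I, K]     -> [O, I, K, 1]
+    //   output [N, O, L_out] -> [N, O, L_out, 1]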
+
+    unsigned int batchSize      = 1;
+    unsigned int inputChannels  = 2;
+    unsigned int outputChannels = 3;
+    unsigned int inputSize      = 5; // The 1D size (could view as 'width' or 'height').
+    unsigned int kernelSize     = 3;
+    unsigned int padSize        = 2;
+    unsigned int stride         = 1;
+    unsigned int outputSize     = 7; // (inputSize + 2 * padSize - kernelSize + 1) / stride.
+
+    armnn::TensorInfo inputInfo({batchSize, inputChannels, inputSize, 1}, ArmnnType);
+    armnn::TensorInfo outputInfo({batchSize, outputChannels, outputSize, 1}, ArmnnType);
+    armnn::TensorInfo kernelInfo({outputChannels, inputChannels, kernelSize, 1}, ArmnnType);
+    armnn::TensorInfo biasInfo({outputChannels}, ArmnnBType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputInfo.SetQuantizationScale(qScale);
+        inputInfo.SetQuantizationOffset(qOffset);
+        outputInfo.SetQuantizationScale(qScale);
+        outputInfo.SetQuantizationOffset(qOffset);
+        kernelInfo.SetQuantizationScale(qScale);
+        kernelInfo.SetQuantizationOffset(qOffset);
+        biasInfo.SetQuantizationScale(inputInfo.GetQuantizationScale()*kernelInfo.GetQuantizationScale());
+        biasInfo.SetQuantizationOffset(0);
+    }
+
+    std::vector<T> inputData(
+        QuantizedVector<T>(inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(), {
+            5.0f, -2.0f, 2.5f, 0.0f, 1.0f,
+            -3.0f, 3.2f, 5.0f, 2.0f, 3.0f,
+        }));
+
+    std::vector<T> kernelData(
+        QuantizedVector<T>(kernelInfo.GetQuantizationScale(), kernelInfo.GetQuantizationOffset(), {
+            1.0f, 0.0f, 0.0f,
+            0.0f, 2.0f, -1.5f,
+
+            0.0f, 0.0f, 0.0f,
+            0.2f, 0.2f, 0.2f,
+
+            0.5f, 0.0f, 0.5f,
+            0.0f, -1.0f, 0.0f
+        }));
+
+    std::vector<B> biasData(
+        QuantizedVector<B>(biasInfo.GetQuantizationScale(), biasInfo.GetQuantizationOffset(), {
+            1.0f, 0.0f, 0.0f
+        }));
+
+    std::vector<T> outputData(
+        QuantizedVector<T>(outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), {
+            4.5f, -10.8f, 5.0f + 6.4f - 7.5f, -2.0f + 10.0f - 3.0f, 2.5f + 4.0f - 4.5f, 6.0f, 1.0f,
+            -0.6f, -0.6f + 0.64f, -0.6f + 0.64f + 1.0f, 0.64f + 1.0f + 0.4f, 1.0f + 0.4f + 0.6f, 0.4f + 0.6f, 0.6f,
+            2.5f, -1.0f + 3.0f, 1.25f - 3.2f + 2.5f, -1.0f - 5.0f, 1.25f + 0.5f - 2.0f, -3.0f, 0.5f
+        }));
+
+    // Optionally apply bias to output image.
+    if(biasEnabled)
+    {
+        ApplyBias(outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(),
+            biasData, biasInfo.GetQuantizationScale(), biasInfo.GetQuantizationOffset(),
+            1, outputSize);
+    }
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle  = workloadFactory.CreateTensorHandle(inputInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputInfo);
+
+    armnn::Convolution2dQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    armnn::ScopedCpuTensorHandle         weightsTensor(kernelInfo);
+    armnn::ScopedCpuTensorHandle         biasTensor(biasInfo);
+
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, kernelData.data());
+    AllocateAndCopyDataToITensorHandle(&biasTensor, biasData.data());
+
+    AddInputToWorkload(data, info, inputInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputInfo, outputHandle.get());
+
+    data.m_Weight         = &weightsTensor;
+    data.m_Bias           = &biasTensor;
+    data.m_Parameters.m_StrideX        = 1;
+    data.m_Parameters.m_StrideY        = stride;
+    data.m_Parameters.m_PadLeft        = 0;
+    data.m_Parameters.m_PadRight       = 0;
+    data.m_Parameters.m_PadTop         = padSize;
+    data.m_Parameters.m_PadBottom      = padSize;
+    data.m_Parameters.m_BiasEnabled    = biasEnabled;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvolution2d(data, info);
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), inputData.data());
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    // Output
+    LayerTestResult<T,4> ret(outputInfo);
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+    ret.outputExpected = MakeTensor<T, 4>(outputInfo, outputData);
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleConvolution2d3x3NhwcTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool biasEnabled,
+    armnn::DataLayout dataLayout)
+{
+    // Use a single-batch 1-channel 4x3 image (NHWC).
+
+    armnn::TensorInfo inputDesc({1, 3, 4, 1}, ArmnnType);
+    boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc,
+                                                      {
+                                                       1, 5, 2, 3,
+                                                       8, 7, 3, 6,
+                                                       3, 3, 9, 1
+                                                       });
+
+    // Use a single 1-channel 3x3 kernel (NHWC).
+    armnn::TensorInfo kernelDesc({1, 3, 3, 1}, ArmnnType);
+    boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, {
+                                                                    4, 5, 6,
+                                                                    0, 0, 0,
+                                                                    3, 2, 1
+                                                                    });
+
+    // Expected output is 1 batch of a 1-channel 4x3 image (NHWC).
+    armnn::TensorInfo outputDesc({1, 3, 4, 1}, ArmnnType);
+
+    const std::vector<T> outputData =
+            {
+                    23, 41, 33, 21,
+                    44, 65, 76, 52,
+                    82, 85, 79, 42
+            };
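+    // Sanity check (hand-computed): the matching input/output sizes imply a
+    // padding of 1 all round in SimpleConvolution2dNhwcTestImpl. With the kernel
+    // centred on in(0,0), the only non-zero in-bounds taps are k(2,1)=2 over
+    // in(1,0)=8 and k(2,2)=1 over in(1,1)=7, so out(0,0) = 2*8 + 1*7 = 23.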
+
+    boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, outputData);
+
+    return SimpleConvolution2dNhwcTestImpl<ArmnnType, ArmnnType>(
+        workloadFactory,
+        memoryManager,
+        input,
+        kernel,
+        boost::multi_array<T, 1>(),
+        expectedOutput,
+        dataLayout,
+        qScale,
+        qOffset);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleConvolution2d3x3Stride2x2TestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float qScale,
+        int32_t qOffset,
+        bool biasEnabled,
+        const armnn::DataLayout& dataLayout)
+{
+    // Input is a single-batch, 1 channel, 5x5 image.
+    armnn::TensorInfo inputDesc({1, 5, 5, 1}, ArmnnType);
+    boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc,
+            {
+                1, 5, 2, 3, 5,
+                8, 7, 3, 6, 3,
+                3, 3, 9, 1, 9,
+                4, 1, 8, 1, 3,
+                6, 8, 1, 9, 2
+            });
+
+    // Use a 3x3 kernel.
+    armnn::TensorInfo kernelDesc({1, 3, 3, 1}, ArmnnType);
+    boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc,
+            {
+                4, 5, 6,
+                0, 0, 0,
+                3, 2, 1
+            });
+
+    // Expected output is a single-batch, 1 channel, 3x3 image.
+    armnn::TensorInfo outputDesc({1, 3, 3, 1}, ArmnnType);
+
+    const std::vector<T> outputData =
+            {
+                23, 33, 24,
+                91, 99, 48,
+                26, 50, 19
+            };
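+    // Sanity check (hand-computed): with padding 1 and stride 2 the centre
+    // output covers the 3x3 window in(1..3, 1..3), so
+    // out(1,1) = 4*7 + 5*3 + 6*6 + 3*1 + 2*8 + 1*1 = 99, the centre value above.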
+
+    boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, outputData);
+
+    uint32_t padLeft = 1;
+    uint32_t padTop = 1;
+    uint32_t padRight = 1;
+    uint32_t padBottom = 1;
+    uint32_t strideX  = 2;
+    uint32_t strideY  = 2;
+
+    return SimpleConvolution2dNhwcTestImpl<ArmnnType, ArmnnType>(
+        workloadFactory,
+        memoryManager,
+        input,
+        kernel,
+        boost::multi_array<T, 1>(),
+        expectedOutput,
+        dataLayout,
+        qScale,
+        qOffset,
+        padLeft,
+        padTop,
+        padRight,
+        padBottom,
+        strideX,
+        strideY);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleConvolution2d3x5TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    // Use common single-batch 3-channel 16x8 image.
+    armnn::TensorInfo inputDesc({1, 3, 8, 16}, ArmnnType);
+    boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, QuantizedVector<T>(qScale, qOffset, ConvInput3x8x16));
+
+    // Use a 2-element batch with 3-channel 3x5 kernels.
+    armnn::TensorInfo kernelDesc({2, 3, 5, 3}, ArmnnType);
+    boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            1, 1, 1,
+            1, -1, 1,
+            1, 1, 1,
+            1, 1, 1,
+            1, 1, 1,
+
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+
+            2, 2, 2,
+            2, 2, 2,
+            2, 2, 2,
+            2, 2, 2,
+            2, 2, 2,
+
+
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+
+            1, 1, 1,
+            1, 1, 1,
+            1, 1, 1,
+            1, 1, 1,
+            1, 1, 1,
+
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0
+        })));
+
+    // Expected output is 1 batch of a 2-channel 14x4 image.
+    armnn::TensorInfo outputDesc({1, 2, 4, 14}, ArmnnType);
+    boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            -24, -24, -24, -24, -24, -24, -24, -24, -24, -24, -24, -24, -24, -24,
+            -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, -25,
+            -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f,
+            -23.5f, -23.5f, -23.5f,
+            -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f, -23.5f,
+            -23.5f, -23.5f, -23.5f,
+
+            5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            5, 5, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+        })));
+
+    return SimpleConvolution2dTestImpl<ArmnnType, ArmnnBType>(
+        workloadFactory,
+        memoryManager,
+        input,
+        kernel,
+        GetBias2<ArmnnBType>(biasEnabled, qScale * qScale),
+        expectedOutput,
+        qScale,
+        qOffset,
+        layout);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType,
+         typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleConvolution2d3x3TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    // Use a 3x3 kernel, which exercises ArmCompute's direct convolution path.
+
+    // Use common single-batch 3-channel 16x8 image.
+    armnn::TensorInfo inputDesc({1, 3, 8, 16}, ArmnnType);
+    boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, QuantizedVector<T>(qScale, qOffset, ConvInput3x8x16));
+
+    // Use a 2-element batch of 3-channel 3x3 kernels.
+    armnn::TensorInfo kernelDesc({2, 3, 3, 3}, ArmnnType);
+    boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            1, 1, 1,
+            1, -1, 1,
+            1, 1, 1,
+
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+
+            2, 2, 2,
+            2, 2, 2,
+            2, 2, 2,
+
+
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0,
+
+            1, 1, 1,
+            1, 1, 1,
+            1, 1, 1,
+
+            0, 0, 0,
+            0, 0, 0,
+            0, 0, 0
+        })));
+
+    // Expected output is 1 batch of a 2-channel 14x6 image.
+    armnn::TensorInfo outputDesc({1, 2, 6, 14}, ArmnnType);
+    boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            -15, -15, -15, -15, -15, -15, -15, -15, -15, -15, -15, -15, -15, -15,
+            -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16, -16,
+            -14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,
+            -14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,
+            -14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,
+            -14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,-14.5f,
+
+            3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+            3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+        })));
+
+    return SimpleConvolution2dTestImpl<ArmnnType, ArmnnBType>(
+        workloadFactory,
+        memoryManager,
+        input,
+        kernel,
+        GetBias2<ArmnnBType>(biasEnabled, qScale * qScale),
+        expectedOutput,
+        qScale,
+        qOffset,
+        layout);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType,
+         typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout,
+    float qScale,
+    int32_t qOffset)
+{
+    // Use a single-batch 1-channel 3x3 image as input.
+    armnn::TensorInfo inputDesc({1, 1, 3, 3}, ArmnnType);
+    boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            11,21,31,
+            12,22,32,
+            13,23,33
+        })));
+
+    // Use 1 batch of a 1-channel 2x2 kernel.
+    armnn::TensorInfo kernelDesc({1, 1, 2, 2}, ArmnnType);
+    boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            -11,-21,
+            -12,-22,
+        })));
+
+    // Expected output is 1 batch of a 1-channel 6x8 image.
+    // Manually calculated like this:
+    // [-11*0 -21*0  -12*0 -22*0  ; -11*0  -21*0  -12*0  -22*0  ; -11*0  -21*0  -12*0  -22*0  ; -11*0  -21*0 -12*0  -22*0 ..]
+    // [-11*0 -21*0  -12*0 -22*11 ; -11*0  -21*0  -12*11 -22*21 ; -11*0  -21*0  -12*21 -22*31 ; -11*0  -21*0 -12*31 -22*0 ..]
+    // [-11*0 -21*11 -12*0 -22*12 ; -11*11 -21*21 -12*12 -22*22 ; -11*21 -21*31 -12*22 -22*32 ; -11*31 -21*0 -12*32 -22*0 ..]
+    // [-11*0 -21*12 -12*0 -22*13 ; -11*12 -21*22 -12*13 -22*23 ; -11*22 -21*32 -12*23 -22*33 ; -11*32 -21*0 -12*33 -22*0 ..]
+    // [-11*0 -21*13 -12*0 -22*0  ; -11*13 -21*23 -12*0  -22*0  ; -11*23 -21*33 -12*0  -22*0  ; -11*33 -21*0 -12*0  -22*0 ..]
+    // [-11*0 -21*0  -12*0 -22*0  ; -11*0  -21*0  -12*0  -22*0  ; -11*0  -21*0  -12*0  -22*0  ; -11*0  -21*0 -12*0  -22*0 ..]
+    // [..... .....  ..... .....  ; .....  .....  .....  .....  ; .....  .....  .....  .....  ; .....  ..... .....  ..... ..]
+    armnn::TensorInfo outputDesc({1, 1, 8, 6}, ArmnnType);
+    boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+               0,    0,      0,    0,    0,    0,
+            -242,  -594,  -934, -372,    0,    0,
+            -495, -1190, -1850, -725,    0,    0,
+            -538, -1256, -1916, -748,    0,    0,
+            -273, -626,  -946,  -363,    0,    0,
+               0,    0,     0,     0,    0,    0,
+               0,    0,     0,     0,    0,    0,
+               0,    0,     0,     0,    0,    0
+        })));
+
+    return SimpleConvolution2dTestImpl<ArmnnType, ArmnnBType>(
+        workloadFactory,
+        memoryManager,
+        input,
+        kernel,
+        GetBias2<ArmnnBType>(false, qScale * qScale),
+        expectedOutput,
+        qScale,
+        qOffset,
+        layout,
+        1,  // Padding left.
+        2,  // Padding top.
+        3,  // Padding right.
+        4); // Padding bottom.
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType,
+         typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleConvolution2dAsymmetricPaddingTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout,
+    float qScale,
+    int32_t qOffset)
+{
+    // Use a single-batch 1-channel 5x5 image as input.
+    armnn::TensorInfo inputDesc({ 1, 1, 5, 5 }, ArmnnType);
+    boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            11,21,31,41,51,
+            12,22,32,42,52,
+            13,23,33,43,53,
+            14,24,34,44,54,
+            15,25,35,45,55,
+        })));
+
+    // Use 1 batch of a 1-channel 4x4 kernel.
+    armnn::TensorInfo kernelDesc({ 1, 1, 4, 4 }, ArmnnType);
+    boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            -11,-21,-31,-41,
+            -12,-22,-32,-42,
+            -13,-23,-33,-43,
+            -14,-24,-34,-44,
+        })));
+
+    // Expected output is 1 batch of a 1-channel 5x5 image.
+    armnn::TensorInfo outputDesc({ 1, 1, 5, 5 }, ArmnnType);
+    boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            -7140, -10580, -13940,  -9300, -5230,
+            -9590, -14120, -18520, -12290, -6860,
+            -9980, -14560, -18960, -12560, -7000,
+            -7518, -10904, -14144,  -9318, -5152,
+            -5032,  -7256,  -9376,  -6142, -3368,
+        })));
+
+    return SimpleConvolution2dTestImpl<ArmnnType, ArmnnBType>(
+        workloadFactory,
+        memoryManager,
+        input,
+        kernel,
+        GetBias2<ArmnnBType>(false, qScale * qScale),
+        expectedOutput,
+        qScale,
+        qOffset,
+        layout,
+        1,  // Padding left.
+        1,  // Padding top.
+        2,  // Padding right.
+        2); // Padding bottom.
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Convolution2d3x3DilationTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const std::vector<float>& inputNoQuantizedValues,
+    armnn::TensorInfo& inputTensorInfo,
+    const std::vector<float>& kernelNoQuantizedValues,
+    armnn::TensorInfo& kernelTensorInfo,
+    const std::vector<float>& outputExpectedNoQuantizedValues,
+    armnn::TensorInfo& outputTensorInfo,
+    uint32_t dilationX,
+    uint32_t dilationY,
+    armnn::DataLayout layout = armnn::DataLayout::NCHW,
+    uint32_t padLeft = 0,
+    uint32_t padTop = 0,
+    uint32_t padRight = 0,
+    uint32_t padBottom = 0,
+    uint32_t strideX  = 1,
+    uint32_t strideY  = 1,
+    bool biasEnabled = false
+)
+{
+    float qScale;
+    int32_t qOffset;
+    switch (ArmnnType)
+    {
+        case armnn::DataType::QuantisedAsymm8:
+        {
+            qScale = 0.1f;
+            qOffset = 128;
+            break;
+        }
+        case armnn::DataType::QuantisedSymm16:
+        {
+            qScale = 0.1f;
+            qOffset = 0;
+            break;
+        }
+        case armnn::DataType::Float32:
+        default:
+        {
+            qScale = 0.f;
+            qOffset = 0;
+            break;
+        }
+    }
+
+    inputTensorInfo.SetQuantizationScale(qScale);
+    inputTensorInfo.SetQuantizationOffset(qOffset);
+    kernelTensorInfo.SetQuantizationScale(qScale);
+    kernelTensorInfo.SetQuantizationOffset(qOffset);
+    outputTensorInfo.SetQuantizationScale(qScale);
+    outputTensorInfo.SetQuantizationOffset(qOffset);
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+                                  std::vector<T>(QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                    inputTensorInfo.GetQuantizationOffset(),
+                                                                    inputNoQuantizedValues)));
+    auto kernel = MakeTensor<T, 4>(kernelTensorInfo,
+                                  std::vector<T>(QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(),
+                                                                    kernelTensorInfo.GetQuantizationOffset(),
+                                                                    kernelNoQuantizedValues)));
+    auto expectedOutput = MakeTensor<T, 4>(outputTensorInfo,
+                                           std::vector<T>(QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                             outputTensorInfo.GetQuantizationOffset(),
+                                                                             outputExpectedNoQuantizedValues)));
+
+    return SimpleConvolution2dTestImpl<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            input,
+            kernel,
+            GetBias2<ArmnnBType>(biasEnabled, qScale * qScale),
+            expectedOutput,
+            qScale,
+            qOffset,
+            layout,
+            padLeft,
+            padTop,
+            padRight,
+            padBottom,
+            strideX,
+            strideY,
+            dilationX,
+            dilationY);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T>
+LayerTestResult<T, 4> Convolution2d3x3Dilation3x3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    armnn::TensorInfo inputTensorInfo({1, 1, 10, 10}, ArmnnType);
+    std::vector<float> inputNoQuantizedValues =
+    {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    };
+
+    armnn::TensorInfo kernelTensorInfo({ 1, 1, 3, 3}, ArmnnType);
+    std::vector<float> kernelNoQuantizedValues =
+    {
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9
+    };
+
+    // Since the dilation rate is 3, the effective kernel size grows to 7x7
+    // (d * (K - 1) + 1 = 3 * (3 - 1) + 1 = 7), so the output is 4x4:
+    // (I - K_eff + 2P) / S + 1 = (10 - 7 + 0) / 1 + 1 = 4.
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 4, 4}, ArmnnType);
+    std::vector<float> outputExpectedNoQuantizedValues =
+    {
+        6., 5., 5., 5.,
+        6., 5., 5., 5.,
+        6., 5., 5., 5.,
+        3., 2., 2., 2.
+    };
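+    // Sanity check (hand-computed): for out(0,0) the dilated taps sample input
+    // rows/cols {0, 3, 6}; the only tap landing on the 3x3 block of ones is the
+    // one at in(3,6), which carries weight 6, hence the first expected value 6.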
+
+    // Note: the pad/stride defaults are spelled out here so that biasEnabled
+    // binds to the bool parameter instead of being implicitly converted to padLeft.
+    return Convolution2d3x3DilationTestCommon<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            inputNoQuantizedValues,
+            inputTensorInfo,
+            kernelNoQuantizedValues,
+            kernelTensorInfo,
+            outputExpectedNoQuantizedValues,
+            outputTensorInfo,
+            3,
+            3,
+            layout,
+            0,  // Padding left.
+            0,  // Padding top.
+            0,  // Padding right.
+            0,  // Padding bottom.
+            1,  // Stride x.
+            1,  // Stride y.
+            biasEnabled);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T>
+LayerTestResult<T, 4> Convolution2d2x3x3Dilation3x3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    armnn::TensorInfo inputTensorInfo({1, 2, 10, 10}, ArmnnType);
+    std::vector<float> inputNoQuantizedValues =
+    {
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    };
+
+    armnn::TensorInfo kernelTensorInfo({ 1, 2, 3, 3}, ArmnnType);
+    std::vector<float> kernelNoQuantizedValues =
+    {
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9,
+
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9
+    };
+
+    // Since the dilation rate is 3, the effective kernel size grows to 7x7
+    // (d * (K - 1) + 1 = 3 * (3 - 1) + 1 = 7), so the output is 4x4:
+    // (I - K_eff + 2P) / S + 1 = (10 - 7 + 0) / 1 + 1 = 4.
+    // The two input channels contribute identical sums, so each expected value
+    // below is double its counterpart in the single-channel test above.
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 4, 4}, ArmnnType);
+    std::vector<float> outputExpectedNoQuantizedValues =
+    {
+        12., 10., 10., 10.,
+        12., 10., 10., 10.,
+        12., 10., 10., 10.,
+         6.,  4.,  4.,  4.
+    };
+
+    // Pad/stride defaults spelled out so that biasEnabled binds to the bool parameter.
+    return Convolution2d3x3DilationTestCommon<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            inputNoQuantizedValues,
+            inputTensorInfo,
+            kernelNoQuantizedValues,
+            kernelTensorInfo,
+            outputExpectedNoQuantizedValues,
+            outputTensorInfo,
+            3,
+            3,
+            layout,
+            0,  // Padding left.
+            0,  // Padding top.
+            0,  // Padding right.
+            0,  // Padding bottom.
+            1,  // Stride x.
+            1,  // Stride y.
+            biasEnabled);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T>
+LayerTestResult<T, 4> Convolution2d2x2Dilation2x2Padding2x2Stride3x3Test(
+        armnn::IWorkloadFactory &workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr &memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout)
+{
+    armnn::TensorInfo inputTensorInfo({1, 1, 10, 10}, ArmnnType);
+    std::vector<float> inputNoQuantizedValues =
+    {
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+        1, 1, 1, 1, 1, 1, 1, 1, 1, 1
+    };
+
+    armnn::TensorInfo kernelTensorInfo({ 1, 1, 2, 2}, ArmnnType);
+    std::vector<float> kernelNoQuantizedValues =
+    {
+        1, 2,
+        3, 4
+    };
+
+    // Since the dilation rate is 2, the effective kernel size grows to 3x3
+    // (d * (K - 1) + 1 = 2 * (2 - 1) + 1 = 3). With 1 pixel of padding per side
+    // and a stride of 3, the output is 4x4:
+    // floor((I - K_eff + 2P) / S) + 1 = floor((10 - 3 + 2) / 3) + 1 = 4,
+    // where I = 10 (input), K_eff = 3 (dilated kernel), P = 1 (per side), S = 3 (stride).
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 4, 4}, ArmnnType);
+    std::vector<float> outputExpectedNoQuantizedValues =
+    {
+        4,  7,  7, 3,
+        6, 10, 10, 4,
+        6, 10, 10, 4,
+        2,  3,  3, 1
+    };
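+    // Sanity check (hand-computed): an interior output position sees all four
+    // dilated taps on the all-ones input, giving 1 + 2 + 3 + 4 = 10; at the
+    // corner out(0,0) only the weight-4 tap is in bounds, giving 4.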
+    uint32_t padLeft = 1;
+    uint32_t padTop = 1;
+    uint32_t padRight = 1;
+    uint32_t padBottom = 1;
+
+    return Convolution2d3x3DilationTestCommon<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            inputNoQuantizedValues,
+            inputTensorInfo,
+            kernelNoQuantizedValues,
+            kernelTensorInfo,
+            outputExpectedNoQuantizedValues,
+            outputTensorInfo,
+            2,
+            2,
+            layout,
+            padLeft,
+            padTop,
+            padRight,
+            padBottom,
+            3,  // Stride x.
+            3,  // Stride y.
+            biasEnabled);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T,4> CompareConvolution2dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory)
+{
+    unsigned int inputHeight   = 8;
+    unsigned int inputWidth    = 16;
+    unsigned int inputChannels = 3;
+    unsigned int inputNum      = 5;
+
+    unsigned int kernelHeight = 3;
+    unsigned int kernelWidth  = 3;
+
+    unsigned int strideX = 2;
+    unsigned int strideY = 3;
+    unsigned int padX    = 1;
+    unsigned int padY    = 1;
+
+    unsigned int outputNum      = inputNum;
+    unsigned int outputChannels = 2;
+    unsigned int outputHeight   = (inputHeight + 2 * padY - kernelHeight + strideY) / strideY;
+    unsigned int outputWidth    = (inputWidth + 2 * padX - kernelWidth + strideX) / strideX;
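+    // With these values: outputHeight = (8 + 2 - 3 + 3) / 3 = 3 and
+    // outputWidth = (16 + 2 - 3 + 2) / 2 = 8 (integer division).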
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+    armnn::TensorInfo kernelDesc;
+    armnn::TensorInfo biasDesc;
+
+    unsigned int inputShape[]  = {inputNum, inputChannels, inputHeight, inputWidth};
+    unsigned int outputShape[] = {outputNum, outputChannels, outputHeight, outputWidth};
+    unsigned int kernelShape[] = {outputChannels, inputChannels, kernelHeight, kernelWidth};
+    unsigned int biasShape[]   = {outputChannels};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+    kernelDesc = armnn::TensorInfo(4, kernelShape, ArmnnType);
+    biasDesc = armnn::TensorInfo(1, biasShape, ArmnnType);
+
+    LayerTestResult<T,4> ret(outputTensorInfo);
+
+    auto input  = MakeRandomTensor<T, 4>(inputTensorInfo, 124908);
+    auto kernel = MakeRandomTensor<T, 4>(kernelDesc, 891234);
+    auto bias   = MakeRandomTensor<T, 1>(biasDesc, 1028);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::Convolution2dQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
+    armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
+
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
+    AllocateAndCopyDataToITensorHandle(&biasTensor, &bias[0]);
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Weight = &weightsTensor;
+    data.m_Bias = &biasTensor;
+    data.m_Parameters.m_StrideX = strideX;
+    data.m_Parameters.m_StrideY = strideY;
+    data.m_Parameters.m_PadLeft = padX;
+    data.m_Parameters.m_PadRight = padX;
+    data.m_Parameters.m_PadTop = padY;
+    data.m_Parameters.m_PadBottom = padY;
+    data.m_Parameters.m_BiasEnabled = true;
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandleRef = refWorkloadFactory.CreateTensorHandle(outputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> inputHandleRef = refWorkloadFactory.CreateTensorHandle(inputTensorInfo);
+
+    armnn::Convolution2dQueueDescriptor refData = data;
+    armnn::WorkloadInfo               refInfo = info;
+    SetWorkloadInput(refData, refInfo, 0, inputTensorInfo, inputHandleRef.get());
+    SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, outputHandleRef.get());
+
+    std::unique_ptr<armnn::IWorkload> workload  = workloadFactory.CreateConvolution2d(data, info);
+    std::unique_ptr<armnn::IWorkload> workloadRef = refWorkloadFactory.CreateConvolution2d(refData, refInfo);
+
+    outputHandleRef->Allocate();
+    inputHandleRef->Allocate();
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+    CopyDataToITensorHandle(inputHandleRef.get(), &input[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    workloadRef->PostAllocationConfigure();
+    workloadRef->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+    CopyDataFromITensorHandle(&ret.outputExpected[0][0][0][0], outputHandleRef.get());
+
+    return ret;
+}
+
+//
+// DepthwiseConvolution2d implementations
+//
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType,
+         typename T = armnn::ResolveType<ArmnnType>, typename B = armnn::ResolveType<ArmnnBType>>
+LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const boost::multi_array<T, 4>& input,
+    const boost::multi_array<T, 4>& kernel,
+    const boost::multi_array<B, 1>& bias,
+    const boost::multi_array<T, 4>& outputExpected,
+    float qScale,
+    int32_t qOffset,
+    const armnn::DataLayout layout,
+    uint32_t padLeft = 0,
+    uint32_t padTop = 0,
+    uint32_t padRight = 0,
+    uint32_t padBottom = 0,
+    uint32_t strideX = 1,
+    uint32_t strideY = 1)
+{
+    unsigned int inputNum       = boost::numeric_cast<unsigned int>(input.shape()[0]);
+    unsigned int inputChannels  = boost::numeric_cast<unsigned int>(input.shape()[1]);
+    unsigned int inputHeight    = boost::numeric_cast<unsigned int>(input.shape()[2]);
+    unsigned int inputWidth     = boost::numeric_cast<unsigned int>(input.shape()[3]);
+    unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(kernel.shape()[0]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
+    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
+    unsigned int outputNum      = boost::numeric_cast<unsigned int>(outputExpected.shape()[0]);
+    unsigned int outputChannels = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
+    unsigned int outputHeight   = boost::numeric_cast<unsigned int>(outputExpected.shape()[2]);
+    unsigned int outputWidth    = boost::numeric_cast<unsigned int>(outputExpected.shape()[3]);
+
+    // If a bias is used, its size must equal the number of output channels.
+    bool biasEnabled = bias.size() > 0;
+    BOOST_ASSERT(!biasEnabled || bias.size() == outputChannels);
+
+    // Creates the tensors.
+    armnn::TensorInfo inputTensorInfo =
+            armnnUtils::GetTensorInfo(inputNum, inputChannels, inputHeight, inputWidth, layout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo =
+            armnnUtils::GetTensorInfo(outputNum, outputChannels, outputHeight, outputWidth, layout, ArmnnType);
+    armnn::TensorInfo kernelDesc({kernelChanMul, kernelChannels, kernelHeight, kernelWidth}, ArmnnType);
+    armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, ArmnnBType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+        kernelDesc.SetQuantizationScale(qScale);
+        kernelDesc.SetQuantizationOffset(qOffset);
+        biasDesc.SetQuantizationScale(qScale*qScale);
+        biasDesc.SetQuantizationOffset(0);
+    }
+
+    // Construct the input data.
+    std::vector<T> inputData;
+    inputData.assign(input.data(), input.data() + inputChannels*inputHeight*inputWidth);
+
+    // If required, permute the input data to match the requested data layout.
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
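+    // The mapping { 0, 3, 1, 2 } sends each source dimension i to destination
+    // dimension P[i]: N->0, C->3, H->1, W->2, i.e. NCHW data becomes NHWC.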
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(T));
+        inputData = tmp;
+    }
+
+    auto batchedInput = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    // Construct the output data, with bias applied, as appropriate.
+    std::vector<T> outputData;
+    outputData.assign(outputExpected.data(), outputExpected.data() + outputChannels*outputHeight*outputWidth);
+    if (biasEnabled)
+    {
+        std::vector<T> biasV;
+        biasV.assign(bias.data(), bias.data() + outputChannels);
+        ApplyBias(outputData, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(),
+            biasV, biasDesc.GetQuantizationScale(), biasDesc.GetQuantizationOffset(),
+            outputWidth, outputHeight);
+    }
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+
+    // If required, permute the expected output to match the requested data layout.
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp.data(), sizeof(T));
+        outputData = tmp;
+    }
+
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
+
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
+
+    armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
+    if (biasEnabled)
+    {
+        AllocateAndCopyDataToITensorHandle(&biasTensor, &bias[0]);
+    }
+
+    armnn::DepthwiseConvolution2dQueueDescriptor data;
+    data.m_Weight = &weightsTensor;
+    data.m_Bias = &biasTensor; // Always set this, even when bias is disabled - leaving it unset can be a source of bugs.
+    data.m_Parameters.m_StrideX = strideX;
+    data.m_Parameters.m_StrideY = strideY;
+    data.m_Parameters.m_PadLeft = padLeft;
+    data.m_Parameters.m_PadRight = padRight;
+    data.m_Parameters.m_PadTop = padTop;
+    data.m_Parameters.m_PadBottom = padBottom;
+    data.m_Parameters.m_BiasEnabled = biasEnabled;
+    data.m_Parameters.m_DataLayout = layout;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDepthwiseConvolution2d(data, info);
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &batchedInput[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    using B = armnn::ResolveType<ArmnnBType>;
+
+    unsigned int inputHeight = 3;
+    unsigned int inputWidth = 3;
+    unsigned int inputChannels = 2;
+    unsigned int inputNum = 1;
+
+    unsigned int kernelHeight = 3;
+    unsigned int kernelWidth = 3;
+    unsigned int kernelChannels = inputChannels;
+    unsigned int kernelDepthMultiplier = 1;
+
+    unsigned int outputHeight = 1;
+    unsigned int outputWidth = 1;
+    unsigned int outputChannels = kernelChannels;
+    unsigned int outputNum = inputNum;
+
+    armnn::TensorInfo inputTensorInfo =
+            armnnUtils::GetTensorInfo(inputNum, inputChannels, inputHeight, inputWidth, layout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo =
+            armnnUtils::GetTensorInfo(outputNum, outputChannels, outputHeight, outputWidth, layout, ArmnnType);
+    armnn::TensorInfo kernelDesc({kernelDepthMultiplier, kernelChannels, kernelHeight, kernelWidth},
+                                 ArmnnType);
+    armnn::TensorInfo biasDesc({ outputChannels }, ArmnnBType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+        kernelDesc.SetQuantizationScale(qScale);
+        kernelDesc.SetQuantizationOffset(qOffset);
+        biasDesc.SetQuantizationScale(qScale*qScale);
+        biasDesc.SetQuantizationOffset(0);
+    }
+    std::vector<T> inputData = std::vector<T>(
+            QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset(), {
+                    1.f, 2.f, 1.f,
+                    2.f, 1.f, 2.f,
+                    1.f, 2.f, 1.f,
+
+                    1.f, 2.f, 1.f,
+                    2.f, 1.f, 2.f,
+                    1.f, 2.f, 1.f,
+            }));
+    // If required, permute the input data to match the requested data layout.
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(T));
+        inputData = tmp;
+    }
+    auto input = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    std::vector<B> biasV(QuantizedVector<B>(biasDesc.GetQuantizationScale(), biasDesc.GetQuantizationOffset(),
+                                            {0, 2}));
+    auto bias = MakeTensor<B, 1>(biasDesc, biasV);
+
+    std::vector<T> kernelData = std::vector<T>(
+            QuantizedVector<T>(kernelDesc.GetQuantizationScale(), kernelDesc.GetQuantizationOffset(), {
+                    1.f, 0.f,  1.f,
+                    0.f, 0.f,  0.f,
+                    -1.f, 0.f, -1.f,
+
+                    1.f, 0.f,  1.f,
+                    0.f, 0.f,  0.f,
+                    -1.f, 0.f, -1.f,
+            }));
+    auto kernel = MakeTensor<T, 4>(kernelDesc, kernelData);
+
+    // Manually calculated.
+    std::vector<T> outputImage(
+        QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                           outputTensorInfo.GetQuantizationOffset(),
+                           {0.f, 0.f})
+    );
+
+    // Optionally apply bias to output image.
+    if(biasEnabled)
+    {
+        ApplyBias(outputImage, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(),
+                  biasV, biasDesc.GetQuantizationScale(), biasDesc.GetQuantizationOffset(),
+                  outputWidth, outputHeight);
+    }
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(outputImage.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputImage.data(), tmp.data(), sizeof(T));
+        outputImage = tmp;
+    }
+
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputImage);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::DepthwiseConvolution2dQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
+    armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
+
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
+    AllocateAndCopyDataToITensorHandle(&biasTensor, &bias[0]);
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    data.m_Weight = &weightsTensor;
+    data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled.
+    data.m_Parameters.m_StrideX = 1;
+    data.m_Parameters.m_StrideY = 1;
+    data.m_Parameters.m_PadLeft = 0;
+    data.m_Parameters.m_PadRight = 0;
+    data.m_Parameters.m_PadTop = 0;
+    data.m_Parameters.m_PadBottom = 0;
+    data.m_Parameters.m_BiasEnabled = biasEnabled;
+    data.m_Parameters.m_DataLayout = layout;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDepthwiseConvolution2d(data, info);
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    using B = armnn::ResolveType<ArmnnBType>;
+
+    unsigned int depthMultiplier = 2;
+
+    unsigned int inputHeight    = 8;
+    unsigned int inputWidth     = 16;
+    unsigned int inputChannels  = 2;
+    unsigned int inputBatchSize = 1;
+
+    unsigned int kernelHeight = 5;
+    unsigned int kernelWidth  = 3;
+
+    unsigned int outputHeight    = inputHeight - kernelHeight + 1 + 2;
+    unsigned int outputWidth     = (inputWidth - kernelWidth + 1)/2;
+    unsigned int outputChannels  = inputChannels * depthMultiplier;
+    unsigned int outputBatchSize = inputBatchSize;
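+    // With pad top/bottom of 1 (set on the descriptor below) and strides (X=2, Y=1):
+    // outputHeight = 8 - 5 + 1 + 2 = 6, outputWidth = (16 - 3 + 1) / 2 = 7.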
+
+    armnn::TensorInfo inputTensorInfo = armnnUtils::GetTensorInfo(
+            inputBatchSize, inputChannels, inputHeight, inputWidth, layout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(
+            outputBatchSize, outputChannels, outputHeight, outputWidth, layout, ArmnnType);
+    armnn::TensorInfo kernelDesc({depthMultiplier, inputChannels, kernelHeight, kernelWidth},
+                                 ArmnnType);
+    armnn::TensorInfo biasDesc({outputChannels}, ArmnnBType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+        kernelDesc.SetQuantizationScale(qScale);
+        kernelDesc.SetQuantizationOffset(qOffset);
+        biasDesc.SetQuantizationScale(qScale*qScale);
+        biasDesc.SetQuantizationOffset(0);
+    }
+
+    // NOTE: originalInputData is in NCHW format
+    std::vector<T> originalInputData = std::vector<T>(
+            QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset(), {
+                    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+                    0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+                    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+                    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+                    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+                    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+                    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+                    0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f,
+                    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+            }));
+    std::vector<T> inputData = originalInputData;
+    // If required, permute the input data to match the requested data layout.
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC,
+                            originalInputData.data(), inputData.data(), sizeof(T));
+    }
+    auto input = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    std::vector<B> biasV(QuantizedVector<B>(biasDesc.GetQuantizationScale(), biasDesc.GetQuantizationOffset(),
+        {0, 2, 1, -1}));
+    auto bias = MakeTensor<B, 1>(biasDesc, biasV);
+
+    std::vector<T> kernelData = std::vector<T>(
+            QuantizedVector<T>(kernelDesc.GetQuantizationScale(), kernelDesc.GetQuantizationOffset(), {
+                    1, 1, 1,
+                    1, -1, 1,
+                    1, 1, 1,
+                    1, 1, 1,
+                    1, 1, 1,
+
+                    2, 2, 2,
+                    2, 2, 2,
+                    2, 2, 2,
+                    2, 2, 2,
+                    2, 2, 2,
+
+                    0, 0, 0,
+                    0, -1, 0,
+                    0, 0, 0,
+                    0, 0, 0,
+                    0, 0, 0,
+
+                    0, 0, 0,
+                    0, 0, 0,
+                    0, 1, 0,
+                    0, 0, 0,
+                    0, 0, 0
+
+            }));
+    auto kernel = MakeTensor<T, 4>(kernelDesc, kernelData);
+
+    // Manually calculated.
+    std::vector<T> originalOutputImage = std::vector<T>(
+        QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), {
+            3.5f,  3.5f,  3.5f,  3.5f,  3.5f,  3.5f,  3.5f,
+            6.0f,  6.0f,  6.0f,  6.0f,  6.0f,  6.0f,  6.0f,
+            5.0f,  5.0f,  5.0f,  5.0f,  5.0f,  5.0f,  5.0f,
+            6.5f,  6.5f,  6.5f,  6.5f,  6.5f,  6.5f,  6.5f,
+            6.5f,  6.5f,  6.5f,  6.5f,  6.5f,  6.5f,  6.5f,
+            5.0f,  5.0f,  5.0f,  5.0f,  5.0f,  5.0f,  5.0f,
+
+            -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f,
+            0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f,
+            -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f,
+            -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f,
+            -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f, -0.5f,
+
+            8.0f,  8.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            10.0f, 10.0f, 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            10.0f, 10.0f, 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            10.0f, 10.0f, 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            10.0f, 10.0f, 0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            8.0f,  8.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+
+            0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,
+            0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f,  0.0f
+        }));
+
+    // Optionally apply bias to output image.
+    if(biasEnabled)
+    {
+        ApplyBias(originalOutputImage,
+                  outputTensorInfo.GetQuantizationScale(),
+                  outputTensorInfo.GetQuantizationOffset(),
+                  biasV,
+                  biasDesc.GetQuantizationScale(),
+                  biasDesc.GetQuantizationOffset(),
+                  outputWidth,
+                  outputHeight);
+    }
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+    std::vector<T> outputImage = originalOutputImage;
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC,
+                            originalOutputImage.data(), outputImage.data(), sizeof(T));
+    }
+
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputImage);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::DepthwiseConvolution2dQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
+    armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
+
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
+    AllocateAndCopyDataToITensorHandle(&biasTensor, &bias[0]);
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    data.m_Weight = &weightsTensor;
+    data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled.
+    data.m_Parameters.m_StrideX = 2;
+    data.m_Parameters.m_StrideY = 1;
+    data.m_Parameters.m_PadLeft = 0;
+    data.m_Parameters.m_PadRight = 0;
+    data.m_Parameters.m_PadTop = 1;
+    data.m_Parameters.m_PadBottom = 1;
+    data.m_Parameters.m_BiasEnabled = biasEnabled;
+    data.m_Parameters.m_DataLayout = layout;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDepthwiseConvolution2d(data, info);
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType,
+        typename T = armnn::ResolveType<ArmnnType>, typename B = armnn::ResolveType<ArmnnBType>>
+LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const boost::multi_array<T, 4>& originalInput,
+    const boost::multi_array<T, 4>& originalKernel,
+    const boost::multi_array<B, 1>& bias,
+    const boost::multi_array<T, 4>& originalOutputExpected,
+    float qScale,
+    int32_t qOffset,
+    const armnn::DataLayout layout = armnn::DataLayout::NCHW,
+    uint32_t padLeft = 0,
+    uint32_t padTop = 0,
+    uint32_t padRight = 0,
+    uint32_t padBottom = 0,
+    uint32_t strideX = 1,
+    uint32_t strideY = 1,
+    uint32_t dilationX = 1,
+    uint32_t dilationY = 1)
+{
+    unsigned int inputHeight   = boost::numeric_cast<unsigned int>(originalInput.shape()[2]);
+    unsigned int inputWidth    = boost::numeric_cast<unsigned int>(originalInput.shape()[3]);
+    unsigned int inputChannels = boost::numeric_cast<unsigned int>(originalInput.shape()[1]);
+    unsigned int inputNum      = boost::numeric_cast<unsigned int>(originalInput.shape()[0]);
+
+    unsigned int outputHeight   = boost::numeric_cast<unsigned int>(originalOutputExpected.shape()[2]);
+    unsigned int outputWidth    = boost::numeric_cast<unsigned int>(originalOutputExpected.shape()[3]);
+    unsigned int outputChannels = boost::numeric_cast<unsigned int>(originalOutputExpected.shape()[1]);
+    unsigned int outputNum      = boost::numeric_cast<unsigned int>(originalOutputExpected.shape()[0]);
+
+    unsigned int kernelHeight = boost::numeric_cast<unsigned int>(originalKernel.shape()[2]);
+    unsigned int kernelWidth = boost::numeric_cast<unsigned int>(originalKernel.shape()[3]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(originalKernel.shape()[1]);
+    unsigned int kernelDepthMul = boost::numeric_cast<unsigned int>(originalKernel.shape()[0]);
+
+    bool biasEnabled = bias.size() > 0;
+
+    // This function currently assumes 1 batch of input/output (and duplicates this into 2 batches).
+    BOOST_ASSERT(inputNum == 1);
+    BOOST_ASSERT(outputNum == 1);
+
+    // If a bias is used, its size must equal the number of output channels.
+    BOOST_ASSERT(!biasEnabled || bias.size() == outputChannels);
+
+    // Note these tensors will use two (identical) batches.
+    armnn::TensorInfo inputTensorInfo =
+            armnnUtils::GetTensorInfo(2*inputNum, inputChannels, inputHeight, inputWidth, layout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo =
+            armnnUtils::GetTensorInfo(2*outputNum, outputChannels, outputHeight, outputWidth, layout, ArmnnType);
+
+    // For depthwise convolution the kernel is always in NCHW layout, regardless of the input/output layout.
+    armnn::TensorInfo kernelDesc({kernelDepthMul, kernelChannels, kernelHeight, kernelWidth}, ArmnnType);
+
+    armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, ArmnnBType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+        kernelDesc.SetQuantizationScale(qScale);
+        kernelDesc.SetQuantizationOffset(qOffset);
+        biasDesc.SetQuantizationScale(qScale*qScale);
+        biasDesc.SetQuantizationOffset(0);
+    }
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+
+    // Construct input data
+    std::vector<T> input;
+    input.assign(originalInput.data(), originalInput.data() + 1*inputChannels*inputHeight*inputWidth);
+    std::vector<T> inputData;
+    inputData.insert(inputData.end(), input.begin(), input.end());
+    inputData.insert(inputData.end(), input.begin(), input.end());
+
+    // If required, permute the input data to match the requested data layout.
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(T));
+        inputData = tmp;
+    }
+
+    auto batchedInput = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    std::vector<T> output;
+    output.assign(originalOutputExpected.data(),
+                       originalOutputExpected.data() + outputChannels*outputHeight*outputWidth);
+
+    // Apply bias to output data if it is enabled.
+    if(biasEnabled)
+    {
+        std::vector<T> biasV;
+        biasV.assign(bias.data(), bias.data() + outputChannels);
+        ApplyBias(output, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(),
+                  biasV, biasDesc.GetQuantizationScale(), biasDesc.GetQuantizationOffset(),
+                  outputWidth, outputHeight);
+    }
+
+    // Construct expected output data
+    std::vector<T> outputData;
+    outputData.insert(outputData.end(), output.begin(), output.end());
+    outputData.insert(outputData.end(), output.begin(), output.end());
+
+    // If required, permute the expected output to match the requested data layout.
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp.data(), sizeof(T));
+        outputData = tmp;
+    }
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::DepthwiseConvolution2dQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
+    armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
+
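+    // Take a local copy of the kernel so that a non-const pointer into its storage
+    // can be passed to AllocateAndCopyDataToITensorHandle below.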
+    boost::multi_array<T, 4> kernel = boost::multi_array<T, 4>(originalKernel);
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
+
+    if(biasEnabled)
+    {
+        AllocateAndCopyDataToITensorHandle(&biasTensor, &bias[0]);
+    }
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    data.m_Weight = &weightsTensor;
+    data.m_Bias = &biasTensor; // Set this even when bias is disabled - leaving it unset has been a source of bugs.
+    data.m_Parameters.m_StrideX = strideX;
+    data.m_Parameters.m_StrideY = strideY;
+    data.m_Parameters.m_PadLeft = padLeft;
+    data.m_Parameters.m_PadRight = padRight;
+    data.m_Parameters.m_PadTop = padTop;
+    data.m_Parameters.m_PadBottom = padBottom;
+    data.m_Parameters.m_BiasEnabled = biasEnabled;
+    data.m_Parameters.m_DataLayout = layout;
+    data.m_Parameters.m_DilationX = dilationX;
+    data.m_Parameters.m_DilationY = dilationY;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDepthwiseConvolution2d(data, info);
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &batchedInput[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType,
+         typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    // Use a single-batch 2-channel 5x5 image as input.
+    armnn::TensorInfo inputTensorInfo({ 1, 2, 5, 5 }, ArmnnType);
+    auto input = MakeTensor<T, 4>(inputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset(),
+        {
+             0,  1,  2,  3,  4,
+             5,  6,  7,  8,  9,
+            10, 11, 12, 13, 14,
+            15, 16, 17, 18, 19,
+            20, 21, 22, 23, 24,
+
+            25, 26, 27, 28, 29,
+            30, 31, 32, 33, 34,
+            35, 36, 37, 38, 39,
+            40, 41, 42, 43, 44,
+            45, 46, 47, 48, 49
+        })));
+
+    // Use a depth multiplier of 1 on a 2-channel 4x4 kernel.
+    armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, ArmnnType);
+    auto kernel = MakeTensor<T, 4>(kernelTensorInfo, std::vector<T>(
+        QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(),
+        {
+            32, 31, 30, 29,
+            28, 27, 26, 25,
+            24, 23, 22, 21,
+            20, 19, 18, 17,
+
+            16, 15, 14, 13,
+            12, 11, 10,  9,
+             8,  7,  6,  5,
+             4,  3,  2,  1
+        })));
+
+    // Expected output is 1 batch of a 2-channel 5x5 image.
+    // Calculated using the Python TensorFlow library with strideX=1, strideY=1.
+    armnn::TensorInfo outputTensorInfo({ 1, 2, 5, 5 }, ArmnnType);
+    boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(),
+        {
+            1062, 1580, 1850, 1530, 1117,
+            2140, 3108, 3500, 2842, 2042,
+            3580, 5068, 5460, 4342, 3062,
+            3618, 5072, 5390, 4248, 2971,
+            3074, 4282, 4510, 3533, 2457,
+
+            1550, 2284, 2362, 1955, 1428,
+            2910, 4206, 4342, 3528, 2536,
+            3390, 4886, 5022, 4068, 2916,
+            3566, 5056, 5182, 4133, 2922,
+            3100, 4352, 4452, 3517, 2465
+        })));
+
+    return DepthwiseConvolution2dAsymmetricTestImpl<ArmnnType, ArmnnBType>(
+        workloadFactory,
+        memoryManager,
+        input,
+        kernel,
+        GetBias2<ArmnnBType>(biasEnabled, qScale * qScale),
+        expectedOutput,
+        qScale,
+        qOffset,
+        layout,
+        1,  // Padding left.
+        1,  // Padding top.
+        2,  // Padding right.
+        2,  // Padding bottom.
+        1,  // strideX
+        1); // strideY
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType,
+         typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2dNhwcTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool biasEnabled)
+{
+    auto layout = armnn::DataLayout::NHWC;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 2, 5, 5}, ArmnnType);
+    auto input = MakeTensor<T, 4>(inputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset(),
+        {
+             0,  1,  2,  3,  4,
+             5,  6,  7,  8,  9,
+            10, 11, 12, 13, 14,
+            15, 16, 17, 18, 19,
+            20, 21, 22, 23, 24,
+
+            25, 26, 27, 28, 29,
+            30, 31, 32, 33, 34,
+            35, 36, 37, 38, 39,
+            40, 41, 42, 43, 44,
+            45, 46, 47, 48, 49
+        })));
+
+    armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, ArmnnType);
+    auto kernel = MakeTensor<T, 4>(kernelTensorInfo, std::vector<T>(
+        QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(),
+        {
+             32, 31, 30, 29,
+             28, 27, 26, 25,
+             24, 23, 22, 21,
+             20, 19, 18, 17,
+
+             16, 15, 14, 13,
+             12, 11, 10,  9,
+              8,  7,  6,  5,
+              4,  3,  2,  1
+        })));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 2, 5, 5}, ArmnnType);
+    boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(),
+        {
+            1062, 1580, 1850, 1530, 1117,
+            2140, 3108, 3500, 2842, 2042,
+            3580, 5068, 5460, 4342, 3062,
+            3618, 5072, 5390, 4248, 2971,
+            3074, 4282, 4510, 3533, 2457,
+
+            1550, 2284, 2362, 1955, 1428,
+            2910, 4206, 4342, 3528, 2536,
+            3390, 4886, 5022, 4068, 2916,
+            3566, 5056, 5182, 4133, 2922,
+            3100, 4352, 4452, 3517, 2465
+        })));
+
+    return DepthwiseConvolution2dTestImpl<ArmnnType, ArmnnBType>(
+        workloadFactory,
+        memoryManager,
+        input,
+        kernel,
+        GetBias2<ArmnnBType>(biasEnabled, qScale * qScale),
+        expectedOutput,
+        qScale,
+        qOffset,
+        layout,
+        1,  // Padding left.
+        1,  // Padding top.
+        2,  // Padding right.
+        2,  // Padding bottom.
+        1,  // strideX
+        1);  // strideY
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType,
+         typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleDepthwiseConvolution2d3x3Dilation3x3NhwcTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    bool biasEnabled)
+{
+    auto layout = armnn::DataLayout::NHWC;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 9, 9}, ArmnnType);
+    auto input = MakeTensor<T, 4>(inputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset(),
+        {
+             0, 0, 0, 0, 0, 0, 0, 0, 0,
+             0, 0, 0, 0, 0, 0, 0, 0, 0,
+             0, 0, 0, 0, 0, 0, 0, 0, 0,
+             0, 0, 0, 1, 1, 1, 0, 0, 0,
+             0, 0, 0, 1, 1, 1, 0, 0, 0,
+             0, 0, 0, 1, 1, 1, 0, 0, 0,
+             0, 0, 0, 0, 0, 0, 0, 0, 0,
+             0, 0, 0, 0, 0, 0, 0, 0, 0,
+             0, 0, 0, 0, 0, 0, 0, 0, 0
+        })));
+
+    armnn::TensorInfo kernelTensorInfo({ 1, 1, 3, 3}, ArmnnType);
+    auto kernel = MakeTensor<T, 4>(kernelTensorInfo, std::vector<T>(
+        QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(),
+        {
+             1, 2, 3,
+             4, 5, 6,
+             7, 8, 9
+        })));
+
+    uint32_t padLeft = 0;
+    uint32_t padTop = 0;
+    uint32_t padRight = 0;
+    uint32_t padBottom = 0;
+    uint32_t strideX  = 1;
+    uint32_t strideY  = 1;
+    uint32_t dilationX  = 3;
+    uint32_t dilationY  = 3;
+
+    // With a dilation rate of 3 the effective kernel is 7x7, so the output shrinks from 9x9 to 3x3 (all 5s).
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 3, 3}, ArmnnType);
+    boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(),
+        {
+             5, 5, 5,
+             5, 5, 5,
+             5, 5, 5
+        })));
+
+    return DepthwiseConvolution2dTestImpl<ArmnnType, ArmnnBType>(
+        workloadFactory,
+        memoryManager,
+        input,
+        kernel,
+        GetBias2<ArmnnBType>(biasEnabled, qScale * qScale),
+        expectedOutput,
+        qScale,
+        qOffset,
+        layout,
+        padLeft,
+        padTop,
+        padRight,
+        padBottom,
+        strideX,
+        strideY,
+        dilationX,
+        dilationY);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2d3x3DilationTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const std::vector<float>& inputNoQuantizedValues,
+        armnn::TensorInfo& inputTensorInfo,
+        const std::vector<float>& kernelNoQuantizedValues,
+        armnn::TensorInfo& kernelTensorInfo,
+        const std::vector<float>& outputExpectedNoQuantizedValues,
+        armnn::TensorInfo& outputTensorInfo,
+        uint32_t dilationX,
+        uint32_t dilationY,
+        armnn::DataLayout layout = armnn::DataLayout::NCHW,
+        bool biasEnabled = false)
+{
+    float qScale;
+    int32_t qOffset;
+    switch (ArmnnType)
+    {
+        case armnn::DataType::QuantisedAsymm8:
+        {
+            qScale = 0.1f;
+            qOffset = 128;
+            break;
+        }
+        case armnn::DataType::QuantisedSymm16:
+        {
+            qScale = 0.1f;
+            qOffset = 0;
+            break;
+        }
+        case armnn::DataType::Float32:
+        default:
+        {
+            qScale = 0.f;
+            qOffset = 0;
+            break;
+        }
+    }
+
+    inputTensorInfo.SetQuantizationScale(qScale);
+    inputTensorInfo.SetQuantizationOffset(qOffset);
+    kernelTensorInfo.SetQuantizationScale(qScale);
+    kernelTensorInfo.SetQuantizationOffset(qOffset);
+    outputTensorInfo.SetQuantizationScale(qScale);
+    outputTensorInfo.SetQuantizationOffset(qOffset);
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+                                  std::vector<T>(QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                    inputTensorInfo.GetQuantizationOffset(),
+                                                                    inputNoQuantizedValues)));
+    auto kernel = MakeTensor<T, 4>(kernelTensorInfo,
+                                   std::vector<T>(QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(),
+                                                                     kernelTensorInfo.GetQuantizationOffset(),
+                                                                     kernelNoQuantizedValues)));
+    auto expectedOutput = MakeTensor<T, 4>(outputTensorInfo,
+                                           std::vector<T>(QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                             outputTensorInfo.GetQuantizationOffset(),
+                                                                             outputExpectedNoQuantizedValues)));
+
+    uint32_t padLeft = 0;
+    uint32_t padTop = 0;
+    uint32_t padRight = 0;
+    uint32_t padBottom = 0;
+    uint32_t strideX  = 1;
+    uint32_t strideY  = 1;
+
+    return DepthwiseConvolution2dTestImpl<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            input,
+            kernel,
+            GetBias<ArmnnBType>(biasEnabled, qScale * qScale, outputTensorInfo, layout),
+            expectedOutput,
+            qScale,
+            qOffset,
+            layout,
+            padLeft,
+            padTop,
+            padRight,
+            padBottom,
+            strideX,
+            strideY,
+            dilationX,
+            dilationY);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T>
+LayerTestResult<T, 4> DepthwiseConvolution2d3x3Dilation3x3Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout)
+{
+    armnn::TensorInfo inputTensorInfo({1, 1, 10, 10}, ArmnnType);
+    std::vector<float> inputNoQuantizedValues =
+            {
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+                    0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+                    0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+            };
+
+    armnn::TensorInfo kernelTensorInfo({ 1, 1, 3, 3}, ArmnnType);
+    std::vector<float> kernelNoQuantizedValues =
+            {
+                    1, 2, 3,
+                    4, 5, 6,
+                    7, 8, 9
+            };
+
+    // Since the dilation rate is 3, the effective kernel grows to 7x7 (K + (K-1)*(D-1) = 3 + 2*2),
+    // so the output is 4x4: (I - K + 2P)/S + 1 => (10 - 7 + 0)/1 + 1 = 4
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 4, 4}, ArmnnType);
+    std::vector<float> outputExpectedNoQuantizedValues =
+            {
+                    6., 5., 5., 5.,
+                    6., 5., 5., 5.,
+                    6., 5., 5., 5.,
+                    3., 2., 2., 2.
+            };
+
+    return DepthwiseConvolution2d3x3DilationTestCommon<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            inputNoQuantizedValues,
+            inputTensorInfo,
+            kernelNoQuantizedValues,
+            kernelTensorInfo,
+            outputExpectedNoQuantizedValues,
+            outputTensorInfo,
+            3,
+            3,
+            layout,
+            biasEnabled);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T>
+LayerTestResult<T, 4> DepthwiseConvolution2d2x3x3Dilation3x3Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout)
+{
+    armnn::TensorInfo inputTensorInfo({1, 2, 10, 10}, ArmnnType);
+    std::vector<float> inputNoQuantizedValues =
+            {
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+                    0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+                    0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+                    0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+                    0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+                    0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+            };
+
+    armnn::TensorInfo kernelTensorInfo({ 1, 2, 3, 3}, ArmnnType);
+    std::vector<float> kernelNoQuantizedValues =
+            {
+                    1, 2, 3,
+                    4, 5, 6,
+                    7, 8, 9,
+
+                    1, 2, 3,
+                    4, 5, 6,
+                    7, 8, 9
+            };
+
+    // Since the dilation rate is 3, the effective kernel grows to 7x7 (K + (K-1)*(D-1) = 3 + 2*2),
+    // so the output is 2x4x4: (I - K + 2P)/S + 1 => (10 - 7 + 0)/1 + 1 = 4 in each spatial dimension
+    armnn::TensorInfo outputTensorInfo({ 1, 2, 4, 4}, ArmnnType);
+    std::vector<float> outputExpectedNoQuantizedValues =
+            {
+                    6., 5., 5., 5.,
+                    6., 5., 5., 5.,
+                    6., 5., 5., 5.,
+                    3., 2., 2., 2.,
+
+                    6., 5., 5., 5.,
+                    6., 5., 5., 5.,
+                    6., 5., 5., 5.,
+                    3., 2., 2., 2.
+            };
+
+    return DepthwiseConvolution2d3x3DilationTestCommon<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            inputNoQuantizedValues,
+            inputTensorInfo,
+            kernelNoQuantizedValues,
+            kernelTensorInfo,
+            outputExpectedNoQuantizedValues,
+            outputTensorInfo,
+            3,
+            3,
+            layout,
+            biasEnabled);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T>
+LayerTestResult<T, 4> DepthwiseConvolution2dMult4Test(
+            armnn::IWorkloadFactory& workloadFactory,
+            const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+            bool biasEnabled,
+            const armnn::DataLayout layout)
+{
+    armnn::TensorInfo inputTensorInfo({1, 2, 3, 3}, ArmnnType);
+    std::vector<float> inputNoQuantizedValues =
+            {
+                    10.0, 10.0, 10.0,
+                    10.0, 10.0, 10.0,
+                    10.0, 10.0, 10.0,
+
+                    21.0, 22.0, 23.0,
+                    24.0, 25.0, 26.0,
+                    27.0, 28.0, 29.0
+            };
+
+    armnn::TensorInfo kernelTensorInfo({ 4, 2, 2, 2}, ArmnnType);
+
+    std::vector<float> kernelNoQuantizedValues =
+            {
+                    0.25f, 0.25f,
+                    0.25f, 0.25f,
+
+                    0.25f, 0.25f,
+                    0.25f, 0.25f,
+
+                    0.0f , 0.0f,
+                    0.0f , 0.1f,
+
+                    0.0f , 0.0f,
+                    0.0f , 0.1f,
+
+                    0.2f , 0.0f,
+                    0.0f , 0.0f,
+
+                    0.2f , 0.0f,
+                    0.0f , 0.0f,
+
+                    0.0f , 0.3f,
+                    0.0f , 0.0f,
+
+                    0.0f , 0.3f,
+                    0.0f , 0.0f
+            };
+
+    armnn::TensorInfo outputTensorInfo({ 1, 8, 2, 2}, ArmnnType);
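+    // Some expected values carry float rounding artifacts (e.g. 2.6000001f) so that
+    // they match exact single-precision arithmetic.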
+    std::vector<float> outputExpectedNoQuantizedValues =
+            {
+                    10.f, 10.f,
+                    10.f, 10.f,
+
+                    1.f, 1.f,
+                    1.f, 1.f,
+
+                    2.f, 2.f,
+                    2.f, 2.f,
+
+                    3.f, 3.f,
+                    3.f, 3.f,
+
+                    23.f, 24.f,
+                    26.f, 27.f,
+
+                    2.5f, 2.6000001f,
+                    2.8f, 2.9f,
+
+                    4.2000003f, 4.4f,
+                    4.8f, 5.f,
+
+                    6.6000004f, 6.9f,
+                    7.5000005f, 7.8f
+            };
+
+    return DepthwiseConvolution2d3x3DilationTestCommon<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            inputNoQuantizedValues,
+            inputTensorInfo,
+            kernelNoQuantizedValues,
+            kernelTensorInfo,
+            outputExpectedNoQuantizedValues,
+            outputTensorInfo,
+            1,
+            1,
+            layout,
+            biasEnabled);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T>
+LayerTestResult<T, 4> DepthwiseConvolution2dMult2Test(
+            armnn::IWorkloadFactory& workloadFactory,
+            const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+            bool biasEnabled,
+            const armnn::DataLayout layout)
+{
+    armnn::TensorInfo inputTensorInfo({1, 2, 3, 3}, ArmnnType);
+    std::vector<float> inputNoQuantizedValues =
+            {
+                    10.0, 10.0, 10.0,
+                    10.0, 10.0, 10.0,
+                    10.0, 10.0, 10.0,
+
+                    21.0, 22.0, 23.0,
+                    24.0, 25.0, 26.0,
+                    27.0, 28.0, 29.0
+            };
+
+    armnn::TensorInfo kernelTensorInfo({ 2, 2, 2, 2}, ArmnnType);
+
+    std::vector<float> kernelNoQuantizedValues =
+            {
+                    0.25f, 0.25f,
+                    0.25f, 0.25f,
+
+                    0.2f , 0.0f,
+                    0.0f , 0.0f,
+
+                    0.0f , 0.0f,
+                    0.0f , 0.1f,
+
+                    0.0f , 0.3f,
+                    0.0f , 0.0f
+            };
+
+    armnn::TensorInfo outputTensorInfo({ 1, 4, 2, 2}, ArmnnType);
+    std::vector<float> outputExpectedNoQuantizedValues =
+            {
+                    10.f, 10.f,
+                    10.f, 10.f,
+
+                    1.f, 1.f,
+                    1.f, 1.f,
+
+                    4.2000003f, 4.4f,
+                    4.8f, 5.f,
+
+                    6.6000004f, 6.9f,
+                    7.5000005f, 7.8f
+            };
+
+    return DepthwiseConvolution2d3x3DilationTestCommon<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            inputNoQuantizedValues,
+            inputTensorInfo,
+            kernelNoQuantizedValues,
+            kernelTensorInfo,
+            outputExpectedNoQuantizedValues,
+            outputTensorInfo,
+            1,
+            1,
+            layout,
+            biasEnabled);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> CompareDepthwiseConvolution2dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnnUtils::DataLayoutIndexed& layout)
+{
+    unsigned int inputHeight = 8;
+    unsigned int inputWidth = 16;
+    unsigned int inputChannels = 3;
+    unsigned int inputNum = 5;
+
+    unsigned int kernelHeight = 3;
+    unsigned int kernelWidth = 3;
+    unsigned int channelMultiplier = 1;
+
+    unsigned int strideX = 2;
+    unsigned int strideY = 3;
+    unsigned int padX = 1;
+    unsigned int padY = 1;
+
+    unsigned int outputNum = inputNum;
+    unsigned int outputChannels = inputChannels * channelMultiplier;
+    unsigned int outputHeight = (inputHeight + 2 * padY - kernelHeight + strideY) / strideY;
+    unsigned int outputWidth = (inputWidth + 2 * padX - kernelWidth + strideX) / strideX;
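+    // With the values above: outputHeight = (8 + 2*1 - 3 + 3) / 3 = 3 and
+    // outputWidth = (16 + 2*1 - 3 + 2) / 2 = 8 (integer division).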
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+    armnn::TensorInfo kernelDesc;
+    armnn::TensorInfo biasDesc;
+
+    std::vector<unsigned int> inputShape;
+    std::vector<unsigned int> outputShape;
+    std::vector<unsigned int> kernelShape{ channelMultiplier, inputChannels, kernelHeight, kernelWidth };
+    std::vector<unsigned int> biasShape{ outputChannels };
+    switch (layout.GetDataLayout())
+    {
+        case armnn::DataLayout::NCHW:
+            inputShape =  { inputNum, inputChannels, inputHeight, inputWidth };
+            outputShape = { outputNum, outputChannels, outputHeight, outputWidth };
+            break;
+        case armnn::DataLayout::NHWC:
+            inputShape =  { inputNum, inputHeight, inputWidth, inputChannels };
+            outputShape = { outputNum, outputHeight, outputWidth, outputChannels };
+            break;
+        default:
+            throw armnn::InvalidArgumentException("unknown data layout ["
+                                                  + std::to_string(static_cast<int>(layout.GetDataLayout())) + "]");
+    }
+
+    float inputsQScale = armnn::IsQuantizedType<T>() ? 1.0f : 0;
+    float outputQScale = armnn::IsQuantizedType<T>() ? 2.0f : 0;
+    int32_t qOffset = 0;
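+    // For non-quantized types a scale of 0 is effectively a "don't care": the
+    // quantization parameters are ignored downstream.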
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape.data(), ArmnnType, inputsQScale, qOffset);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape.data(), ArmnnType, outputQScale, qOffset);
+    kernelDesc = armnn::TensorInfo(4, kernelShape.data(), ArmnnType, inputsQScale, qOffset);
+    biasDesc = armnn::TensorInfo(
+        1, biasShape.data(), armnn::GetBiasDataType(ArmnnType), inputsQScale, qOffset);
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+
+    auto input = MakeRandomTensor<T, 4>(inputTensorInfo, 124908, 0.0f, 255.0f);
+    auto kernel = MakeRandomTensor<T, 4>(kernelDesc, 891234, 0.0f, 255.0f);
+    auto bias = MakeRandomTensor<typename FullyConnectedBiasTypeForInputType<T>::Type, 1>(
+            biasDesc, 1028, 0.0f, 255.0f);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::DepthwiseConvolution2dQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
+    armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
+
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
+    AllocateAndCopyDataToITensorHandle(&biasTensor, &bias[0]);
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Weight = &weightsTensor;
+    data.m_Bias = &biasTensor;
+    data.m_Parameters.m_StrideX = strideX;
+    data.m_Parameters.m_StrideY = strideY;
+    data.m_Parameters.m_PadLeft = padX;
+    data.m_Parameters.m_PadRight = padX;
+    data.m_Parameters.m_PadTop = padY;
+    data.m_Parameters.m_PadBottom = padY;
+    data.m_Parameters.m_BiasEnabled = true;
+    data.m_Parameters.m_DataLayout = layout.GetDataLayout();
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandleRef = refWorkloadFactory.CreateTensorHandle(outputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> inputHandleRef = refWorkloadFactory.CreateTensorHandle(inputTensorInfo);
+
+    armnn::DepthwiseConvolution2dQueueDescriptor refData = data;
+    armnn::WorkloadInfo refInfo = info;
+    SetWorkloadInput(refData, refInfo, 0, inputTensorInfo, inputHandleRef.get());
+    SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, outputHandleRef.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDepthwiseConvolution2d(data, info);
+    std::unique_ptr<armnn::IWorkload> workloadRef = refWorkloadFactory.CreateDepthwiseConvolution2d(refData, refInfo);
+
+    outputHandleRef->Allocate();
+    inputHandleRef->Allocate();
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+    CopyDataToITensorHandle(inputHandleRef.get(), &input[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    workloadRef->PostAllocationConfigure();
+    workloadRef->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+    CopyDataFromITensorHandle(&ret.outputExpected[0][0][0][0], outputHandleRef.get());
+
+    return ret;
+}
+
+//
+// Explicit template specializations
+//
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+Convolution2d3x3Dilation3x3Test<armnn::DataType::Float32, armnn::DataType::Float32>(
+    armnn::IWorkloadFactory&,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+    bool,
+    armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 4>
+Convolution2d3x3Dilation3x3Test<armnn::DataType::QuantisedAsymm8, armnn::DataType::Signed32>(
+    armnn::IWorkloadFactory&,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+    bool,
+    armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 4>
+Convolution2d3x3Dilation3x3Test<armnn::DataType::QuantisedSymm16, armnn::DataType::Signed32>(
+    armnn::IWorkloadFactory&,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+    bool,
+    armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+Convolution2d2x3x3Dilation3x3Test<armnn::DataType::Float32, armnn::DataType::Float32>(
+    armnn::IWorkloadFactory&,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+    bool,
+    armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 4>
+Convolution2d2x3x3Dilation3x3Test<armnn::DataType::QuantisedAsymm8, armnn::DataType::Signed32>(
+    armnn::IWorkloadFactory&,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+    bool,
+    armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 4>
+Convolution2d2x3x3Dilation3x3Test<armnn::DataType::QuantisedSymm16, armnn::DataType::Signed32>(
+    armnn::IWorkloadFactory&,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+    bool,
+    armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+Convolution2d2x2Dilation2x2Padding2x2Stride3x3Test<armnn::DataType::Float32, armnn::DataType::Float32>(
+    armnn::IWorkloadFactory &workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr &memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 4>
+Convolution2d2x2Dilation2x2Padding2x2Stride3x3Test<armnn::DataType::QuantisedAsymm8, armnn::DataType::Signed32>(
+    armnn::IWorkloadFactory &workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr &memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 4>
+Convolution2d2x2Dilation2x2Padding2x2Stride3x3Test<armnn::DataType::QuantisedSymm16, armnn::DataType::Signed32>(
+    armnn::IWorkloadFactory &workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr &memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+DepthwiseConvolution2d3x3Dilation3x3Test<armnn::DataType::Float32, armnn::DataType::Float32>(
+        armnn::IWorkloadFactory&,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+        bool,
+        armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 4>
+DepthwiseConvolution2d3x3Dilation3x3Test<armnn::DataType::QuantisedAsymm8, armnn::DataType::Signed32>(
+        armnn::IWorkloadFactory&,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+        bool,
+        armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 4>
+DepthwiseConvolution2d3x3Dilation3x3Test<armnn::DataType::QuantisedSymm16, armnn::DataType::Signed32>(
+        armnn::IWorkloadFactory&,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+        bool,
+        armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+DepthwiseConvolution2d2x3x3Dilation3x3Test<armnn::DataType::Float32, armnn::DataType::Float32>(
+        armnn::IWorkloadFactory&,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+        bool,
+        armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 4>
+DepthwiseConvolution2d2x3x3Dilation3x3Test<armnn::DataType::QuantisedAsymm8, armnn::DataType::Signed32>(
+        armnn::IWorkloadFactory&,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+        bool,
+        armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 4>
+DepthwiseConvolution2d2x3x3Dilation3x3Test<armnn::DataType::QuantisedSymm16, armnn::DataType::Signed32>(
+        armnn::IWorkloadFactory&,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr&,
+        bool,
+        armnn::DataLayout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+DepthwiseConvolution2dMult4Test<armnn::DataType::Float32, armnn::DataType::Float32>(
+        armnn::IWorkloadFactory &workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr &memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+DepthwiseConvolution2dMult2Test<armnn::DataType::Float32, armnn::DataType::Float32>(
+        armnn::IWorkloadFactory &workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr &memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout);
+
+//
+// Implementation functions
+//
+
+LayerTestResult<float, 4> SimpleConvolution2d3x5Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return SimpleConvolution2d3x5TestCommon<armnn::DataType::Float32, armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 0.f, 0, biasEnabled, layout);
+}
+
+LayerTestResult<uint8_t, 4> SimpleConvolution2d3x5Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return SimpleConvolution2d3x5TestCommon<armnn::DataType::QuantisedAsymm8, armnn::DataType::Signed32>(
+        workloadFactory, memoryManager, 0.5f, 50, biasEnabled, layout);
+}
+
+LayerTestResult<float, 4> SimpleConvolution2d3x3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return SimpleConvolution2d3x3TestCommon<armnn::DataType::Float32, armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 0.f, 0, biasEnabled, layout);
+}
+
+LayerTestResult<float, 4> SimpleConvolution2d3x3NhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled)
+{
+    return SimpleConvolution2d3x3NhwcTestCommon<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        0.f,
+        0,
+        biasEnabled,
+        armnn::DataLayout::NHWC);
+}
+
+LayerTestResult<float, 4> SimpleConvolution2d3x3Stride2x2Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout)
+{
+    return SimpleConvolution2d3x3Stride2x2TestCommon<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        0.f,
+        0,
+        biasEnabled,
+        layout);
+}
+
+LayerTestResult<uint8_t, 4> SimpleConvolution2d3x3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return SimpleConvolution2d3x3TestCommon<armnn::DataType::QuantisedAsymm8, armnn::DataType::Signed32>(
+        workloadFactory, memoryManager, 0.5f, 50, biasEnabled, layout);
+}
+
+LayerTestResult<int16_t, 4> SimpleConvolution2d3x5QSymm16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return SimpleConvolution2d3x5TestCommon<armnn::DataType::QuantisedSymm16, armnn::DataType::Signed32>(
+        workloadFactory, memoryManager, 0.5f, 50, biasEnabled, layout);
+}
+
+LayerTestResult<int16_t, 4> SimpleConvolution2d3x3QSymm16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return SimpleConvolution2d3x3TestCommon<armnn::DataType::QuantisedSymm16, armnn::DataType::Signed32>(
+            workloadFactory, memoryManager, 0.5f, 50, biasEnabled, layout);
+}
+
+LayerTestResult<float, 4> Convolution2dAsymmetricPaddingTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout layout)
+{
+    return SimpleConvolution2dAsymmetricPaddingTestCommon<armnn::DataType::Float32, armnn::DataType::Float32>(
+            workloadFactory, memoryManager, layout, 0.0f, 0);
+}
+
+LayerTestResult<float, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout layout)
+{
+    return Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTestCommon
+            <armnn::DataType::Float32, armnn::DataType::Float32>(
+            workloadFactory, memoryManager, layout, 0.0f, 0);
+}
+
+LayerTestResult<float, 4> Convolution1dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled)
+{
+    return Convolution1dTestImpl<armnn::DataType::Float32, armnn::DataType::Float32>(
+            workloadFactory, memoryManager, 0.0f, 0, biasEnabled);
+}
+
+LayerTestResult<uint8_t, 4> Convolution1dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled)
+{
+    return Convolution1dTestImpl<armnn::DataType::QuantisedAsymm8, armnn::DataType::Signed32>(
+            workloadFactory, memoryManager, 0.1f, 128, biasEnabled);
+}
+
+LayerTestResult<float, 4> CompareConvolution2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory)
+{
+    return CompareConvolution2dTestImpl<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, refWorkloadFactory);
+}
+
+LayerTestResult<float, 4> DepthwiseConvolution2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return DepthwiseConvolution2dTestImpl<armnn::DataType::Float32, armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 0.0f, 0, biasEnabled, layout);
+}
+
+LayerTestResult<float, 4> DepthwiseConvolution2dDepthNhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled)
+{
+    return DepthwiseConvolution2dNhwcTestCommon<armnn::DataType::Float32, armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 0.0f, 0, biasEnabled);
+}
+
+LayerTestResult<float, 4> DepthwiseConvolution2dDepthMul1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return DepthwiseConvolution2dDepthMul1TestImpl<armnn::DataType::Float32, armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 0.0f, 0, biasEnabled, layout);
+}
+
+LayerTestResult<float, 4> DepthwiseConvolution2dDepthMul64Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 2, 2 }, armnn::DataType::Float32);
+    auto input = MakeTensor<float, 4>(inputTensorInfo, { 1.f, 2.f, 3.f, 4.f });
+
+    std::vector<float> kernelData;
+    std::vector<float> singleDepthKernel{ 1.f, -1.f, -1.f, 1.f };
+    for (unsigned int i = 0; i < 64; ++i)
+    {
+        kernelData.insert(kernelData.end(), singleDepthKernel.begin(), singleDepthKernel.end());
+    }
+    armnn::TensorInfo kernelTensorInfo({ 64, 1, 2, 2 }, armnn::DataType::Float32);
+    auto kernel = MakeTensor<float, 4>(kernelTensorInfo, kernelData);
+
+    std::vector<float> expectedOutputData(64, 0.f);
+    armnn::TensorInfo outputTensorInfo({ 1, 64, 1, 1 }, armnn::DataType::Float32);
+    auto expectedOutput = MakeTensor<float, 4>(outputTensorInfo, expectedOutputData);
+
+    return DepthwiseConvolution2dTestImpl<armnn::DataType::Float32, armnn::DataType::Float32>(
+            workloadFactory,
+            memoryManager,
+            input,
+            kernel,
+            boost::multi_array<float, 1>(),
+            expectedOutput,
+            0.f,
+            0,
+            armnn::DataLayout::NCHW);
+}
+
+LayerTestResult<float, 4> DepthwiseConvolution2dAsymmetricTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return DepthwiseConvolution2dAsymmetricTestCommon<armnn::DataType::Float32, armnn::DataType::Float32>(
+        workloadFactory, memoryManager, 0.0f, 0, biasEnabled, layout);
+}
+
+LayerTestResult<uint8_t, 4> DepthwiseConvolution2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return DepthwiseConvolution2dTestImpl<armnn::DataType::QuantisedAsymm8, armnn::DataType::Signed32>(
+        workloadFactory, memoryManager, 0.5f, 50, biasEnabled, layout);
+}
+
+LayerTestResult<uint8_t, 4> DepthwiseConvolution2dDepthMul1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    return DepthwiseConvolution2dDepthMul1TestImpl<armnn::DataType::QuantisedAsymm8, armnn::DataType::Signed32>(
+        workloadFactory, memoryManager, 0.5f, 50, biasEnabled, layout);
+}
+
+LayerTestResult<float, 4> SimpleDepthwiseConvolution2d3x3Dilation3x3NhwcTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SimpleDepthwiseConvolution2d3x3Dilation3x3NhwcTestCommon<armnn::DataType::Float32, armnn::DataType::Float32>(
+            workloadFactory,
+            memoryManager,
+            0.f,
+            0,
+            false);
+}
+
+LayerTestResult<int16_t, 4> DepthwiseConvolution2dInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout)
+{
+    return DepthwiseConvolution2dTestImpl<armnn::DataType::QuantisedSymm16, armnn::DataType::Signed32>(
+        workloadFactory, memoryManager, 0.5f, 50, biasEnabled, layout);
+}
+
+LayerTestResult<int16_t, 4> DepthwiseConvolution2dDepthMul1Int16Test(
+                armnn::IWorkloadFactory& workloadFactory,
+                const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                bool biasEnabled,
+                const armnn::DataLayout layout)
+{
+    return DepthwiseConvolution2dDepthMul1TestImpl<armnn::DataType::QuantisedSymm16, armnn::DataType::Signed32>(
+        workloadFactory, memoryManager, 0.5f, 50, biasEnabled, layout);
+}
+
+LayerTestResult<float, 4> CompareDepthwiseConvolution2dFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::DataLayout layout)
+{
+    return CompareDepthwiseConvolution2dTestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, refWorkloadFactory, layout);
+}
+
+LayerTestResult<uint8_t, 4> CompareDepthwiseConvolution2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::DataLayout layout)
+{
+    return CompareDepthwiseConvolution2dTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, refWorkloadFactory, layout);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.hpp
new file mode 100644
index 0000000..f5ff586
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/Conv2dTestImpl.hpp
@@ -0,0 +1,218 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/Types.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+//
+// Convolution2d
+//
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Convolution2d3x3Dilation3x3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Convolution2d2x3x3Dilation3x3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Convolution2d2x2Dilation2x2Padding2x2Stride3x3Test(
+    armnn::IWorkloadFactory &workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr &memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> SimpleConvolution2d3x5Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> SimpleConvolution2d3x3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> SimpleConvolution2d3x3Stride2x2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> SimpleConvolution2d3x3NhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled);
+
+LayerTestResult<uint8_t, 4> SimpleConvolution2d3x5Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<uint8_t, 4> SimpleConvolution2d3x3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<int16_t, 4> SimpleConvolution2d3x5QSymm16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout);
+
+LayerTestResult<int16_t, 4> SimpleConvolution2d3x3QSymm16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout layout);
+
+LayerTestResult<float, 4> Convolution2dAsymmetricPaddingTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout layout);
+
+LayerTestResult<float, 4> Convolution1dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled);
+
+LayerTestResult<uint8_t, 4> Convolution1dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled);
+
+LayerTestResult<float, 4> CompareConvolution2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory);
+
+//
+// DepthwiseConvolution2d
+//
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2d3x3Dilation3x3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2d2x3x3Dilation3x3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2dMult4Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2dMult2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+template<typename T>
+LayerTestResult<T, 4> CompareDepthwiseConvolution2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> DepthwiseConvolution2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> DepthwiseConvolution2dDepthNhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled);
+
+LayerTestResult<float, 4> DepthwiseConvolution2dDepthMul1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> DepthwiseConvolution2dDepthMul64Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> DepthwiseConvolution2dAsymmetricTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> SimpleDepthwiseConvolution2d3x3Dilation3x3NhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> DepthwiseConvolution2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<uint8_t, 4> DepthwiseConvolution2dDepthMul1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<int16_t, 4> DepthwiseConvolution2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<int16_t, 4> DepthwiseConvolution2dDepthMul1Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> CompareDepthwiseConvolution2dFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::DataLayout layout);
+
+LayerTestResult<uint8_t, 4> CompareDepthwiseConvolution2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    const armnn::DataLayout layout);
diff --git a/src/backends/backendsCommon/test/layerTests/ConvertFp16ToFp32TestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ConvertFp16ToFp32TestImpl.cpp
new file mode 100644
index 0000000..61eb03a
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConvertFp16ToFp32TestImpl.cpp
@@ -0,0 +1,55 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ConvertFp16ToFp32TestImpl.hpp"
+
+#include <Half.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+LayerTestResult<float, 4> SimpleConvertFp16ToFp32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    using namespace half_float::literal;
+
+    const armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);
+    const armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);
+
+    auto input = MakeTensor<armnn::Half, 4>(inputTensorInfo,
+        { -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
+          1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h });
+
+    LayerTestResult<float, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<float, 4>(outputTensorInfo,
+        { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
+          1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f });
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ConvertFp16ToFp32QueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvertFp16ToFp32(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
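For orientation: these standalone functions are meant to be driven from a backend's unit-test suite. A minimal driver sketch, assuming the WorkloadFactoryHelper and CompareTensors utilities used elsewhere in this patch (the concrete FactoryType, e.g. the reference backend's RefWorkloadFactory, is an assumption and not part of this change):

    #include "ConvertFp16ToFp32TestImpl.hpp"

    #include <backendsCommon/test/WorkloadFactoryHelper.hpp>

    #include <test/TensorHelpers.hpp>

    #include <boost/test/unit_test.hpp>

    // Hypothetical driver; each LayerTestResult carries both the actual and the
    // expected tensor, so the assertion lives with the caller.
    template <typename FactoryType>
    void RunSimpleConvertFp16ToFp32()
    {
        auto memoryManager = WorkloadFactoryHelper<FactoryType>::GetMemoryManager();
        FactoryType workloadFactory = WorkloadFactoryHelper<FactoryType>::GetFactory(memoryManager);

        LayerTestResult<float, 4> result = SimpleConvertFp16ToFp32Test(workloadFactory, memoryManager);
        BOOST_TEST(CompareTensors(result.output, result.outputExpected));
    }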
diff --git a/src/backends/backendsCommon/test/layerTests/ConvertFp16ToFp32TestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ConvertFp16ToFp32TestImpl.hpp
new file mode 100644
index 0000000..8e77e26
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConvertFp16ToFp32TestImpl.hpp
@@ -0,0 +1,15 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 4> SimpleConvertFp16ToFp32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/ConvertFp32ToFp16TestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ConvertFp32ToFp16TestImpl.cpp
new file mode 100644
index 0000000..e5184e0
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConvertFp32ToFp16TestImpl.cpp
@@ -0,0 +1,53 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ConvertFp32ToFp16TestImpl.hpp"
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+LayerTestResult<armnn::Half, 4> SimpleConvertFp32ToFp16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    using namespace half_float::literal;
+
+    const armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);
+    const armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);
+
+    auto input = MakeTensor<float, 4>(inputTensorInfo,
+        { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
+          1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f });
+
+    LayerTestResult<armnn::Half, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<armnn::Half, 4>(outputTensorInfo,
+        { -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
+          1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h });
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ConvertFp32ToFp16QueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvertFp32ToFp16(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
diff --git a/src/backends/backendsCommon/test/layerTests/ConvertFp32ToFp16TestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ConvertFp32ToFp16TestImpl.hpp
new file mode 100644
index 0000000..bb0fce2
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ConvertFp32ToFp16TestImpl.hpp
@@ -0,0 +1,17 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <Half.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<armnn::Half, 4> SimpleConvertFp32ToFp16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/DebugTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/DebugTestImpl.cpp
new file mode 100644
index 0000000..1997c4b
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/DebugTestImpl.cpp
@@ -0,0 +1,369 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "DebugTestImpl.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <iostream>
+#include <sstream>
+
+namespace
+{
+
+template<typename T, std::size_t Dim>
+LayerTestResult<T, Dim> DebugTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::TensorInfo& inputTensorInfo,
+    armnn::TensorInfo& outputTensorInfo,
+    std::vector<float>& inputData,
+    std::vector<float>& outputExpectedData,
+    armnn::DebugQueueDescriptor descriptor,
+    const std::string& expectedStringOutput,
+    const float qScale = 1.0f,
+    const int32_t qOffset = 0)
+{
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    boost::multi_array<T, Dim> input =
+        MakeTensor<T, Dim>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData));
+
+    LayerTestResult<T, Dim> ret(outputTensorInfo);
+    ret.outputExpected =
+        MakeTensor<T, Dim>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, outputExpectedData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle =
+        workloadFactory.CreateTensorHandle(inputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle =
+        workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDebug(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.data());
+
+    std::ostringstream oss;
+    std::streambuf* coutStreambuf = std::cout.rdbuf();
+    std::cout.rdbuf(oss.rdbuf());
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    std::cout.rdbuf(coutStreambuf);
+
+    BOOST_TEST(oss.str() == expectedStringOutput);
+
+    CopyDataFromITensorHandle(ret.output.data(), outputHandle.get());
+
+    return ret;
+}
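The stream capture above is plain standard C++: swap std::cout's buffer for an ostringstream's, run the workload, then restore the buffer before asserting on the captured text. A self-contained sketch of the same technique:

    #include <cassert>
    #include <iostream>
    #include <sstream>
    #include <string>

    // Redirect std::cout while fn runs and hand back whatever it printed.
    std::string CaptureStdout(void (*fn)())
    {
        std::ostringstream oss;
        std::streambuf* oldBuf = std::cout.rdbuf(oss.rdbuf()); // redirect
        fn();
        std::cout.rdbuf(oldBuf);                               // always restore
        return oss.str();
    }

    int main()
    {
        const std::string text = CaptureStdout([] { std::cout << "hello\n"; });
        assert(text == "hello\n");
    }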
+
+template <armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Debug4dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {1, 2, 2, 3};
+    unsigned int outputShape[] = {1, 2, 2, 3};
+
+    armnn::DebugQueueDescriptor desc;
+    desc.m_Guid = 1;
+    desc.m_LayerName = "TestOutput";
+    desc.m_SlotIndex = 0;
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f,   2.0f,  3.0f,
+        4.0f,   5.0f,  6.0f,
+        7.0f,   8.0f,  9.0f,
+        10.0f, 11.0f, 12.0f,
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f,   2.0f,  3.0f,
+        4.0f,   5.0f,  6.0f,
+        7.0f,   8.0f,  9.0f,
+        10.0f, 11.0f, 12.0f,
+    });
+
+    const std::string expectedStringOutput =
+        "{ \"layerGuid\": 1,"
+        " \"layerName\": \"TestOutput\","
+        " \"outputSlot\": 0,"
+        " \"shape\": [1, 2, 2, 3],"
+        " \"min\": 1, \"max\": 12,"
+        " \"data\": [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]] }\n";
+
+    return DebugTestImpl<T, 4>(workloadFactory,
+                               memoryManager,
+                               inputTensorInfo,
+                               outputTensorInfo,
+                               input,
+                               outputExpected,
+                               desc,
+                               expectedStringOutput);
+}
+
+template <armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Debug3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {3, 3, 1};
+    unsigned int outputShape[] = {3, 3, 1};
+
+    armnn::DebugQueueDescriptor desc;
+    desc.m_Guid = 1;
+    desc.m_LayerName = "TestOutput";
+    desc.m_SlotIndex = 0;
+
+    inputTensorInfo = armnn::TensorInfo(3, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(3, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f,
+        4.0f, 5.0f, 6.0f,
+        7.0f, 8.0f, 9.0f,
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f,
+        4.0f, 5.0f, 6.0f,
+        7.0f, 8.0f, 9.0f,
+    });
+
+    const std::string expectedStringOutput =
+        "{ \"layerGuid\": 1,"
+        " \"layerName\": \"TestOutput\","
+        " \"outputSlot\": 0,"
+        " \"shape\": [3, 3, 1],"
+        " \"min\": 1, \"max\": 9,"
+        " \"data\": [[[1], [2], [3]], [[4], [5], [6]], [[7], [8], [9]]] }\n";
+
+    return DebugTestImpl<T, 3>(workloadFactory,
+                               memoryManager,
+                               inputTensorInfo,
+                               outputTensorInfo,
+                               input,
+                               outputExpected,
+                               desc,
+                               expectedStringOutput);
+}
+
+template <armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> Debug2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {2, 2};
+    unsigned int outputShape[] = {2, 2};
+
+    armnn::DebugQueueDescriptor desc;
+    desc.m_Guid = 1;
+    desc.m_LayerName = "TestOutput";
+    desc.m_SlotIndex = 0;
+
+    inputTensorInfo = armnn::TensorInfo(2, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(2, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+    });
+
+    const std::string expectedStringOutput =
+        "{ \"layerGuid\": 1,"
+        " \"layerName\": \"TestOutput\","
+        " \"outputSlot\": 0,"
+        " \"shape\": [2, 2],"
+        " \"min\": 1, \"max\": 4,"
+        " \"data\": [[1, 2], [3, 4]] }\n";
+
+    return DebugTestImpl<T, 2>(workloadFactory,
+                               memoryManager,
+                               inputTensorInfo,
+                               outputTensorInfo,
+                               input,
+                               outputExpected,
+                               desc,
+                               expectedStringOutput);
+}
+
+template <armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 1> Debug1dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {4};
+    unsigned int outputShape[] = {4};
+
+    armnn::DebugQueueDescriptor desc;
+    desc.m_Guid = 1;
+    desc.m_LayerName = "TestOutput";
+    desc.m_SlotIndex = 0;
+
+    inputTensorInfo = armnn::TensorInfo(1, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(1, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f,
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f,
+    });
+
+    const std::string expectedStringOutput =
+        "{ \"layerGuid\": 1,"
+        " \"layerName\": \"TestOutput\","
+        " \"outputSlot\": 0,"
+        " \"shape\": [4],"
+        " \"min\": 1, \"max\": 4,"
+        " \"data\": [1, 2, 3, 4] }\n";
+
+    return DebugTestImpl<T, 1>(workloadFactory,
+                               memoryManager,
+                               inputTensorInfo,
+                               outputTensorInfo,
+                               input,
+                               outputExpected,
+                               desc,
+                               expectedStringOutput);
+}
+
+} // anonymous namespace
+
+LayerTestResult<float, 4> Debug4dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug4dTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 3> Debug3dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug3dTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 2> Debug2dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug2dTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 1> Debug1dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug1dTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> Debug4dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug4dTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 3> Debug3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug3dTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 2> Debug2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug2dTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 1> Debug1dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug1dTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> Debug4dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug4dTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 3> Debug3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug3dTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 2> Debug2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug2dTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 1> Debug1dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug1dTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/DebugTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/DebugTestImpl.hpp
new file mode 100644
index 0000000..e355279
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/DebugTestImpl.hpp
@@ -0,0 +1,59 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 4> Debug4dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 3> Debug3dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> Debug2dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 1> Debug1dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> Debug4dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> Debug3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 2> Debug2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 1> Debug1dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> Debug4dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 3> Debug3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 2> Debug2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 1> Debug1dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/DequantizeTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/DequantizeTestImpl.cpp
new file mode 100644
index 0000000..42673d5
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/DequantizeTestImpl.cpp
@@ -0,0 +1,151 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "DequantizeTestImpl.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+template<typename T, std::size_t Dim>
+LayerTestResult<float, Dim> DequantizeTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorInfo& inputTensorInfo,
+    const armnn::TensorInfo& outputTensorInfo,
+    const std::vector<T>& inputData,
+    const std::vector<float>& expectedOutputData,
+    armnn::DequantizeQueueDescriptor descriptor)
+{
+    boost::multi_array<T, Dim> input = MakeTensor<T, Dim>(inputTensorInfo, inputData);
+
+    LayerTestResult<float, Dim> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<float, Dim>(outputTensorInfo, expectedOutputData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDequantize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.data());
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(ret.output.data(), outputHandle.get());
+
+    return ret;
+}
+
+template <armnn::DataType ArmnnInputType>
+LayerTestResult<float, 4> DequantizeSimpleTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    using T = armnn::ResolveType<ArmnnInputType>;
+
+    armnn::DequantizeQueueDescriptor desc;
+
+    const armnn::TensorInfo inputTensorInfo({1, 2, 2, 3}, ArmnnInputType, 0.5f, 0);
+    const armnn::TensorInfo outputTensorInfo({1, 2, 2, 3}, armnn::DataType::Float32);
+
+    std::vector<T> inputData = std::vector<T>(
+    {
+         2,  4,  6,
+         8, 10, 12,
+        14, 16, 18,
+        20, 22, 24,
+    });
+
+    std::vector<float> expectedOutputData = std::vector<float>(
+    {
+        1.0f,   2.0f,  3.0f,
+        4.0f,   5.0f,  6.0f,
+        7.0f,   8.0f,  9.0f,
+        10.0f, 11.0f, 12.0f,
+    });
+
+    return DequantizeTestImpl<T, 4>(workloadFactory,
+                                    memoryManager,
+                                    inputTensorInfo,
+                                    outputTensorInfo,
+                                    inputData,
+                                    expectedOutputData,
+                                    desc);
+}
+
+template <armnn::DataType ArmnnInputType>
+LayerTestResult<float, 4> DequantizeOffsetTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    using T = armnn::ResolveType<ArmnnInputType>;
+
+    armnn::DequantizeQueueDescriptor desc;
+
+    const armnn::TensorInfo inputTensorInfo({1, 2, 2, 3}, ArmnnInputType, 0.5f, 1);
+    const armnn::TensorInfo outputTensorInfo({1, 2, 2, 3}, armnn::DataType::Float32);
+
+    std::vector<T> inputData = std::vector<T>(
+    {
+         3,  5,  7,
+         9, 11, 13,
+        15, 17, 19,
+        21, 23, 25,
+    });
+
+    std::vector<float> expectedOutputData = std::vector<float>(
+    {
+        1.0f,   2.0f,  3.0f,
+        4.0f,   5.0f,  6.0f,
+        7.0f,   8.0f,  9.0f,
+        10.0f, 11.0f, 12.0f,
+    });
+
+    return DequantizeTestImpl<T, 4>(workloadFactory,
+                                    memoryManager,
+                                    inputTensorInfo,
+                                    outputTensorInfo,
+                                    inputData,
+                                    expectedOutputData,
+                                    desc);
+}
+
+} // anonymous namespace
+
+LayerTestResult<float, 4> DequantizeSimpleUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return DequantizeSimpleTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> DequantizeOffsetUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return DequantizeOffsetTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> DequantizeSimpleInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return DequantizeSimpleTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
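The expected tables above follow directly from the affine dequantization formula real = scale * (quantized - offset). A quick check, assuming the armnn::Dequantize helper from armnn/TypesUtils.hpp (the counterpart of the armnn::Quantize call used by the DetectionPostProcess tests below); the arithmetic stands on its own either way:

    #include <armnn/TypesUtils.hpp>

    #include <cassert>

    int main()
    {
        // DequantizeSimpleTest: scale 0.5, offset 0, so 2 -> (2 - 0) * 0.5 = 1.0f.
        assert(armnn::Dequantize<uint8_t>(2, 0.5f, 0) == 1.0f);
        // DequantizeOffsetTest: scale 0.5, offset 1, so 3 -> (3 - 1) * 0.5 = 1.0f.
        assert(armnn::Dequantize<uint8_t>(3, 0.5f, 1) == 1.0f);
    }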
diff --git a/src/backends/backendsCommon/test/layerTests/DequantizeTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/DequantizeTestImpl.hpp
new file mode 100644
index 0000000..55ea4b4
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/DequantizeTestImpl.hpp
@@ -0,0 +1,23 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 4> DequantizeSimpleUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> DequantizeOffsetUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> DequantizeSimpleInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/DetectionPostProcessTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/DetectionPostProcessTestImpl.hpp
new file mode 100644
index 0000000..bcb5abf
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/DetectionPostProcessTestImpl.hpp
@@ -0,0 +1,370 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <ResolveType.hpp>
+
+#include <armnn/Types.hpp>
+#include <armnn/TypesUtils.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadFactoryHelper.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+using FloatData = std::vector<float>;
+using QuantData = std::pair<float, int32_t>;
+
+struct TestData
+{
+    static const armnn::TensorShape s_BoxEncodingsShape;
+    static const armnn::TensorShape s_ScoresShape;
+    static const armnn::TensorShape s_AnchorsShape;
+
+    static const QuantData s_BoxEncodingsQuantData;
+    static const QuantData s_ScoresQuantData;
+    static const QuantData s_AnchorsQuantData;
+
+    static const FloatData s_BoxEncodings;
+    static const FloatData s_Scores;
+    static const FloatData s_Anchors;
+};
+
+struct RegularNmsExpectedResults
+{
+    static const FloatData s_DetectionBoxes;
+    static const FloatData s_DetectionScores;
+    static const FloatData s_DetectionClasses;
+    static const FloatData s_NumDetections;
+};
+
+struct FastNmsExpectedResults
+{
+    static const FloatData s_DetectionBoxes;
+    static const FloatData s_DetectionScores;
+    static const FloatData s_DetectionClasses;
+    static const FloatData s_NumDetections;
+};
+
+const armnn::TensorShape TestData::s_BoxEncodingsShape = { 1, 6, 4 };
+const armnn::TensorShape TestData::s_ScoresShape       = { 1, 6, 3 };
+const armnn::TensorShape TestData::s_AnchorsShape      = { 6, 4 };
+
+const QuantData TestData::s_BoxEncodingsQuantData = { 1.00f, 1 };
+const QuantData TestData::s_ScoresQuantData       = { 0.01f, 0 };
+const QuantData TestData::s_AnchorsQuantData      = { 0.50f, 0 };
+
+const FloatData TestData::s_BoxEncodings =
+{
+    0.0f,  0.0f, 0.0f, 0.0f,
+    0.0f,  1.0f, 0.0f, 0.0f,
+    0.0f, -1.0f, 0.0f, 0.0f,
+    0.0f,  0.0f, 0.0f, 0.0f,
+    0.0f,  1.0f, 0.0f, 0.0f,
+    0.0f,  0.0f, 0.0f, 0.0f
+};
+
+const FloatData TestData::s_Scores =
+{
+    0.0f, 0.90f, 0.80f,
+    0.0f, 0.75f, 0.72f,
+    0.0f, 0.60f, 0.50f,
+    0.0f, 0.93f, 0.95f,
+    0.0f, 0.50f, 0.40f,
+    0.0f, 0.30f, 0.20f
+};
+
+const FloatData TestData::s_Anchors =
+{
+    0.5f,   0.5f, 1.0f, 1.0f,
+    0.5f,   0.5f, 1.0f, 1.0f,
+    0.5f,   0.5f, 1.0f, 1.0f,
+    0.5f,  10.5f, 1.0f, 1.0f,
+    0.5f,  10.5f, 1.0f, 1.0f,
+    0.5f, 100.5f, 1.0f, 1.0f
+};
+
+const FloatData RegularNmsExpectedResults::s_DetectionBoxes =
+{
+    0.0f, 10.0f, 1.0f, 11.0f,
+    0.0f, 10.0f, 1.0f, 11.0f,
+    0.0f,  0.0f, 0.0f,  0.0f
+};
+
+const FloatData RegularNmsExpectedResults::s_DetectionScores =
+{
+    0.95f, 0.93f, 0.0f
+};
+
+const FloatData RegularNmsExpectedResults::s_DetectionClasses =
+{
+    1.0f, 0.0f, 0.0f
+};
+
+const FloatData RegularNmsExpectedResults::s_NumDetections = { 2.0f };
+
+const FloatData FastNmsExpectedResults::s_DetectionBoxes =
+{
+    0.0f,  10.0f, 1.0f,  11.0f,
+    0.0f,   0.0f, 1.0f,   1.0f,
+    0.0f, 100.0f, 1.0f, 101.0f
+};
+
+const FloatData FastNmsExpectedResults::s_DetectionScores =
+{
+    0.95f, 0.9f, 0.3f
+};
+
+const FloatData FastNmsExpectedResults::s_DetectionClasses =
+{
+    1.0f, 0.0f, 0.0f
+};
+
+const FloatData FastNmsExpectedResults::s_NumDetections = { 3.0f };
+
+} // anonymous namespace
+
+template<typename FactoryType,
+         armnn::DataType ArmnnType,
+         typename T = armnn::ResolveType<ArmnnType>>
+void DetectionPostProcessImpl(const armnn::TensorInfo& boxEncodingsInfo,
+                              const armnn::TensorInfo& scoresInfo,
+                              const armnn::TensorInfo& anchorsInfo,
+                              const std::vector<T>& boxEncodingsData,
+                              const std::vector<T>& scoresData,
+                              const std::vector<T>& anchorsData,
+                              const std::vector<float>& expectedDetectionBoxes,
+                              const std::vector<float>& expectedDetectionClasses,
+                              const std::vector<float>& expectedDetectionScores,
+                              const std::vector<float>& expectedNumDetections,
+                              bool useRegularNms)
+{
+    std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>();
+    armnn::ProfilerManager::GetInstance().RegisterProfiler(profiler.get());
+
+    auto memoryManager = WorkloadFactoryHelper<FactoryType>::GetMemoryManager();
+    FactoryType workloadFactory = WorkloadFactoryHelper<FactoryType>::GetFactory(memoryManager);
+
+    auto boxEncodings = MakeTensor<T, 3>(boxEncodingsInfo, boxEncodingsData);
+    auto scores = MakeTensor<T, 3>(scoresInfo, scoresData);
+    auto anchors = MakeTensor<T, 2>(anchorsInfo, anchorsData);
+
+    armnn::TensorInfo detectionBoxesInfo({ 1, 3, 4 }, armnn::DataType::Float32);
+    armnn::TensorInfo detectionScoresInfo({ 1, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo detectionClassesInfo({ 1, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo numDetectionInfo({ 1 }, armnn::DataType::Float32);
+
+    LayerTestResult<float, 3> detectionBoxesResult(detectionBoxesInfo);
+    detectionBoxesResult.outputExpected = MakeTensor<float, 3>(detectionBoxesInfo, expectedDetectionBoxes);
+    LayerTestResult<float, 2> detectionClassesResult(detectionClassesInfo);
+    detectionClassesResult.outputExpected = MakeTensor<float, 2>(detectionClassesInfo, expectedDetectionClasses);
+    LayerTestResult<float, 2> detectionScoresResult(detectionScoresInfo);
+    detectionScoresResult.outputExpected = MakeTensor<float, 2>(detectionScoresInfo, expectedDetectionScores);
+    LayerTestResult<float, 1> numDetectionsResult(numDetectionInfo);
+    numDetectionsResult.outputExpected = MakeTensor<float, 1>(numDetectionInfo, expectedNumDetections);
+
+    std::unique_ptr<armnn::ITensorHandle> boxedHandle = workloadFactory.CreateTensorHandle(boxEncodingsInfo);
+    std::unique_ptr<armnn::ITensorHandle> scoresHandle = workloadFactory.CreateTensorHandle(scoresInfo);
+    std::unique_ptr<armnn::ITensorHandle> anchorsHandle = workloadFactory.CreateTensorHandle(anchorsInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputBoxesHandle = workloadFactory.CreateTensorHandle(detectionBoxesInfo);
+    std::unique_ptr<armnn::ITensorHandle> classesHandle = workloadFactory.CreateTensorHandle(detectionClassesInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputScoresHandle = workloadFactory.CreateTensorHandle(detectionScoresInfo);
+    std::unique_ptr<armnn::ITensorHandle> numDetectionHandle = workloadFactory.CreateTensorHandle(numDetectionInfo);
+
+    armnn::ScopedCpuTensorHandle anchorsTensor(anchorsInfo);
+    AllocateAndCopyDataToITensorHandle(&anchorsTensor, &anchors[0][0]);
+
+    armnn::DetectionPostProcessQueueDescriptor data;
+    data.m_Parameters.m_UseRegularNms = useRegularNms;
+    data.m_Parameters.m_MaxDetections = 3;
+    data.m_Parameters.m_MaxClassesPerDetection = 1;
+    data.m_Parameters.m_DetectionsPerClass = 1;
+    data.m_Parameters.m_NmsScoreThreshold = 0.0;
+    data.m_Parameters.m_NmsIouThreshold = 0.5;
+    data.m_Parameters.m_NumClasses = 2;
+    data.m_Parameters.m_ScaleY = 10.0;
+    data.m_Parameters.m_ScaleX = 10.0;
+    data.m_Parameters.m_ScaleH = 5.0;
+    data.m_Parameters.m_ScaleW = 5.0;
+    data.m_Anchors = &anchorsTensor;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, boxEncodingsInfo, boxedHandle.get());
+    AddInputToWorkload(data, info, scoresInfo, scoresHandle.get());
+    AddOutputToWorkload(data, info, detectionBoxesInfo, outputBoxesHandle.get());
+    AddOutputToWorkload(data, info, detectionClassesInfo, classesHandle.get());
+    AddOutputToWorkload(data, info, detectionScoresInfo, outputScoresHandle.get());
+    AddOutputToWorkload(data, info, numDetectionInfo, numDetectionHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDetectionPostProcess(data, info);
+
+    boxedHandle->Allocate();
+    scoresHandle->Allocate();
+    outputBoxesHandle->Allocate();
+    classesHandle->Allocate();
+    outputScoresHandle->Allocate();
+    numDetectionHandle->Allocate();
+
+    CopyDataToITensorHandle(boxedHandle.get(), boxEncodings.origin());
+    CopyDataToITensorHandle(scoresHandle.get(), scores.origin());
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(detectionBoxesResult.output.origin(), outputBoxesHandle.get());
+    CopyDataFromITensorHandle(detectionClassesResult.output.origin(), classesHandle.get());
+    CopyDataFromITensorHandle(detectionScoresResult.output.origin(), outputScoresHandle.get());
+    CopyDataFromITensorHandle(numDetectionsResult.output.origin(), numDetectionHandle.get());
+
+    BOOST_TEST(CompareTensors(detectionBoxesResult.output, detectionBoxesResult.outputExpected));
+    BOOST_TEST(CompareTensors(detectionClassesResult.output, detectionClassesResult.outputExpected));
+    BOOST_TEST(CompareTensors(detectionScoresResult.output, detectionScoresResult.outputExpected));
+    BOOST_TEST(CompareTensors(numDetectionsResult.output, numDetectionsResult.outputExpected));
+}
+
+template<armnn::DataType QuantizedType, typename RawType = armnn::ResolveType<QuantizedType>>
+void QuantizeData(RawType* quant, const float* dequant, const armnn::TensorInfo& info)
+{
+    for (size_t i = 0; i < info.GetNumElements(); i++)
+    {
+        quant[i] = armnn::Quantize<RawType>(
+            dequant[i], info.GetQuantizationScale(), info.GetQuantizationOffset());
+    }
+}
+
+template<typename FactoryType>
+void DetectionPostProcessRegularNmsFloatTest()
+{
+    return DetectionPostProcessImpl<FactoryType, armnn::DataType::Float32>(
+        armnn::TensorInfo(TestData::s_BoxEncodingsShape, armnn::DataType::Float32),
+        armnn::TensorInfo(TestData::s_ScoresShape, armnn::DataType::Float32),
+        armnn::TensorInfo(TestData::s_AnchorsShape, armnn::DataType::Float32),
+        TestData::s_BoxEncodings,
+        TestData::s_Scores,
+        TestData::s_Anchors,
+        RegularNmsExpectedResults::s_DetectionBoxes,
+        RegularNmsExpectedResults::s_DetectionClasses,
+        RegularNmsExpectedResults::s_DetectionScores,
+        RegularNmsExpectedResults::s_NumDetections,
+        true);
+}
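Note that, unlike the .cpp test impls in this patch, these functions obtain their own factory from the FactoryType template argument, so a backend suite only needs to instantiate them. A hypothetical registration, assuming Boost.Test and the reference backend's RefWorkloadFactory:

    // Hypothetical backend test cases; RefWorkloadFactory is an assumption here.
    BOOST_AUTO_TEST_CASE(DetectionPostProcessRegularNmsFloat)
    {
        DetectionPostProcessRegularNmsFloatTest<armnn::RefWorkloadFactory>();
    }

    BOOST_AUTO_TEST_CASE(DetectionPostProcessRegularNmsUint8)
    {
        DetectionPostProcessRegularNmsQuantizedTest<armnn::RefWorkloadFactory,
                                                    armnn::DataType::QuantisedAsymm8>();
    }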
+
+template<typename FactoryType,
+         armnn::DataType QuantizedType,
+         typename RawType = armnn::ResolveType<QuantizedType>>
+void DetectionPostProcessRegularNmsQuantizedTest()
+{
+    armnn::TensorInfo boxEncodingsInfo(TestData::s_BoxEncodingsShape, QuantizedType);
+    armnn::TensorInfo scoresInfo(TestData::s_ScoresShape, QuantizedType);
+    armnn::TensorInfo anchorsInfo(TestData::s_AnchorsShape, QuantizedType);
+
+    boxEncodingsInfo.SetQuantizationScale(TestData::s_BoxEncodingsQuantData.first);
+    boxEncodingsInfo.SetQuantizationOffset(TestData::s_BoxEncodingsQuantData.second);
+
+    scoresInfo.SetQuantizationScale(TestData::s_ScoresQuantData.first);
+    scoresInfo.SetQuantizationOffset(TestData::s_ScoresQuantData.second);
+
+    anchorsInfo.SetQuantizationScale(TestData::s_AnchorsQuantData.first);
+    anchorsInfo.SetQuantizationOffset(TestData::s_AnchorsQuantData.second);
+
+    std::vector<RawType> boxEncodingsData(TestData::s_BoxEncodingsShape.GetNumElements());
+    QuantizeData<QuantizedType>(boxEncodingsData.data(),
+                                TestData::s_BoxEncodings.data(),
+                                boxEncodingsInfo);
+
+    std::vector<RawType> scoresData(TestData::s_ScoresShape.GetNumElements());
+    QuantizeData<QuantizedType>(scoresData.data(),
+                                TestData::s_Scores.data(),
+                                scoresInfo);
+
+    std::vector<RawType> anchorsData(TestData::s_AnchorsShape.GetNumElements());
+    QuantizeData<QuantizedType>(anchorsData.data(),
+                                TestData::s_Anchors.data(),
+                                anchorsInfo);
+
+    return DetectionPostProcessImpl<FactoryType, QuantizedType>(
+        boxEncodingsInfo,
+        scoresInfo,
+        anchorsInfo,
+        boxEncodingsData,
+        scoresData,
+        anchorsData,
+        RegularNmsExpectedResults::s_DetectionBoxes,
+        RegularNmsExpectedResults::s_DetectionClasses,
+        RegularNmsExpectedResults::s_DetectionScores,
+        RegularNmsExpectedResults::s_NumDetections,
+        true);
+}
+
+template<typename FactoryType>
+void DetectionPostProcessFastNmsFloatTest()
+{
+    return DetectionPostProcessImpl<FactoryType, armnn::DataType::Float32>(
+        armnn::TensorInfo(TestData::s_BoxEncodingsShape, armnn::DataType::Float32),
+        armnn::TensorInfo(TestData::s_ScoresShape, armnn::DataType::Float32),
+        armnn::TensorInfo(TestData::s_AnchorsShape, armnn::DataType::Float32),
+        TestData::s_BoxEncodings,
+        TestData::s_Scores,
+        TestData::s_Anchors,
+        FastNmsExpectedResults::s_DetectionBoxes,
+        FastNmsExpectedResults::s_DetectionClasses,
+        FastNmsExpectedResults::s_DetectionScores,
+        FastNmsExpectedResults::s_NumDetections,
+        false);
+}
+
+template<typename FactoryType,
+         armnn::DataType QuantizedType,
+         typename RawType = armnn::ResolveType<QuantizedType>>
+void DetectionPostProcessFastNmsQuantizedTest()
+{
+    armnn::TensorInfo boxEncodingsInfo(TestData::s_BoxEncodingsShape, QuantizedType);
+    armnn::TensorInfo scoresInfo(TestData::s_ScoresShape, QuantizedType);
+    armnn::TensorInfo anchorsInfo(TestData::s_AnchorsShape, QuantizedType);
+
+    boxEncodingsInfo.SetQuantizationScale(TestData::s_BoxEncodingsQuantData.first);
+    boxEncodingsInfo.SetQuantizationOffset(TestData::s_BoxEncodingsQuantData.second);
+
+    scoresInfo.SetQuantizationScale(TestData::s_ScoresQuantData.first);
+    scoresInfo.SetQuantizationOffset(TestData::s_ScoresQuantData.second);
+
+    anchorsInfo.SetQuantizationScale(TestData::s_AnchorsQuantData.first);
+    anchorsInfo.SetQuantizationOffset(TestData::s_AnchorsQuantData.second);
+
+    std::vector<RawType> boxEncodingsData(TestData::s_BoxEncodingsShape.GetNumElements());
+    QuantizeData<QuantizedType>(boxEncodingsData.data(),
+                                TestData::s_BoxEncodings.data(),
+                                boxEncodingsInfo);
+
+    std::vector<RawType> scoresData(TestData::s_ScoresShape.GetNumElements());
+    QuantizeData<QuantizedType>(scoresData.data(),
+                                TestData::s_Scores.data(),
+                                scoresInfo);
+
+    std::vector<RawType> anchorsData(TestData::s_AnchorsShape.GetNumElements());
+    QuantizeData<QuantizedType>(anchorsData.data(),
+                                TestData::s_Anchors.data(),
+                                anchorsInfo);
+
+    return DetectionPostProcessImpl<FactoryType, QuantizedType>(
+        boxEncodingsInfo,
+        scoresInfo,
+        anchorsInfo,
+        boxEncodingsData,
+        scoresData,
+        anchorsData,
+        FastNmsExpectedResults::s_DetectionBoxes,
+        FastNmsExpectedResults::s_DetectionClasses,
+        FastNmsExpectedResults::s_DetectionScores,
+        FastNmsExpectedResults::s_NumDetections,
+        false);
+}
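For the expected results above: the first three anchors decode to boxes within 0.1 of the same unit square, and their pairwise IoU (roughly 0.82) exceeds the m_NmsIouThreshold of 0.5 set in DetectionPostProcessImpl, so only the best-scoring candidate of that cluster survives. A minimal sketch of the standard IoU computation these numbers assume (not an ArmNN API):

    #include <algorithm>

    struct Box { float yMin, xMin, yMax, xMax; };

    // Intersection-over-union of two axis-aligned boxes, as used by NMS.
    float Iou(const Box& a, const Box& b)
    {
        const float w = std::max(0.0f, std::min(a.xMax, b.xMax) - std::max(a.xMin, b.xMin));
        const float h = std::max(0.0f, std::min(a.yMax, b.yMax) - std::max(a.yMin, b.yMin));
        const float intersection = w * h;
        const float areaA = (a.xMax - a.xMin) * (a.yMax - a.yMin);
        const float areaB = (b.xMax - b.xMin) * (b.yMax - b.yMin);
        return intersection / (areaA + areaB - intersection);
    }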
diff --git a/src/backends/backendsCommon/test/layerTests/FakeQuantizationTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/FakeQuantizationTestImpl.cpp
new file mode 100644
index 0000000..1ce9d2d
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/FakeQuantizationTestImpl.cpp
@@ -0,0 +1,74 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "FakeQuantizationTestImpl.hpp"
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+LayerTestResult<float, 2> FakeQuantizationTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    constexpr unsigned int width = 2;
+    constexpr unsigned int height = 3;
+
+    const armnn::TensorInfo tensorInfo({ height, width },
+        armnn::DataType::Float32);
+
+    auto input = MakeTensor<float, 2>(tensorInfo, std::vector<float>({
+       -10.0f, -5.0f,
+         0.0f,  5.0f,
+        10.0f, 10.0f
+    }));
+
+    LayerTestResult<float, 2> ret(tensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle   = workloadFactory.CreateTensorHandle(tensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle  = workloadFactory.CreateTensorHandle(tensorInfo);
+
+    armnn::FakeQuantizationQueueDescriptor data;
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(data, info, tensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, tensorInfo, outputHandle.get());
+
+    float min = -10.f;
+    float max =  10.f;
+
+    data.m_Parameters.m_Min = min;
+    data.m_Parameters.m_Max = max;
+
+    armnn::PassthroughCpuTensorHandle refHandle(tensorInfo, &ret.outputExpected[0][0]);
+    armnn::FakeQuantizationQueueDescriptor refData = data;
+    armnn::WorkloadInfo refInfo = info;
+    SetWorkloadOutput(refData, refInfo, 0, tensorInfo, &refHandle);
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateFakeQuantization(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get());
+
+    ret.outputExpected = MakeTensor<float, 2>(tensorInfo, std::vector<float>({
+        0.0f,     63.0f,
+        128.0f,   191.0f,
+        255.0f,   255.0f
+    }));
+
+    return ret;
+}
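Fake quantization snaps float values onto the 256-level grid spanned by [m_Min, m_Max] while keeping float storage, which is what turns the [-10, 10] inputs into values in [0, 255] above. A conceptual sketch, assuming a scale of (max - min)/255; the reference workload's exact zero-point and rounding choices are authoritative, which is why the expected table is taken from the workload rather than recomputed here:

    #include <algorithm>
    #include <cmath>

    // Illustrative only: snap x onto the uint8 grid over [min, max] and return
    // the grid index as a float, clamped to [0, 255].
    float FakeQuantize(float x, float min, float max)
    {
        const float scale = (max - min) / 255.0f;
        const float level = std::round((x - min) / scale);
        return std::min(255.0f, std::max(0.0f, level));
    }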
diff --git a/src/backends/backendsCommon/test/layerTests/FakeQuantizationTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/FakeQuantizationTestImpl.hpp
new file mode 100644
index 0000000..506e968
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/FakeQuantizationTestImpl.hpp
@@ -0,0 +1,15 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 2> FakeQuantizationTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/FloorTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/FloorTestImpl.cpp
new file mode 100644
index 0000000..f97d51a
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/FloorTestImpl.cpp
@@ -0,0 +1,70 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "FloorTestImpl.hpp"
+
+#include <backendsCommon/test/DataTypeUtils.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 4> SimpleFloorTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, ArmnnType);
+    inputTensorInfo.SetQuantizationScale(0.1f);
+
+    armnn::TensorInfo outputTensorInfo(inputTensorInfo);
+    outputTensorInfo.SetQuantizationScale(0.1f);
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, ConvertToDataType<ArmnnType>(
+        { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
+        1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f },
+        inputTensorInfo));
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, ConvertToDataType<ArmnnType>(
+        { -38.0f, -16.0f, -9.0f, -2.0f, -2.0f, -2.0f, -1.0f, -1.0f, 0.0f,
+        1.0f, 0.0f, 0.0f, 1.0f, 1.0f, 2.0f, 8.0f, 15.0f, 37.0f },
+        outputTensorInfo));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::FloorQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateFloor(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+//
+// Explicit template specializations
+//
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+SimpleFloorTest<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 4>
+SimpleFloorTest<armnn::DataType::QuantisedSymm16>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
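The expected values are just std::floor applied to the input list before the per-type conversion, which a few lines reproduce:

    #include <algorithm>
    #include <cmath>
    #include <vector>

    int main()
    {
        const std::vector<float> in = { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f,
                                        -0.5f, -0.4f, 0.0f, 1.0f, 0.4f, 0.5f,
                                        1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f };
        std::vector<float> out(in.size());
        std::transform(in.begin(), in.end(), out.begin(),
                       [](float v) { return std::floor(v); });
        // out now matches the outputExpected table: -38, -16, -9, -2, -2, -2, ...
    }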
diff --git a/src/backends/backendsCommon/test/layerTests/FloorTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/FloorTestImpl.hpp
new file mode 100644
index 0000000..e5baf5d
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/FloorTestImpl.hpp
@@ -0,0 +1,18 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleFloorTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/FullyConnectedTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/FullyConnectedTestImpl.cpp
new file mode 100644
index 0000000..c84b941
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/FullyConnectedTestImpl.cpp
@@ -0,0 +1,349 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "FullyConnectedTestImpl.hpp"
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+
+#include <backendsCommon/test/DataTypeUtils.hpp>
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+//
+// Implementation templates
+//
+
+template<typename T, typename B>
+LayerTestResult<T, 2> SimpleFullyConnectedTestImpl(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        armnn::TensorInfo inputTensorInfo,
+        armnn::TensorInfo outputTensorInfo,
+        armnn::TensorInfo weightsDesc,
+        armnn::TensorInfo biasesDesc,
+        boost::multi_array<T, 2>& weights,
+        boost::multi_array<B, 1>& bias,
+        boost::multi_array<T, 4>& input,
+        bool biasEnabled,
+        bool transposeWeights)
+{
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::FullyConnectedQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    armnn::ScopedCpuTensorHandle weightsTensor(weightsDesc);
+    armnn::ScopedCpuTensorHandle biasTensor(biasesDesc);
+
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, &weights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&biasTensor, &bias[0]);
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Weight = &weightsTensor;
+    data.m_Bias = &biasTensor;
+    data.m_Parameters.m_BiasEnabled = biasEnabled;
+    data.m_Parameters.m_TransposeWeightMatrix = transposeWeights;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateFullyConnected(data, info);
+    LayerTestResult<T, 2> result(outputTensorInfo);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&result.output[0][0], outputHandle.get());
+
+    return result;
+}
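The maths being exercised is y = W·x + b per batch element. A plain reference computation with hypothetical names, matching the [outputs x inputs] weight layout used when transposeWeights is true:

    #include <cstddef>
    #include <vector>

    // y[o] = sum_i W[o][i] * x[i] + b[o]
    std::vector<float> FullyConnectedRef(const std::vector<std::vector<float>>& w,
                                         const std::vector<float>& b,
                                         const std::vector<float>& x)
    {
        std::vector<float> y(w.size(), 0.0f);
        for (std::size_t o = 0; o < w.size(); ++o)
        {
            for (std::size_t i = 0; i < x.size(); ++i)
            {
                y[o] += w[o][i] * x[i];
            }
            y[o] += b[o];
        }
        return y;
    }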
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 2> FullyConnectedTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        bool biasEnabled)
+{
+    constexpr static unsigned int inputWidth = 3u;
+    constexpr static unsigned int inputHeight = 2u;
+    constexpr static unsigned int inputChannels = 1u;
+
+    constexpr static unsigned int inputSize = inputWidth * inputHeight * inputChannels;
+
+    constexpr static unsigned int outputChannels = 2u;
+
+    armnn::TensorInfo inputTensorInfo({ 1, inputChannels, inputHeight, inputWidth }, ArmnnType);
+    inputTensorInfo.SetQuantizationScale(0.1f);
+    inputTensorInfo.SetQuantizationOffset(63);
+
+    armnn::TensorInfo outputTensorInfo({ 1, outputChannels }, ArmnnType);
+    outputTensorInfo.SetQuantizationScale(5.f);
+    outputTensorInfo.SetQuantizationOffset(biasEnabled ? -50 : 10);
+
+    armnn::TensorInfo weightsDesc({ outputChannels, inputSize }, ArmnnType);
+    weightsDesc.SetQuantizationScale(0.2f);
+    weightsDesc.SetQuantizationOffset(93);
+
+    armnn::TensorInfo biasesDesc({ outputChannels }, GetBiasTypeFromWeightsType(weightsDesc.GetDataType()).value());
+    biasesDesc.SetQuantizationScale(inputTensorInfo.GetQuantizationScale() * weightsDesc.GetQuantizationScale());
+    biasesDesc.SetQuantizationOffset(0);
+
+    LayerTestResult<T, 2> result(outputTensorInfo);
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, ConvertToDataType<ArmnnType>(
+        {
+            -1.2f, 6.1f, -3.5f,
+            18.8f, -5.5f, 2.9f
+        },
+        inputTensorInfo));
+
+    auto weights = MakeTensor<T, 2>(weightsDesc, ConvertToDataType<ArmnnType>(
+        {
+            -8.4f, 20.0f, -10.4f, -8, 16.4f, -11.8f,
+            23.4f, 10.4f, -14.0f, -3.8f, -11.8f, 11.4f
+        },
+        weightsDesc));
+
+    auto bias = MakeTensor<int32_t, 1>(biasesDesc, std::vector<int32_t>{9250, 67500});
+
+    result = SimpleFullyConnectedTestImpl<T>(
+            workloadFactory,
+            memoryManager,
+            inputTensorInfo, outputTensorInfo,
+            weightsDesc, biasesDesc,
+            weights, bias, input,
+            biasEnabled, true
+    );
+
+    if (biasEnabled)
+    {
+        result.outputExpected = MakeTensor<T, 2>(outputTensorInfo,
+                                                 ConvertToDataType<ArmnnType>({80.f, 1460.f}, outputTensorInfo));
+    }
+    else
+    {
+        result.outputExpected = MakeTensor<T, 2>(outputTensorInfo,
+                                                 ConvertToDataType<ArmnnType>({-107.04f, 110.f}, outputTensorInfo));
+    }
+
+    return result;
+}
+
+//
+// ArmNN variant of the AndroidNN fully_connected_float_large test.
+//
+// Tests the fully connected layer with large values, optionally transposing weights.
+// Note this is templated for consistency, but the nature of this test makes it unlikely to be useful in Uint8 mode.
+//
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> FullyConnectedLargeTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool transposeWeights,
+    float qScale = 0.0f,
+    int32_t qOffset = 0)
+{
+    unsigned int inputWidth = 1;
+    unsigned int inputHeight = 1;
+    unsigned int inputChannels = 5;
+    unsigned int inputNum = 1;
+
+    unsigned int outputChannels = 1;
+    unsigned int outputNum = 1;
+
+    // Define the tensor descriptors.
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+    armnn::TensorInfo weightsDesc;
+    armnn::TensorInfo biasesDesc;
+
+    unsigned int inputShape[] = { inputNum, inputChannels, inputHeight, inputWidth };
+    unsigned int outputShape[] = { outputNum, outputChannels };
+    unsigned int weightsShape[] = { inputChannels, outputChannels };
+    if (transposeWeights)
+    {
+        std::swap(weightsShape[0], weightsShape[1]);
+    }
+
+    unsigned int biasShape[] = { outputChannels };
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(2, outputShape, ArmnnType);
+    weightsDesc = armnn::TensorInfo(2, weightsShape, ArmnnType);
+    biasesDesc = armnn::TensorInfo(1, biasShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    LayerTestResult<T, 2> result(outputTensorInfo);
+
+    boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            1.0f, 10.0f, 100.0f, 1000.0f, 10000.0f,
+        })
+    );
+
+    boost::multi_array<T, 2> weights = MakeTensor<T, 2>(weightsDesc,
+        QuantizedVector<T>(qScale, qOffset, {
+            2.0f, 3.0f, 4.0f, 5.0f, 6.0f
+        })
+    );
+
+    std::vector<T> biasValues({900000.f});
+    boost::multi_array<T, 1> bias = MakeTensor<T, 1>(biasesDesc, biasValues);
+
+    result = SimpleFullyConnectedTestImpl<T>(
+        workloadFactory,
+        memoryManager,
+        inputTensorInfo, outputTensorInfo,
+        weightsDesc, biasesDesc,
+        weights, bias, input,
+        true, transposeWeights
+    );
+
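+    // Expected output = dot(input, weights) + bias
+    //                 = 2*1 + 3*10 + 4*100 + 5*1000 + 6*10000 + 900000 = 965432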
+    result.outputExpected = MakeTensor<T, 2>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            965432.0f,
+        })
+    );
+
+    return result;
+}
+
+//
+// Explicit template instantiations
+//
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 2>
+FullyConnectedTest<armnn::DataType::QuantisedAsymm8>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 2>
+FullyConnectedTest<armnn::DataType::QuantisedSymm16>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled);
+
+//
+// Implementation functions
+//
+
+LayerTestResult<float, 2> FullyConnectedFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    bool transposeWeights)
+{
+    unsigned int inputWidth = 1;
+    unsigned int inputHeight = 1;
+    unsigned int inputChannels = 5;
+    unsigned int inputNum = 2;
+
+    unsigned int outputChannels = 3;
+    unsigned int outputNum = 2;
+
+    // Define the tensor descriptors.
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+    armnn::TensorInfo weightsDesc;
+    armnn::TensorInfo biasesDesc;
+
+    unsigned int inputShape[]   = { inputNum, inputChannels, inputHeight, inputWidth };
+    unsigned int outputShape[]  = { outputNum, outputChannels };
+    unsigned int weightsShape[] = { inputChannels, outputChannels };
+
+    if (transposeWeights)
+    {
+        std::swap(weightsShape[0], weightsShape[1]);
+    }
+
+    unsigned int biasShape[] = { outputChannels };
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, armnn::DataType::Float32);
+    outputTensorInfo = armnn::TensorInfo(2, outputShape, armnn::DataType::Float32);
+    weightsDesc = armnn::TensorInfo(2, weightsShape, armnn::DataType::Float32);
+    biasesDesc = armnn::TensorInfo(1, biasShape, armnn::DataType::Float32);
+
+    LayerTestResult<float, 2> result(outputTensorInfo);
+
+    boost::multi_array<float, 4> input = MakeTensor<float, 4>(inputTensorInfo, std::vector<float>(
+        {
+            1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+
+            5.0f, 4.0f, 3.0f, 2.0f, 1.0f
+        })
+    );
+
+    boost::multi_array<float, 2> weights = MakeTensor<float, 2>(weightsDesc, std::vector<float>(
+        {
+            .5f, 2.f, .5f,
+            .5f, 2.f, 1.f,
+            .5f, 2.f, 2.f,
+            .5f, 2.f, 3.f,
+            .5f, 2.f, 4.f
+        }));
+
+    if (transposeWeights)
+    {
+        weights = MakeTensor<float, 2>(weightsDesc, std::vector<float>(
+        {
+            .5f, .5f, .5f, .5f, .5f,
+            2.f, 2.f, 2.f, 2.f, 2.f,
+            .5f, 1.f, 2.f, 3.f, 4.f
+        }));
+    }
+
+
+    std::vector<float> biasValues({0.f, 0.f, 0.f});
+    if (biasEnabled)
+    {
+        biasValues =  std::vector<float>({10.f, 20.f, 30.f});
+    }
+    boost::multi_array<float, 1> bias = MakeTensor<float, 1>(biasesDesc, biasValues);
+
+    result = SimpleFullyConnectedTestImpl<float>(
+        workloadFactory,
+        memoryManager,
+        inputTensorInfo, outputTensorInfo,
+        weightsDesc, biasesDesc,
+        weights, bias, input,
+        biasEnabled, transposeWeights
+    );
+
+    result.outputExpected = MakeTensor<float, 2>(outputTensorInfo, std::vector<float>(
+        {
+            0.5f + 1.0f + 1.5f + 2.0f + 2.5f + biasValues[0],
+            2.0f + 4.0f + 6.0f + 8.0f + 10.f + biasValues[1],
+            0.5f + 2.0f + 6.0f + 12.f + 20.f + biasValues[2],
+
+            2.5f + 2.0f + 1.5f + 1.0f + 0.5f + biasValues[0],
+            10.0f + 8.0f + 6.0f + 4.0f + 2.f + biasValues[1],
+            2.5f + 4.0f + 6.0f + 6.f + 4.f   + biasValues[2]
+        })
+    );
+
+    return result;
+}
+
+LayerTestResult<float, 2> FullyConnectedLargeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool transposeWeights)
+{
+    return FullyConnectedLargeTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, transposeWeights);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/FullyConnectedTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/FullyConnectedTestImpl.hpp
new file mode 100644
index 0000000..8a2463c
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/FullyConnectedTestImpl.hpp
@@ -0,0 +1,30 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> FullyConnectedTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled);
+
+LayerTestResult<float, 2> FullyConnectedFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    bool transposeWeights);
+
+LayerTestResult<float, 2> FullyConnectedLargeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool transposeWeights);
diff --git a/src/backends/backendsCommon/test/layerTests/GatherTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/GatherTestImpl.cpp
new file mode 100644
index 0000000..0118f54
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/GatherTestImpl.cpp
@@ -0,0 +1,251 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GatherTestImpl.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
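+// Builds and runs a Gather workload and checks the result: the output is
+// expected to contain the slices of params selected, along dimension 0, by
+// the given indices.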
+template <armnn::DataType ArmnnType,
+          typename T = armnn::ResolveType<ArmnnType>,
+          size_t ParamsDim,
+          size_t IndicesDim,
+          size_t OutputDim>
+LayerTestResult<T, OutputDim> GatherTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorInfo& paramsInfo,
+    const armnn::TensorInfo& indicesInfo,
+    const armnn::TensorInfo& outputInfo,
+    const std::vector<T>& paramsData,
+    const std::vector<int32_t>& indicesData,
+    const std::vector<T>& outputData)
+{
+    auto params  = MakeTensor<T, ParamsDim>(paramsInfo, paramsData);
+    auto indices = MakeTensor<int32_t, IndicesDim>(indicesInfo, indicesData);
+
+    LayerTestResult<T, OutputDim> result(outputInfo);
+    result.outputExpected = MakeTensor<T, OutputDim>(outputInfo, outputData);
+
+    std::unique_ptr<armnn::ITensorHandle> paramsHandle = workloadFactory.CreateTensorHandle(paramsInfo);
+    std::unique_ptr<armnn::ITensorHandle> indicesHandle = workloadFactory.CreateTensorHandle(indicesInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputInfo);
+
+    armnn::GatherQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data,  info, paramsInfo, paramsHandle.get());
+    AddInputToWorkload(data, info, indicesInfo, indicesHandle.get());
+    AddOutputToWorkload(data, info, outputInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateGather(data, info);
+
+    paramsHandle->Allocate();
+    indicesHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(paramsHandle.get(), params.origin());
+    CopyDataToITensorHandle(indicesHandle.get(), indices.origin());
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(result.output.origin(), outputHandle.get());
+
+    return result;
+}
+
+template <armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 1> Gather1dParamsTestImpl(armnn::IWorkloadFactory& workloadFactory,
+                                             const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo paramsInfo({ 8 }, ArmnnType);
+    armnn::TensorInfo indicesInfo({ 4 }, armnn::DataType::Signed32);
+    armnn::TensorInfo outputInfo({ 4 }, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        paramsInfo.SetQuantizationScale(1.0f);
+        paramsInfo.SetQuantizationOffset(1);
+        outputInfo.SetQuantizationScale(1.0f);
+        outputInfo.SetQuantizationOffset(1);
+    }
+    const std::vector<T> params         = std::vector<T>({ 1, 2, 3, 4, 5, 6, 7, 8 });
+    const std::vector<int32_t> indices  = std::vector<int32_t>({ 0, 2, 1, 5 });
+    const std::vector<T> expectedOutput = std::vector<T>({ 1, 3, 2, 6 });
+
+    return GatherTestImpl<ArmnnType, T, 1, 1, 1>(
+        workloadFactory,
+        memoryManager,
+        paramsInfo,
+        indicesInfo,
+        outputInfo,
+        params,
+        indices,
+        expectedOutput);
+}
+
+template <armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> GatherMultiDimParamsTestImpl(
+    armnn::IWorkloadFactory& workloadFactory, const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo paramsInfo({ 5, 2 }, ArmnnType);
+    armnn::TensorInfo indicesInfo({ 3 }, armnn::DataType::Signed32);
+    armnn::TensorInfo outputInfo({ 3, 2 }, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        paramsInfo.SetQuantizationScale(1.0f);
+        paramsInfo.SetQuantizationOffset(1);
+        outputInfo.SetQuantizationScale(1.0f);
+        outputInfo.SetQuantizationOffset(1);
+    }
+
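+    // The indices select rows 1, 3 and 4 of the { 5, 2 } params tensor.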
+    const std::vector<T> params         = std::vector<T>({ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 });
+    const std::vector<int32_t> indices  = std::vector<int32_t>({ 1, 3, 4 });
+    const std::vector<T> expectedOutput = std::vector<T>({ 3, 4, 7, 8, 9, 10 });
+
+    return GatherTestImpl<ArmnnType, T, 2, 1, 2>(
+        workloadFactory,
+        memoryManager,
+        paramsInfo,
+        indicesInfo,
+        outputInfo,
+        params,
+        indices,
+        expectedOutput);
+}
+
+template <armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> GatherMultiDimParamsMultiDimIndicesTestImpl(
+    armnn::IWorkloadFactory& workloadFactory, const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo paramsInfo({ 3, 2, 3}, ArmnnType);
+    armnn::TensorInfo indicesInfo({ 2, 3 }, armnn::DataType::Signed32);
+    armnn::TensorInfo outputInfo({ 2, 3, 2, 3 }, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        paramsInfo.SetQuantizationScale(1.0f);
+        paramsInfo.SetQuantizationOffset(1);
+        outputInfo.SetQuantizationScale(1.0f);
+        outputInfo.SetQuantizationOffset(1);
+    }
+
+    const std::vector<T> params =
+    {
+         1,  2,  3,
+         4,  5,  6,
+
+         7,  8,  9,
+        10, 11, 12,
+
+        13, 14, 15,
+        16, 17, 18
+    };
+
+    const std::vector<int32_t> indices = { 1, 2, 1, 2, 1, 0 };
+
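+    // Each index selects a complete { 2, 3 } slice of params, so the
+    // { 2, 3 } indices tensor produces a { 2, 3, 2, 3 } output.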
+    const std::vector<T> expectedOutput =
+    {
+         7,  8,  9,
+        10, 11, 12,
+        13, 14, 15,
+        16, 17, 18,
+         7,  8,  9,
+        10, 11, 12,
+
+        13, 14, 15,
+        16, 17, 18,
+         7,  8,  9,
+        10, 11, 12,
+         1,  2,  3,
+         4,  5,  6
+    };
+
+    return GatherTestImpl<ArmnnType, T, 3, 2, 4>(
+        workloadFactory,
+        memoryManager,
+        paramsInfo,
+        indicesInfo,
+        outputInfo,
+        params,
+        indices,
+        expectedOutput);
+}
+
+} // anonymous namespace
+
+LayerTestResult<float, 1> Gather1dParamsFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Gather1dParamsTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 1> Gather1dParamsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Gather1dParamsTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 1> Gather1dParamsInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Gather1dParamsTestImpl<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 2> GatherMultiDimParamsFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return GatherMultiDimParamsTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 2> GatherMultiDimParamsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return GatherMultiDimParamsTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 2> GatherMultiDimParamsInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return GatherMultiDimParamsTestImpl<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> GatherMultiDimParamsMultiDimIndicesFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return GatherMultiDimParamsMultiDimIndicesTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> GatherMultiDimParamsMultiDimIndicesUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return GatherMultiDimParamsMultiDimIndicesTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> GatherMultiDimParamsMultiDimIndicesInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return GatherMultiDimParamsMultiDimIndicesTestImpl<armnn::DataType::QuantisedSymm16>(
+        workloadFactory, memoryManager);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/GatherTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/GatherTestImpl.hpp
new file mode 100644
index 0000000..fd12e61
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/GatherTestImpl.hpp
@@ -0,0 +1,47 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 1> Gather1dParamsFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 1> Gather1dParamsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 1> Gather1dParamsInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> GatherMultiDimParamsFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 2> GatherMultiDimParamsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 2> GatherMultiDimParamsInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> GatherMultiDimParamsMultiDimIndicesFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> GatherMultiDimParamsMultiDimIndicesUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> GatherMultiDimParamsMultiDimIndicesInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/L2NormalizationTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/L2NormalizationTestImpl.cpp
new file mode 100644
index 0000000..5c75b6f
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/L2NormalizationTestImpl.cpp
@@ -0,0 +1,821 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "L2NormalizationTestImpl.hpp"
+
+#include <Permute.hpp>
+#include <ResolveType.hpp>
+#include <TensorUtils.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <cmath>
+#include <numeric>
+
+namespace
+{
+
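+// Builds and runs an L2 Normalization workload: every element is multiplied
+// by the reciprocal of the L2 norm of the values along the channel dimension
+// (see CalcInvL2Norm below).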
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2NormalizationTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorShape& inputOutputTensorShape,
+    float scale,
+    int32_t offset,
+    const std::vector<float>& inputValues,
+    float outScale,
+    int32_t outOffset,
+    const std::vector<float>& expectedOutputValues,
+    const armnn::DataLayout layout,
+    float epsilon = 1e-12f)
+{
+    const armnn::TensorInfo inputTensorInfo(inputOutputTensorShape, ArmnnType, scale, offset);
+    const armnn::TensorInfo outputTensorInfo(inputOutputTensorShape, ArmnnType, outScale, outOffset);
+
+    // At this point, permute the input data if the requested layout is NHWC
+    // (the test data is defined in NCHW). The PermutationVector maps source
+    // dimension i to destination dimension mappings[i]: N->0, C->3, H->1, W->2.
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    std::vector<float> inputData = inputValues;
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+    }
+
+    auto inputTensor = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(
+                                                         inputTensorInfo.GetQuantizationScale(),
+                                                         inputTensorInfo.GetQuantizationOffset(),
+                                                         inputData));
+
+    std::vector<float> expectedOutputData = expectedOutputValues;
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(expectedOutputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, expectedOutputData.data(), tmp.data(),
+                            sizeof(float));
+        expectedOutputData = tmp;
+    }
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(
+                                                               outputTensorInfo.GetQuantizationScale(),
+                                                               outputTensorInfo.GetQuantizationOffset(),
+                                                               expectedOutputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::L2NormalizationQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_Eps = epsilon;
+    descriptor.m_Parameters.m_DataLayout = layout;
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateL2Normalization(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+
+    return result;
+}
+
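+// Returns the reciprocal of the L2 norm (i.e. 1 / sqrt(sum of squares)) of the given elements.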
+float CalcInvL2Norm(std::initializer_list<float> elements)
+{
+    const float reduction = std::accumulate(elements.begin(), elements.end(), 0.0f,
+        [](float acc, float element) { return acc + element * element; });
+    return 1.0f / sqrtf(reduction);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2NormalizationEpsilonTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float scale,
+        int32_t offset,
+        float outScale,
+        int32_t outOffset,
+        const armnn::DataLayout layout,
+        float epsilon)
+{
+    // Width: 1
+    // Height: 1
+    // Channels: 3
+    // BatchSize: 1
+    unsigned int numberOfBatches = 1;
+    unsigned int numberOfChannels = 3;
+    unsigned int height = 1;
+    unsigned int width = 1;
+
+    const armnn::TensorShape inputOutputShape = armnnUtils::GetTensorShape(
+            numberOfBatches, numberOfChannels, height, width, layout);
+
+    // 0.00000001^2 + 0.00000002^2 + 0.00000003^2 < 1e-12, so the computed
+    // L2 norm is dominated by the epsilon term (see approxInvL2Norm below).
+    std::vector<float> inputValues
+    {
+        // Batch 0, Channel 0, Height (1) x Width (1)
+        0.00000001f,
+
+        // Batch 0, Channel 1, Height (1) x Width (1)
+        0.00000002f,
+
+        // Batch 0, Channel 2, Height (1) x Width (1)
+        0.00000003f,
+    };
+
+    const float approxInvL2Norm = 1.f / sqrtf(epsilon);
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Channel 0, Height (1) x Width (1)
+        0.00000001f * approxInvL2Norm,
+        0.00000002f * approxInvL2Norm,
+        0.00000003f * approxInvL2Norm,
+    };
+
+    return L2NormalizationTestImpl<ArmnnType>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        scale,
+        offset,
+        inputValues,
+        outScale,
+        outOffset,
+        expectedOutputValues,
+        layout,
+        epsilon);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2Normalization1dTestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float scale,
+        int32_t offset,
+        float outScale,
+        int32_t outOffset,
+        const armnn::DataLayout layout)
+{
+    // Width: 1
+    // Height: 1
+    // Channels: 10
+    // BatchSize: 1
+    unsigned int numberOfBatches = 1;
+    unsigned int numberOfChannels = 10;
+    unsigned int height = 1;
+    unsigned int width = 1;
+
+    const armnn::TensorShape inputOutputShape = armnnUtils::GetTensorShape(
+            numberOfBatches, numberOfChannels, height, width, layout);
+    std::vector<float> inputValues
+    {
+        // Batch 0, Channel 0, Height (1) x Width (1)
+        1.0f,
+
+        // Batch 0, Channel 1, Height (1) x Width (1)
+        2.0f,
+
+        // Batch 0, Channel 2, Height (1) x Width (1)
+        3.0f,
+
+        // Batch 0, Channel 3, Height (1) x Width (1)
+        4.0f,
+
+        // Batch 0, Channel 4, Height (1) x Width (1)
+        5.0f,
+
+        // Batch 0, Channel 5, Height (1) x Width (1)
+        6.0f,
+
+        // Batch 0, Channel 6, Height (1) x Width (1)
+        7.0f,
+
+        // Batch 0, Channel 7, Height (1) x Width (1)
+        8.0f,
+
+        // Batch 0, Channel 8, Height (1) x Width (1)
+        9.0f,
+
+        // Batch 0, Channel 9, Height (1) x Width (1)
+        10.0f
+    };
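+    // 1 / sqrt(1^2 + 2^2 + ... + 10^2) = 1 / sqrt(385) ~= 0.050964719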
+    const float approxInvL2Norm = 0.050964719f;
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Channel 0, Height (1) x Width (1)
+        1.0f * approxInvL2Norm,
+        2.0f * approxInvL2Norm,
+        3.0f * approxInvL2Norm,
+        4.0f * approxInvL2Norm,
+        5.0f * approxInvL2Norm,
+        6.0f * approxInvL2Norm,
+        7.0f * approxInvL2Norm,
+        8.0f * approxInvL2Norm,
+        9.0f * approxInvL2Norm,
+        10.0f * approxInvL2Norm
+    };
+
+    return L2NormalizationTestImpl<ArmnnType>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        scale,
+        offset,
+        inputValues,
+        outScale,
+        outOffset,
+        expectedOutputValues,
+        layout);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2Normalization2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float scale,
+    int32_t offset,
+    float outScale,
+    int32_t outOffset,
+    const armnn::DataLayout layout)
+{
+    // Width: 5
+    // Height: 1
+    // Channels: 2
+    // BatchSize: 1
+    unsigned int numberOfBatches = 1;
+    unsigned int numberOfChannels = 2;
+    unsigned int height = 1;
+    unsigned int width = 5;
+
+    const armnn::TensorShape inputOutputShape = armnnUtils::GetTensorShape(
+            numberOfBatches, numberOfChannels, height, width, layout);
+    std::vector<float> inputValues
+    {
+        // Batch 0, Channel 0, Height (1) x Width (5)
+        1.0f, 3.0f, 5.0f, 7.0f,  9.0f,
+
+        // Batch 0, Channel 1, Height (1) x Width (5)
+        2.0f, 4.0f, 6.0f, 8.0f, 10.0f
+    };
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Channel 0, Height (1) x Width (5)
+        1.0f * CalcInvL2Norm({ 1.0f,  2.0f }),
+        3.0f * CalcInvL2Norm({ 3.0f,  4.0f }),
+        5.0f * CalcInvL2Norm({ 5.0f,  6.0f }),
+        7.0f * CalcInvL2Norm({ 7.0f,  8.0f }),
+        9.0f * CalcInvL2Norm({ 9.0f, 10.0f }),
+
+        // Batch 0, Channel 1, Height (1) x Width (5)
+        2.0f * CalcInvL2Norm({ 1.0f,  2.0f }),
+        4.0f * CalcInvL2Norm({ 3.0f,  4.0f }),
+        6.0f * CalcInvL2Norm({ 5.0f,  6.0f }),
+        8.0f * CalcInvL2Norm({ 7.0f,  8.0f }),
+        10.0f * CalcInvL2Norm({ 9.0f, 10.0f })
+    };
+
+    return L2NormalizationTestImpl<ArmnnType>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        scale,
+        offset,
+        inputValues,
+        outScale,
+        outOffset,
+        expectedOutputValues,
+        layout);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2Normalization3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float scale,
+    int32_t offset,
+    float outScale,
+    int32_t outOffset,
+    const armnn::DataLayout layout)
+{
+    // Width: 3
+    // Height: 4
+    // Channels: 2
+    // BatchSize: 1
+    unsigned int numberOfBatches = 1;
+    unsigned int numberOfChannels = 2;
+    unsigned int height = 4;
+    unsigned int width = 3;
+
+    const armnn::TensorShape inputOutputShape = armnnUtils::GetTensorShape(
+            numberOfBatches, numberOfChannels, height, width, layout);
+    std::vector<float> inputValues
+    {
+        // Batch 0, Channel 0, Height (4) x Width (3)
+        119.0f,  21.0f, 150.0f,
+        149.0f,  32.0f, 179.0f,
+         15.0f, 227.0f, 141.0f,
+        147.0f, 199.0f, 220.0f,
+
+        // Batch 0, Channel 1, Height (4) x Width (3)
+        110.0f, 140.0f,  73.0f,
+        211.0f, 212.0f,  89.0f,
+         24.0f, 138.0f, 188.0f,
+        162.0f,  12.0f, 161.0f
+    };
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Channel 0, Height (4) x Width (3)
+        119.0f * CalcInvL2Norm({ 119.0f, 110.0f }),
+         21.0f * CalcInvL2Norm({  21.0f, 140.0f }),
+        150.0f * CalcInvL2Norm({ 150.0f,  73.0f }),
+        149.0f * CalcInvL2Norm({ 149.0f, 211.0f }),
+         32.0f * CalcInvL2Norm({  32.0f, 212.0f }),
+        179.0f * CalcInvL2Norm({ 179.0f,  89.0f }),
+         15.0f * CalcInvL2Norm({  15.0f,  24.0f }),
+        227.0f * CalcInvL2Norm({ 227.0f, 138.0f }),
+        141.0f * CalcInvL2Norm({ 141.0f, 188.0f }),
+        147.0f * CalcInvL2Norm({ 147.0f, 162.0f }),
+        199.0f * CalcInvL2Norm({ 199.0f,  12.0f }),
+        220.0f * CalcInvL2Norm({ 220.0f, 161.0f }),
+
+        // Batch 0, Channel 1, Height (4) x Width (3)
+        110.0f * CalcInvL2Norm({ 119.0f, 110.0f }),
+        140.0f * CalcInvL2Norm({  21.0f, 140.0f }),
+         73.0f * CalcInvL2Norm({ 150.0f,  73.0f }),
+        211.0f * CalcInvL2Norm({ 149.0f, 211.0f }),
+        212.0f * CalcInvL2Norm({  32.0f, 212.0f }),
+         89.0f * CalcInvL2Norm({ 179.0f,  89.0f }),
+         24.0f * CalcInvL2Norm({  15.0f,  24.0f }),
+        138.0f * CalcInvL2Norm({ 227.0f, 138.0f }),
+        188.0f * CalcInvL2Norm({ 141.0f, 188.0f }),
+        162.0f * CalcInvL2Norm({ 147.0f, 162.0f }),
+         12.0f * CalcInvL2Norm({ 199.0f,  12.0f }),
+        161.0f * CalcInvL2Norm({ 220.0f, 161.0f })
+    };
+
+    return L2NormalizationTestImpl<ArmnnType>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        scale,
+        offset,
+        inputValues,
+        outScale,
+        outOffset,
+        expectedOutputValues,
+        layout);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2Normalization4dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float scale,
+    int32_t offset,
+    float outScale,
+    int32_t outOffset,
+    const armnn::DataLayout layout)
+{
+    // Width: 3
+    // Height: 4
+    // Channels: 3
+    // BatchSize: 2
+    unsigned int numberOfBatches = 2;
+    unsigned int numberOfChannels = 3;
+    unsigned int height = 4;
+    unsigned int width = 3;
+
+    const armnn::TensorShape inputOutputShape = armnnUtils::GetTensorShape(
+            numberOfBatches, numberOfChannels, height, width, layout);
+    std::vector<float> inputValues
+    {
+        // Batch 0, Channel 0, Height (4) x Width (3)
+        235.0f,  46.0f, 178.0f,
+        100.0f, 123.0f,  19.0f,
+        172.0f,  74.0f, 250.0f,
+          6.0f, 195.0f,  80.0f,
+
+        // Batch 0, Channel 1, Height (4) x Width (3)
+        113.0f,  95.0f, 202.0f,
+         77.0f, 114.0f,  71.0f,
+        122.0f, 246.0f, 166.0f,
+         82.0f,  28.0f,  37.0f,
+
+        // Batch 0, Channel 2, Height (4) x Width (3)
+         56.0f, 170.0f, 162.0f,
+        194.0f,  89.0f, 254.0f,
+         12.0f, 209.0f, 200.0f,
+          1.0f,  64.0f,  54.0f,
+
+        // Batch 1, Channel 0, Height (4) x Width (3)
+         67.0f,  90.0f,  49.0f,
+          7.0f, 163.0f,  18.0f,
+         25.0f, 117.0f, 103.0f,
+        247.0f,  59.0f, 189.0f,
+
+        // Batch 1, Channel 1, Height (4) x Width (3)
+        239.0f, 104.0f, 199.0f,
+         17.0f, 124.0f, 153.0f,
+        222.0f, 217.0f,  75.0f,
+         32.0f, 126.0f,  21.0f,
+
+        // Batch 1, Channel 2, Height (4) x Width (3)
+         97.0f, 145.0f, 215.0f,
+        115.0f, 116.0f, 238.0f,
+        226.0f,  16.0f, 132.0f,
+         92.0f, 125.0f,  88.0f
+    };
+    std::vector<float> expectedOutputValues
+    {
+        // Batch 0, Channel 0, Height (4) x Width (3)
+        235.0f * CalcInvL2Norm({ 235.0f, 113.0f,  56.0f }),
+         46.0f * CalcInvL2Norm({  46.0f,  95.0f, 170.0f }),
+        178.0f * CalcInvL2Norm({ 178.0f, 202.0f, 162.0f }),
+        100.0f * CalcInvL2Norm({ 100.0f,  77.0f, 194.0f }),
+        123.0f * CalcInvL2Norm({ 123.0f, 114.0f,  89.0f }),
+         19.0f * CalcInvL2Norm({  19.0f,  71.0f, 254.0f }),
+        172.0f * CalcInvL2Norm({ 172.0f, 122.0f,  12.0f }),
+         74.0f * CalcInvL2Norm({  74.0f, 246.0f, 209.0f }),
+        250.0f * CalcInvL2Norm({ 250.0f, 166.0f, 200.0f }),
+          6.0f * CalcInvL2Norm({   6.0f,  82.0f,   1.0f }),
+        195.0f * CalcInvL2Norm({ 195.0f,  28.0f,  64.0f }),
+         80.0f * CalcInvL2Norm({  80.0f,  37.0f,  54.0f }),
+
+        // Batch 0, Channel 1, Height (4) x Width (3)
+        113.0f * CalcInvL2Norm({ 235.0f, 113.0f,  56.0f }),
+         95.0f * CalcInvL2Norm({  46.0f,  95.0f, 170.0f }),
+        202.0f * CalcInvL2Norm({ 178.0f, 202.0f, 162.0f }),
+         77.0f * CalcInvL2Norm({ 100.0f,  77.0f, 194.0f }),
+        114.0f * CalcInvL2Norm({ 123.0f, 114.0f,  89.0f }),
+         71.0f * CalcInvL2Norm({  19.0f,  71.0f, 254.0f }),
+        122.0f * CalcInvL2Norm({ 172.0f, 122.0f,  12.0f }),
+        246.0f * CalcInvL2Norm({  74.0f, 246.0f, 209.0f }),
+        166.0f * CalcInvL2Norm({ 250.0f, 166.0f, 200.0f }),
+         82.0f * CalcInvL2Norm({   6.0f,  82.0f,   1.0f }),
+         28.0f * CalcInvL2Norm({ 195.0f,  28.0f,  64.0f }),
+         37.0f * CalcInvL2Norm({  80.0f,  37.0f,  54.0f }),
+
+        // Batch 0, Channel 2, Height (4) x Width (3)
+         56.0f * CalcInvL2Norm({ 235.0f, 113.0f,  56.0f }),
+        170.0f * CalcInvL2Norm({  46.0f,  95.0f, 170.0f }),
+        162.0f * CalcInvL2Norm({ 178.0f, 202.0f, 162.0f }),
+        194.0f * CalcInvL2Norm({ 100.0f,  77.0f, 194.0f }),
+         89.0f * CalcInvL2Norm({ 123.0f, 114.0f,  89.0f }),
+        254.0f * CalcInvL2Norm({  19.0f,  71.0f, 254.0f }),
+         12.0f * CalcInvL2Norm({ 172.0f, 122.0f,  12.0f }),
+        209.0f * CalcInvL2Norm({  74.0f, 246.0f, 209.0f }),
+        200.0f * CalcInvL2Norm({ 250.0f, 166.0f, 200.0f }),
+          1.0f * CalcInvL2Norm({   6.0f,  82.0f,   1.0f }),
+         64.0f * CalcInvL2Norm({ 195.0f,  28.0f,  64.0f }),
+         54.0f * CalcInvL2Norm({  80.0f,  37.0f,  54.0f }),
+
+        // Batch 1, Channel 0, Height (4) x Width (3)
+         67.0f * CalcInvL2Norm({  67.0f, 239.0f,  97.0f }),
+         90.0f * CalcInvL2Norm({  90.0f, 104.0f, 145.0f }),
+         49.0f * CalcInvL2Norm({  49.0f, 199.0f, 215.0f }),
+          7.0f * CalcInvL2Norm({   7.0f,  17.0f, 115.0f }),
+        163.0f * CalcInvL2Norm({ 163.0f, 124.0f, 116.0f }),
+         18.0f * CalcInvL2Norm({  18.0f, 153.0f, 238.0f }),
+         25.0f * CalcInvL2Norm({  25.0f, 222.0f, 226.0f }),
+        117.0f * CalcInvL2Norm({ 117.0f, 217.0f,  16.0f }),
+        103.0f * CalcInvL2Norm({ 103.0f,  75.0f, 132.0f }),
+        247.0f * CalcInvL2Norm({ 247.0f,  32.0f,  92.0f }),
+         59.0f * CalcInvL2Norm({  59.0f, 126.0f, 125.0f }),
+        189.0f * CalcInvL2Norm({ 189.0f,  21.0f,  88.0f }),
+
+        // Batch 1, Channel 1, Height (4) x Width (3)
+        239.0f * CalcInvL2Norm({  67.0f, 239.0f,  97.0f }),
+        104.0f * CalcInvL2Norm({  90.0f, 104.0f, 145.0f }),
+        199.0f * CalcInvL2Norm({  49.0f, 199.0f, 215.0f }),
+         17.0f * CalcInvL2Norm({   7.0f,  17.0f, 115.0f }),
+        124.0f * CalcInvL2Norm({ 163.0f, 124.0f, 116.0f }),
+        153.0f * CalcInvL2Norm({  18.0f, 153.0f, 238.0f }),
+        222.0f * CalcInvL2Norm({  25.0f, 222.0f, 226.0f }),
+        217.0f * CalcInvL2Norm({ 117.0f, 217.0f,  16.0f }),
+         75.0f * CalcInvL2Norm({ 103.0f,  75.0f, 132.0f }),
+         32.0f * CalcInvL2Norm({ 247.0f,  32.0f,  92.0f }),
+        126.0f * CalcInvL2Norm({  59.0f, 126.0f, 125.0f }),
+         21.0f * CalcInvL2Norm({ 189.0f,  21.0f,  88.0f }),
+
+        // Batch 1, Channel 2, Height (4) x Width (3)
+         97.0f * CalcInvL2Norm({  67.0f, 239.0f,  97.0f }),
+        145.0f * CalcInvL2Norm({  90.0f, 104.0f, 145.0f }),
+        215.0f * CalcInvL2Norm({  49.0f, 199.0f, 215.0f }),
+        115.0f * CalcInvL2Norm({   7.0f,  17.0f, 115.0f }),
+        116.0f * CalcInvL2Norm({ 163.0f, 124.0f, 116.0f }),
+        238.0f * CalcInvL2Norm({  18.0f, 153.0f, 238.0f }),
+        226.0f * CalcInvL2Norm({  25.0f, 222.0f, 226.0f }),
+         16.0f * CalcInvL2Norm({ 117.0f, 217.0f,  16.0f }),
+        132.0f * CalcInvL2Norm({ 103.0f,  75.0f, 132.0f }),
+         92.0f * CalcInvL2Norm({ 247.0f,  32.0f,  92.0f }),
+        125.0f * CalcInvL2Norm({  59.0f, 126.0f, 125.0f }),
+         88.0f * CalcInvL2Norm({ 189.0f,  21.0f,  88.0f })
+    };
+
+    return L2NormalizationTestImpl<ArmnnType>(
+        workloadFactory,
+        memoryManager,
+        inputOutputShape,
+        scale,
+        offset,
+        inputValues,
+        outScale,
+        outOffset,
+        expectedOutputValues,
+        layout);
+}
+
+} // anonymous namespace
+
+LayerTestResult<float, 4> L2NormalizationDefaultEpsilonTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout layout)
+{
+    // Dummy descriptor to get the default value of epsilon.
+    armnn::L2NormalizationDescriptor descriptor;
+
+    return L2NormalizationEpsilonTestCommon<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        0.f,
+        0,
+        0.f,
+        0,
+        layout,
+        descriptor.m_Eps);
+}
+
+LayerTestResult<float, 4> L2NormalizationNonDefaultEpsilonTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout layout)
+{
+    return L2NormalizationEpsilonTestCommon<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        0.f,
+        0,
+        0.f,
+        0,
+        layout,
+        1e-9f);
+}
+
+LayerTestResult<float, 4> L2Normalization1dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization1dTestCommon<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        0.f,
+        0,
+        0.f,
+        0,
+        layout);
+}
+
+LayerTestResult<int16_t, 4> L2Normalization1dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization1dTestCommon<armnn::DataType::QuantisedSymm16>(
+        workloadFactory,
+        memoryManager,
+        1.f,
+        0,
+        1.f,
+        0,
+        layout);
+}
+
+LayerTestResult<uint8_t, 4> L2Normalization1dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization1dTestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory,
+        memoryManager,
+        1.f,
+        0,
+        1.f / 128,
+        128,
+        layout);
+}
+
+LayerTestResult<float, 4> L2Normalization2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization2dTestCommon<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        0.f,
+        0,
+        0.f,
+        0,
+        layout);
+}
+
+LayerTestResult<int16_t, 4> L2Normalization2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization2dTestCommon<armnn::DataType::QuantisedSymm16>(
+        workloadFactory,
+        memoryManager,
+        1.f,
+        0,
+        1.f,
+        0,
+        layout);
+}
+
+LayerTestResult<uint8_t, 4> L2Normalization2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization2dTestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory,
+        memoryManager,
+        1.f,
+        0,
+        1.f / 128,
+        128,
+        layout);
+}
+
+LayerTestResult<float, 2> L2Normalization2dShapeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
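+    // Exercises L2 Normalization with a 2D input shape; each pair of values
+    // along the last dimension is normalized together.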
+    const armnn::DataLayout layout = armnn::DataLayout::NHWC;
+    const armnn::TensorShape inputOutputTensorShape = armnn::TensorShape({ 5, 2 });
+
+    std::vector<float> inputData
+    {
+        1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f
+    };
+    std::vector<float> expectedOutputData
+    {
+        1.0f * CalcInvL2Norm({ 1.0f,  2.0f }),
+        2.0f * CalcInvL2Norm({ 1.0f,  2.0f }),
+        3.0f * CalcInvL2Norm({ 3.0f,  4.0f }),
+        4.0f * CalcInvL2Norm({ 3.0f,  4.0f }),
+        5.0f * CalcInvL2Norm({ 5.0f,  6.0f }),
+        6.0f * CalcInvL2Norm({ 5.0f,  6.0f }),
+        7.0f * CalcInvL2Norm({ 7.0f,  8.0f }),
+        8.0f * CalcInvL2Norm({ 7.0f,  8.0f }),
+        9.0f  * CalcInvL2Norm({ 9.0f, 10.0f }),
+        10.0f * CalcInvL2Norm({ 9.0f, 10.0f })
+    };
+
+    const armnn::TensorInfo inputTensorInfo(inputOutputTensorShape, armnn::DataType::Float32, 0.f, 0);
+    const armnn::TensorInfo outputTensorInfo(inputOutputTensorShape, armnn::DataType::Float32, 0.f, 0);
+
+    auto inputTensor = MakeTensor<float, 2>(inputTensorInfo, QuantizedVector<float>(
+                                                             inputTensorInfo.GetQuantizationScale(),
+                                                             inputTensorInfo.GetQuantizationOffset(),
+                                                             inputData));
+
+    LayerTestResult<float, 2> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<float, 2>(outputTensorInfo, QuantizedVector<float>(
+                                                                   outputTensorInfo.GetQuantizationScale(),
+                                                                   outputTensorInfo.GetQuantizationOffset(),
+                                                                   expectedOutputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::L2NormalizationQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_Eps = 1e-12f;
+    descriptor.m_Parameters.m_DataLayout = layout;
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateL2Normalization(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+
+    workload->PostAllocationConfigure();
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&result.output[0][0], outputHandle.get());
+
+    return result;
+}
+
+LayerTestResult<float, 4> L2Normalization3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization3dTestCommon<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        0.f,
+        0,
+        0.f,
+        0,
+        layout);
+}
+
+LayerTestResult<int16_t, 4> L2Normalization3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization3dTestCommon<armnn::DataType::QuantisedSymm16>(
+        workloadFactory,
+        memoryManager,
+        1.f,
+        0,
+        1.f,
+        0,
+        layout);
+}
+
+LayerTestResult<uint8_t, 4> L2Normalization3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization3dTestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory,
+        memoryManager,
+        1.f,
+        0,
+        1.f / 128,
+        128,
+        layout);
+}
+
+LayerTestResult<float, 4> L2Normalization4dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization4dTestCommon<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        0.f,
+        0,
+        0.f,
+        0,
+        layout);
+}
+
+LayerTestResult<int16_t, 4> L2Normalization4dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization4dTestCommon<armnn::DataType::QuantisedSymm16>(
+        workloadFactory,
+        memoryManager,
+        1.f,
+        0,
+        1.f,
+        0,
+        layout);
+}
+
+LayerTestResult<uint8_t, 4> L2Normalization4dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    return L2Normalization4dTestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory,
+        memoryManager,
+        1.f,
+        0,
+        1.f / 128,
+        128,
+        layout);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/L2NormalizationTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/L2NormalizationTestImpl.hpp
new file mode 100644
index 0000000..78c2ac1
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/L2NormalizationTestImpl.hpp
@@ -0,0 +1,87 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <armnn/Types.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 4> L2NormalizationDefaultEpsilonTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> L2NormalizationNonDefaultEpsilonTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> L2Normalization1dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<int16_t, 4> L2Normalization1dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<uint8_t, 4> L2Normalization1dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> L2Normalization2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<int16_t, 4> L2Normalization2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<uint8_t, 4> L2Normalization2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 2> L2Normalization2dShapeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> L2Normalization3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<int16_t, 4> L2Normalization3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<uint8_t, 4> L2Normalization3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<float, 4> L2Normalization4dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<int16_t, 4> L2Normalization4dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
+
+LayerTestResult<uint8_t, 4> L2Normalization4dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout);
diff --git a/src/backends/backendsCommon/test/layerTests/LstmTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/LstmTestImpl.cpp
new file mode 100644
index 0000000..c07f623
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/LstmTestImpl.cpp
@@ -0,0 +1,2072 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "LstmTestImpl.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <reference/workloads/Decoders.hpp>
+#include <reference/workloads/Encoders.hpp>
+#include <reference/workloads/LstmUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <boost/multi_array.hpp>
+
+namespace
+{
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+void LstmUtilsVectorBatchVectorAddTestImpl(
+        boost::multi_array<float, 1>& vec,
+        boost::multi_array<float, 2>& batchVec,
+        uint32_t vSize,
+        uint32_t nBatch,
+        boost::multi_array<float, 2>& expectedOutput)
+{
+    float qScale = 0.0f;
+    int32_t qOffset = 0;
+    armnn::TensorInfo tensorInfo({nBatch, vSize}, ArmnnType, qScale, qOffset);
+
+    // Make encoder and decoder
+    std::unique_ptr<armnn::Decoder<float>> vecDecoder = armnn::MakeDecoder<float>(tensorInfo, vec.data());
+    std::unique_ptr<armnn::Decoder<float>> batchVecDecoder = armnn::MakeDecoder<float>(tensorInfo, batchVec.data());
+    std::unique_ptr<armnn::Encoder<float>> batchVecEncoder = armnn::MakeEncoder<float>(tensorInfo, batchVec.data());
+
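+    // VectorBatchVectorAdd adds vec to every batch row of batchVec, writing
+    // the result back through batchVecEncoder.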
+    VectorBatchVectorAdd(*vecDecoder, vSize, *batchVecDecoder, nBatch, *batchVecEncoder);
+
+    // check shape and compare values
+    BOOST_TEST(CompareTensors(batchVec, expectedOutput));
+
+    // check if iterator is back at start position
+    batchVecEncoder->Set(1.0f);
+    BOOST_TEST(batchVec[0][0] == 1.0f);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+void LstmUtilsZeroVectorTestImpl(
+        boost::multi_array<float, 1>& input,
+        uint32_t vSize,
+        boost::multi_array<float, 1>& expectedOutput)
+{
+    float qScale = 0.0f;
+    int32_t qOffset = 0;
+
+    armnn::TensorInfo tensorInfo({vSize}, ArmnnType, qScale, qOffset);
+
+    // Make encoder for input
+    std::unique_ptr<armnn::Encoder<float>> outputEncoder = armnn::MakeEncoder<float>(tensorInfo, input.data());
+
+    // call ZeroVector
+    ZeroVector(*outputEncoder, vSize);
+
+    // check shape and compare values
+    BOOST_TEST(CompareTensors(input, expectedOutput));
+
+    // check if iterator is back at start position
+    outputEncoder->Set(1.0f);
+    BOOST_TEST(input[0] == 1.0f);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+void LstmUtilsMeanStddevNormalizationTestImpl(
+        boost::multi_array<float, 2>& input,
+        uint32_t vSize,
+        uint32_t nBatch,
+        boost::multi_array<float, 2>& expectedOutput)
+{
+    float qScale = 0.0f;
+    int32_t qOffset = 0;
+    armnn::TensorInfo tensorInfo({nBatch, vSize}, ArmnnType, qScale, qOffset);
+
+    // Make encoder and decoder for input
+    std::unique_ptr<armnn::Decoder<float>> inputDecoder = armnn::MakeDecoder<float>(tensorInfo, input.data());
+    std::unique_ptr<armnn::Encoder<float>> outputEncoder = armnn::MakeEncoder<float>(tensorInfo, input.data());
+
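+    // MeanStddevNormalization normalizes each batch row to zero mean and unit
+    // standard deviation, using the given epsilon for numerical stability.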
+    MeanStddevNormalization(*inputDecoder, *outputEncoder, vSize, nBatch, 1e-8f);
+
+    // check shape and compare values
+    BOOST_TEST(CompareTensors(input, expectedOutput));
+
+    // check if iterator is back at start position
+    outputEncoder->Set(1.0f);
+    BOOST_TEST(input[0][0] == 1.0f);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+void LstmUtilsVectorBatchVectorCwiseProductTestImpl(
+        boost::multi_array<float, 1>& vec,
+        boost::multi_array<float, 2>& batchVec,
+        uint32_t vSize,
+        uint32_t nBatch,
+        boost::multi_array<float, 2>& expectedOutput)
+{
+    float qScale = 0.0f;
+    int32_t qOffset = 0;
+    armnn::TensorInfo tensorInfo({nBatch, vSize}, ArmnnType, qScale, qOffset);
+
+    // Make encoder and decoder
+    std::unique_ptr<armnn::Decoder<float>> vecDecoder = armnn::MakeDecoder<float>(tensorInfo, vec.data());
+    std::unique_ptr<armnn::Decoder<float>> batchVecDecoder = armnn::MakeDecoder<float>(tensorInfo, batchVec.data());
+    std::unique_ptr<armnn::Encoder<float>> batchVecEncoder = armnn::MakeEncoder<float>(tensorInfo, batchVec.data());
+
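+    // VectorBatchVectorCwiseProduct multiplies every batch row of batchVec by
+    // vec element-wise, writing the result back through batchVecEncoder.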
+    VectorBatchVectorCwiseProduct(*vecDecoder, vSize, *batchVecDecoder, nBatch, *batchVecEncoder);
+
+    // check shape and compare values
+    BOOST_TEST(CompareTensors(batchVec, expectedOutput));
+
+    // check if iterator is back at start position
+    batchVecEncoder->Set(1.0f);
+    BOOST_TEST(batchVec[0][0] == 1.0f);
+}
+
+// Lstm Layer tests:
+// *********************************** //
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2>
+LstmNoCifgNoPeepholeNoProjectionTestImpl(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const boost::multi_array<T, 2>& input,
+        const boost::multi_array<T, 2>& outputExpected,
+        float qScale = 0.0f,
+        int32_t qOffset = 0,
+        armnn::DataType constantDataType = armnn::DataType::Float32)
+{
+    unsigned int batchSize = boost::numeric_cast<unsigned int>(input.shape()[0]);
+    unsigned int inputSize = boost::numeric_cast<unsigned int>(input.shape()[1]);
+    unsigned int outputSize = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
+    // cellSize and outputSize have the same size when there is no projection.
+    unsigned int numUnits = outputSize;
+
+    armnn::TensorInfo inputTensorInfo({batchSize , inputSize}, ArmnnType,  qScale, qOffset );
+    armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, ArmnnType, qScale, qOffset);
+
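+    // With CIFG disabled the scratch buffer holds data for all four gates,
+    // hence the numUnits * 4 width.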
+    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 4}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 2> ret(outputTensorInfo);
+
+    std::vector<float> inputVector;
+    inputVector.assign(input.data(), input.data() + (batchSize * inputSize));
+    auto inputTensor = MakeTensor<float, 2>(inputTensorInfo, inputVector);
+
+    std::vector<float> cellStateInVector(batchSize * numUnits, 0.f);
+    auto cellStateInTensor = MakeTensor<float, 2>(cellStateInTensorInfo, cellStateInVector);
+
+    std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
+    auto outputStateInTensor = MakeTensor<float, 2>(outputStateInTensorInfo, outputStateInVector);
+
+    std::vector<float> scratchBufferVector(batchSize * numUnits * 4, 0.f);
+    auto scratchBufferTensor = MakeTensor<float, 2>(scratchBufferTensorInfo, scratchBufferVector);
+
+    std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
+    auto outputStateOutTensor = MakeTensor<float, 2>(outputStateOutTensorInfo, outputStateOutVector);
+
+    std::vector<float> cellStateOutVector(batchSize * numUnits, 0.f);
+    auto cellStateOutTensor = MakeTensor<float, 2>(cellStateOutTensorInfo, cellStateOutVector);
+
+    std::vector<float> outputVector;
+    outputVector.assign(outputExpected.data(), outputExpected.data() + (batchSize * outputSize));
+    ret.outputExpected = MakeTensor<float, 2>(outputTensorInfo, outputVector);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> cellStateInHandle =
+            workloadFactory.CreateTensorHandle(cellStateInTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputStateInHandle =
+            workloadFactory.CreateTensorHandle(outputStateInTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> scratchHandle = workloadFactory.CreateTensorHandle(scratchBufferTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputStateOutHandle =
+            workloadFactory.CreateTensorHandle(outputStateOutTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle =
+            workloadFactory.CreateTensorHandle(cellStateOutTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::LstmQueueDescriptor data;
+    armnn::WorkloadInfo info;
+
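+    // The LSTM workload takes three inputs (input, output state in, cell state in)
+    // and produces four outputs (scratch buffer, output state out, cell state out, output).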
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get());
+    AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get());
+
+    AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchHandle.get());
+    AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get());
+    AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
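+    // Constant tensor shapes; the numeric suffix is the total element count
+    // (this test's hard-coded weight data assumes numUnits == 4 and inputSize == 2).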
+    armnn::TensorInfo tensorInfo4({numUnits}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfo8({numUnits, 2}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfo16({numUnits, 4}, constantDataType, qScale, qOffset);
+
+    auto inputToInputWeights = MakeTensor<float, 2>(tensorInfo8, {-0.45018822f, -0.02338299f, -0.0870589f,
+                                                                  -0.34550029f, 0.04266912f, -0.15680569f,
+                                                                  -0.34856534f, 0.43890524f});
+
+    auto inputToForgetWeights = MakeTensor<float, 2>(tensorInfo8, {0.09701663f, 0.20334584f, -0.50592935f,
+                                                                   -0.31343272f, -0.40032279f, 0.44781327f,
+                                                                   0.01387155f, -0.35593212f});
+
+    auto inputToCellWeights = MakeTensor<float, 2>(tensorInfo8, {-0.50013041f, 0.1370284f, 0.11810488f, 0.2013163f,
+                                                                 -0.20583314f, 0.44344562f, 0.22077113f,
+                                                                 -0.29909778f});
+
+    auto inputToOutputWeights = MakeTensor<float, 2>(tensorInfo8, {-0.25065863f, -0.28290087f, 0.04613829f,
+                                                                   0.40525138f, 0.44272184f, 0.03897077f,
+                                                                   -0.1556896f, 0.19487578f});
+
+    auto recurrentToInputWeights = MakeTensor<float, 2>(tensorInfo16, {-0.0063535f, -0.2042388f, 0.31454784f,
+                                                                       -0.35746509f, 0.28902304f, 0.08183324f,
+                                                                       -0.16555229f, 0.02286911f, -0.13566875f,
+                                                                       0.03034258f, 0.48091322f, -0.12528998f,
+                                                                       0.24077177f, -0.51332325f, -0.33502164f,
+                                                                       0.10629296f});
+
+    auto recurrentToForgetWeights = MakeTensor<float, 2>(tensorInfo16, {-0.48684245f, -0.06655136f, 0.42224967f,
+                                                                        0.2112639f, 0.27654213f, 0.20864892f,
+                                                                        -0.07646349f, 0.45877004f, 0.00141793f,
+                                                                        -0.14609534f, 0.36447752f, 0.09196436f,
+                                                                        0.28053468f, 0.01560611f, -0.20127171f,
+                                                                        -0.01140004f});
+
+    auto recurrentToCellWeights = MakeTensor<float, 2>(tensorInfo16, {-0.3407414f, 0.24443203f, -0.2078532f,
+                                                                      0.26320225f, 0.05695659f, -0.00123841f,
+                                                                      -0.4744786f, -0.35869038f, -0.06418842f,
+                                                                      -0.13502428f, -0.501764f, 0.22830659f,
+                                                                      -0.46367589f, 0.26016325f, -0.03894562f,
+                                                                      -0.16368064f});
+
+    auto recurrentToOutputWeights = MakeTensor<float, 2>(tensorInfo16, {0.43385774f, -0.17194885f, 0.2718237f,
+                                                                        0.09215671f, 0.24107647f, -0.39835793f,
+                                                                        0.18212086f, 0.01301402f, 0.48572797f,
+                                                                        -0.50656658f, 0.20047462f, -0.20607421f,
+                                                                        -0.51818722f, -0.15390486f, 0.0468148f,
+                                                                        0.39922136f});
+
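+    // Peephole connections are disabled in this test, so the cell-to-input weights
+    // are all zeros and are never attached to the descriptor below.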
+    auto cellToInputWeights = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.});
+
+    auto inputGateBias = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.});
+
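+    // A forget gate bias of 1.0 keeps the forget gate initially open,
+    // a common LSTM initialization.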
+    auto forgetGateBias = MakeTensor<float, 1>(tensorInfo4, {1., 1., 1., 1.});
+
+    auto cellBias = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.});
+
+    auto outputGateBias = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.});
+
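+    // The descriptor stores raw pointers to these handles, so they must stay
+    // alive until the workload has executed.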
+    armnn::ScopedCpuTensorHandle inputToInputWeightsTensor(tensorInfo8);
+    armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo8);
+    armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo8);
+    armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo8);
+    armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo16);
+    armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo16);
+    armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo16);
+    armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo16);
+    armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle inputGateBiasTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfo4);
+
+    AllocateAndCopyDataToITensorHandle(&inputToInputWeightsTensor, &inputToInputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToInputWeightsTensor, &recurrentToInputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&cellToInputWeightsTensor, &cellToInputWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&inputGateBiasTensor, &inputGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]);
+    AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]);
+
+    data.m_InputToInputWeights = &inputToInputWeightsTensor;
+    data.m_InputToForgetWeights = &inputToForgetWeightsTensor;
+    data.m_InputToCellWeights = &inputToCellWeightsTensor;
+    data.m_InputToOutputWeights = &inputToOutputWeightsTensor;
+    data.m_RecurrentToInputWeights = &recurrentToInputWeightsTensor;
+    data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor;
+    data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor;
+    data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor;
+    data.m_InputGateBias = &inputGateBiasTensor;
+    data.m_ForgetGateBias = &forgetGateBiasTensor;
+    data.m_CellBias = &cellBiasTensor;
+    data.m_OutputGateBias = &outputGateBiasTensor;
+
+    // Flags to set test configuration: activation function 4 selects tanh
+    data.m_Parameters.m_ActivationFunc = 4;
+    data.m_Parameters.m_CifgEnabled = false;
+    data.m_Parameters.m_PeepholeEnabled = false;
+    data.m_Parameters.m_ProjectionEnabled = false;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateLstm(data, info);
+    inputHandle->Allocate();
+    outputStateInHandle->Allocate();
+    cellStateInHandle->Allocate();
+
+    scratchHandle->Allocate();
+    outputStateOutHandle->Allocate();
+    cellStateOutHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+    CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]);
+    CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2>
+LstmLayerNoCifgWithPeepholeWithProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory,
+                                                  const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                                  const boost::multi_array<T, 2>& input,
+                                                  const boost::multi_array<T, 2>& outputExpected,
+                                                  float qScale = 0.0f,
+                                                  int32_t qOffset = 0,
+                                                  armnn::DataType constantDataType = armnn::DataType::Float32)
+{
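+    // Dimensions are fixed here because the constant weight and bias data
+    // below is hard-coded for this network size.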
+    unsigned int batchSize = 2;
+    unsigned int outputSize = 16;
+    unsigned int inputSize = 5;
+    unsigned int numUnits = 20;
+
+    armnn::TensorInfo inputTensorInfo({batchSize, inputSize}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo cellStateInTensorInfo({batchSize, numUnits}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+
+    // Scratch buffer size without CIFG [batchSize, numUnits * 4]
+    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 4}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 2> ret(outputTensorInfo);
+
+    std::vector<float> inputVector;
+    inputVector.assign(input.data(), input.data() + (batchSize * inputSize));
+    auto inputTensor = MakeTensor<float, 2>(inputTensorInfo, inputVector);
+
+    std::vector<float> cellStateInVector(batchSize * numUnits, 0.f);
+    auto cellStateInTensor = MakeTensor<float, 2>(cellStateInTensorInfo, cellStateInVector);
+
+    std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
+    auto outputStateInTensor = MakeTensor<float, 2>(outputStateInTensorInfo, outputStateInVector);
+
+    std::vector<float> scratchBufferVector(batchSize * numUnits * 4, 0.f);
+    auto scratchBufferTensor = MakeTensor<float, 2>(scratchBufferTensorInfo, scratchBufferVector);
+
+    std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
+    auto outputStateOutTensor = MakeTensor<float, 2>(outputStateOutTensorInfo, outputStateOutVector);
+
+    std::vector<float> cellStateOutVector(batchSize * numUnits, 0.f);
+    auto cellStateOutTensor = MakeTensor<float, 2>(cellStateOutTensorInfo, cellStateOutVector);
+
+    std::vector<float> outputVector;
+    outputVector.assign(outputExpected.data(), outputExpected.data() + (batchSize * outputSize));
+    ret.outputExpected = MakeTensor<float, 2>(outputTensorInfo, outputVector);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> cellStateInHandle =
+            workloadFactory.CreateTensorHandle(cellStateInTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputStateInHandle =
+            workloadFactory.CreateTensorHandle(outputStateInTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> scratchHandle = workloadFactory.CreateTensorHandle(scratchBufferTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputStateOutHandle =
+            workloadFactory.CreateTensorHandle(outputStateOutTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle =
+            workloadFactory.CreateTensorHandle(cellStateOutTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::LstmQueueDescriptor data;
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get());
+    AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get());
+
+    AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchHandle.get());
+    AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get());
+    AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
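+    // Constant tensor shapes; the suffix encodes the dimensions,
+    // e.g. tensorInfo20x5 is [numUnits, inputSize].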
+    armnn::TensorInfo tensorInfo16({outputSize}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfo20({numUnits}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfo20x5({numUnits, inputSize}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfo20x16({numUnits, outputSize}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfo16x20({outputSize, numUnits}, constantDataType, qScale, qOffset);
+
+    auto inputToInputWeights =
+            MakeTensor<float, 2>(tensorInfo20x5, {0.021393683f,0.06124551f,  0.046905167f,-0.014657677f,-0.03149463f,
+                                                  0.09171803f, 0.14647801f,0.10797193f,   -0.0057968358f,0.0019193048f,
+                                                  -0.2726754f, 0.10154029f, -0.018539885f, 0.080349885f, -0.10262385f,
+                                                  -0.022599787f,-0.09121155f, -0.008675967f, -0.045206103f,-0.0821282f,
+                                                  -0.008045952f,0.015478081f, 0.055217247f,  0.038719587f, 0.044153627f,
+                                                  -0.06453243f,0.05031825f, -0.046935108f, -0.008164439f, 0.014574226f,
+                                                  -0.1671009f,   -0.15519552f, -0.16819797f,-0.13971269f,-0.11953059f,
+                                                  0.25005487f, -0.22790983f, 0.009855087f,  -0.028140958f, -0.11200698f,
+                                                  0.11295408f, -0.0035217577f, 0.054485075f,  0.05184695f, 0.064711206f,
+                                                  0.10989193f,   0.11674786f,  0.03490607f, 0.07727357f, 0.11390585f,
+                                                  -0.1863375f,  -0.1034451f, -0.13945189f, -0.049401227f, -0.18767063f,
+                                                  0.042483903f, 0.14233552f, 0.13832581f, 0.18350165f,    0.14545603f,
+                                                  -0.028545704f,0.024939531f,0.050929718f,0.0076203286f,-0.0029723682f,
+                                                  -0.042484224f, -0.11827596f, -0.09171104f,  -0.10808628f,-0.16327988f,
+                                                  -0.2273378f,   -0.0993647f, -0.017155107f,0.0023917493f,0.049272764f,
+                                                  0.0038534778f, 0.054764505f,   0.089753784f, 0.06947234f, 0.08014476f,
+                                                  -0.04544234f, -0.0497073f,-0.07135631f,  -0.048929106f,-0.004042012f,
+                                                  -0.009284026f, 0.018042054f, 0.0036860977f,-0.07427302f, -0.11434604f,
+                                                  -0.018995456f, 0.031487543f, 0.012834908f,0.019977754f,0.044256654f,
+                                                  -0.39292613f,  -0.18519334f, -0.11651281f,-0.06809892f, 0.011373677f
+            });
+
+    auto inputToForgetWeights =
+            MakeTensor<float, 2>(tensorInfo20x5, {-0.0018401089f, -0.004852237f,0.03698424f, 0.014181704f,0.028273236f,
+                                                   -0.016726194f, -0.05249759f,-0.10204261f, 0.00861066f,-0.040979505f,
+                                                   -0.009899187f,0.01923892f,-0.028177269f, -0.08535103f,-0.14585495f,
+                                                   0.10662567f,-0.01909731f,-0.017883534f,-0.0047269356f,-0.045103323f,
+                                                   0.0030784295f,0.076784775f,0.07463696f, 0.094531395f,0.0814421f,
+                                                   -0.12257899f, -0.033945758f,-0.031303465f, 0.045630626f,0.06843887f,
+                                                   -0.13492945f, -0.012480007f,-0.0811829f, -0.07224499f,-0.09628791f,
+                                                   0.045100946f,0.0012300825f, 0.013964662f, 0.099372394f,0.02543059f,
+                                                   0.06958324f,    0.034257296f, 0.0482646f, 0.06267997f,0.052625068f,
+                                                   0.12784666f,    0.07077897f,  0.025725935f, 0.04165009f,0.07241905f,
+                                                   0.018668644f, -0.037377294f,-0.06277783f,-0.08833636f,-0.040120605f,
+                                                   -0.011405586f,-0.007808335f,-0.010301386f,-0.005102167f,0.027717464f,
+                                                   0.05483423f, 0.11449111f, 0.11289652f,0.10939839f, 0.13396506f,
+                                                   -0.08402166f,-0.01901462f,  -0.044678304f,-0.07720565f,0.014350063f,
+                                                   -0.11757958f, -0.0652038f, -0.08185733f,-0.076754324f,-0.092614375f,
+                                                   0.10405491f, 0.052960336f, 0.035755895f,0.035839386f,-0.012540553f,
+                                                   0.036881298f,   0.02913376f,  0.03420159f,0.05448447f,-0.054523353f,
+                                                   0.02582715f, 0.02327355f, -0.011857179f,-0.0011980024f,-0.034641717f,
+                                                   -0.026125094f,-0.17582615f,-0.15923657f,-0.27486774f,-0.0006143371f,
+                                                   0.0001771948f,  -8.470171e-05f, 0.02651807f,0.045790765f,0.06956496f
+            });
+
+    auto inputToCellWeights =
+            MakeTensor<float, 2>(tensorInfo20x5, {-0.04580283f,   -0.09549462f,   -0.032418985f,  -0.06454633f,
+                                                  -0.043528453f,  0.043018587f,   -0.049152344f,  -0.12418144f,
+                                                  -0.078985475f,  -0.07596889f,   0.019484362f,   -0.11434962f,
+                                                  -0.0074034138f, -0.06314844f,   -0.092981495f,  0.0062155537f,
+                                                  -0.025034338f,  -0.0028890965f, 0.048929527f,   0.06235075f,
+                                                  0.10665918f,    -0.032036792f,  -0.08505916f,   -0.10843358f,
+                                                  -0.13002433f,   -0.036816437f,  -0.02130134f,   -0.016518239f,
+                                                  0.0047691227f,  -0.0025825808f, 0.066017866f,   0.029991534f,
+                                                  -0.10652836f,   -0.1037554f,    -0.13056071f,   -0.03266643f,
+                                                  -0.033702414f,  -0.006473424f,  -0.04611692f,   0.014419339f,
+                                                  -0.025174323f,  0.0396852f,     0.081777506f,   0.06157468f,
+                                                  0.10210095f,    -0.009658194f,  0.046511717f,   0.03603906f,
+                                                  0.0069369148f,  0.015960095f,   -0.06507666f,   0.09551598f,
+                                                  0.053568836f,   0.06408714f,    0.12835667f,    -0.008714329f,
+                                                  -0.20211966f,   -0.12093674f,   0.029450472f,   0.2849013f,
+                                                  -0.029227901f,  0.1164364f,     -0.08560263f,   0.09941786f,
+                                                  -0.036999565f,  -0.028842626f,  -0.0033637602f, -0.017012902f,
+                                                  -0.09720865f,   -0.11193351f,   -0.029155117f,  -0.017936034f,
+                                                  -0.009768936f,  -0.04223324f,   -0.036159635f,  0.06505112f,
+                                                  -0.021742892f,  -0.023377212f,  -0.07221364f,   -0.06430552f,
+                                                  0.05453865f,    0.091149814f,   0.06387331f,    0.007518393f,
+                                                  0.055960953f,   0.069779344f,   0.046411168f,   0.10509911f,
+                                                  0.07463894f,    0.0075130584f,  0.012850982f,   0.04555431f,
+                                                  0.056955688f,   0.06555285f,    0.050801456f,   -0.009862683f,
+                                                  0.00826772f,    -0.026555609f,  -0.0073611983f, -0.0014897042f
+            });
+
+    auto inputToOutputWeights =
+            MakeTensor<float, 2>(tensorInfo20x5, {-0.0998932f,   -0.07201956f, -0.052803773f,-0.15629593f,-0.15001918f,
+                                                  -0.07650751f,0.02359855f, -0.075155355f, -0.08037709f,  -0.15093534f,
+                                                  0.029517552f, -0.04751393f, 0.010350531f,-0.02664851f, -0.016839722f,
+                                                  -0.023121163f, 0.0077019283f, 0.012851257f, -0.05040649f,-0.0129761f,
+                                                  -0.021737747f,-0.038305793f,-0.06870586f, -0.01481247f,-0.001285394f,
+                                                  0.10124236f,  0.083122835f, 0.053313006f,-0.062235646f,-0.075637154f,
+                                                  -0.027833903f, 0.029774971f,  0.1130802f, 0.09218906f, 0.09506135f,
+                                                  -0.086665764f,-0.037162706f,-0.038880914f,-0.035832845f,-0.014481564f,
+                                                  -0.09825003f,-0.12048569f,-0.097665586f,-0.05287633f, -0.0964047f,
+                                                  -0.11366429f,  0.035777505f,  0.13568819f, 0.052451383f,0.050649304f,
+                                                  0.05798951f, -0.021852335f,-0.099848844f,0.014740475f,-0.078897946f,
+                                                  0.04974699f, 0.014160473f,  0.06973932f,    0.04964942f, 0.033364646f,
+                                                  0.08190124f,   0.025535367f, 0.050893165f, 0.048514254f,0.06945813f,
+                                                  -0.078907564f,-0.06707616f,  -0.11844508f, -0.09986688f,-0.07509403f,
+                                                  0.06263226f,   0.14925587f,   0.20188436f, 0.12098451f,0.14639415f,
+                                                  0.0015017595f, -0.014267382f, -0.03417257f,0.012711468f,0.0028300495f,
+                                                  -0.024758482f, -0.05098548f,-0.0821182f, 0.014225672f,  0.021544158f,
+                                                  0.08949725f,  0.07505268f, -0.0020780868f, 0.04908258f,0.06476295f,
+                                                  -0.022907063f,0.027562456f,0.040185735f, 0.019567577f,-0.015598739f,
+                                                  -0.049097303f, -0.017121866f, -0.083368234f,-0.02332002f,-0.0840956f
+            });
+
+    auto inputGateBias =
+            MakeTensor<float, 1>(tensorInfo20, {0.02234832f,  0.14757581f,   0.18176508f,  0.10380666f,  0.053110216f,
+                                                -0.06928846f, -0.13942584f,  -0.11816189f, 0.19483899f,  0.03652339f,
+                                                -0.10250295f, 0.036714908f,  -0.18426876f, 0.036065217f, 0.21810818f,
+                                                0.02383196f,  -0.043370757f, 0.08690144f,  -0.04444982f, 0.00030581196f
+            });
+
+    auto forgetGateBias =
+            MakeTensor<float, 1>(tensorInfo20, {0.035185695f, -0.042891346f, -0.03032477f, 0.23027696f,
+                                                0.11098921f,  0.15378423f,   0.09263801f,  0.09790885f,
+                                                0.09508917f,  0.061199076f,  0.07665568f,  -0.015443159f,
+                                                -0.03499149f, 0.046190713f,  0.08895977f,  0.10899629f,
+                                                0.40694186f,  0.06030037f,   0.012413437f, -0.06108739f
+            });
+
+    auto cellBias =
+            MakeTensor<float, 1>(tensorInfo20, {-0.024379363f, 0.0055531194f, 0.23377132f,   0.033463873f,
+                                                -0.1483596f,   -0.10639995f,  -0.091433935f, 0.058573797f,
+                                                -0.06809782f,  -0.07889636f,  -0.043246906f, -0.09829136f,
+                                                -0.4279842f,   0.034901652f,  0.18797937f,   0.0075234566f,
+                                                0.016178843f,  0.1749513f,    0.13975595f,   0.92058027f
+            });
+
+    auto outputGateBias =
+            MakeTensor<float, 1>(tensorInfo20, {0.046159424f,  -0.0012809046f, 0.03563469f, 0.12648113f, 0.027195795f,
+                                                0.35373217f,   -0.018957434f,  0.008907322f, -0.0762701f, 0.12018895f,
+                                                0.04216877f,   0.0022856654f,  0.040952638f,  0.3147856f,  0.08225149f,
+                                                -0.057416286f, -0.14995944f,   -0.008040261f, 0.13208859f, 0.029760877f
+            });
+
+    auto recurrentToInputWeights =
+            MakeTensor<float, 2>(tensorInfo20x16, {-0.001374326f,   -0.078856036f,   0.10672688f,    0.029162422f,
+                                                   -0.11585556f,    0.02557986f,     -0.13446963f,   -0.035785314f,
+                                                   -0.01244275f,    0.025961924f,    -0.02337298f,   -0.044228926f,
+                                                   -0.055839065f,   -0.046598054f,   -0.010546039f,  -0.06900766f,
+                                                   0.027239809f,    0.022582639f,    -0.013296484f,  -0.05459212f,
+                                                   0.08981f,        -0.045407712f,   0.08682226f,    -0.06867011f,
+                                                   -0.14390695f,    -0.02916037f,    0.000996957f,   0.091420636f,
+                                                   0.14283475f,     -0.07390571f,    -0.06402044f,   0.062524505f,
+                                                   -0.093129106f,   0.04860203f,     -0.08364217f,   -0.08119002f,
+                                                   0.009352075f,    0.22920375f,     0.0016303885f,  0.11583097f,
+                                                   -0.13732095f,    0.012405723f,    -0.07551853f,   0.06343048f,
+                                                   0.12162708f,     -0.031923793f,   -0.014335606f,  0.01790974f,
+                                                   -0.10650317f,    -0.0724401f,     0.08554849f,    -0.05727212f,
+                                                   0.06556731f,     -0.042729504f,   -0.043227166f,  0.011683251f,
+                                                   -0.013082158f,   -0.029302018f,   -0.010899579f,  -0.062036745f,
+                                                   -0.022509435f,   -0.00964907f,    -0.01567329f,   0.04260106f,
+                                                   -0.07787477f,    -0.11576462f,    0.017356863f,   0.048673786f,
+                                                   -0.017577527f,   -0.05527947f,    -0.082487635f,  -0.040137455f,
+                                                   -0.10820036f,    -0.04666372f,    0.022746278f,   -0.07851417f,
+                                                   0.01068115f,     0.032956902f,    0.022433773f,   0.0026891115f,
+                                                   0.08944216f,     -0.0685835f,     0.010513544f,   0.07228705f,
+                                                   0.02032331f,     -0.059686817f,   -0.0005566496f, -0.086984694f,
+                                                   0.040414046f,    -0.1380399f,     0.094208956f,   -0.05722982f,
+                                                   0.012092817f,    -0.04989123f,    -0.086576f,     -0.003399834f,
+                                                   -0.04696032f,    -0.045747425f,   0.10091314f,    0.048676282f,
+                                                   -0.029037097f,   0.031399418f,    -0.0040285117f, 0.047237843f,
+                                                   0.09504992f,     0.041799378f,    -0.049185462f,  -0.031518843f,
+                                                   -0.10516937f,    0.026374253f,    0.10058866f,    -0.0033195973f,
+                                                   -0.041975245f,   0.0073591834f,   0.0033782164f,  -0.004325073f,
+                                                   -0.10167381f,    0.042500053f,    -0.01447153f,   0.06464186f,
+                                                   -0.017142897f,   0.03312627f,     0.009205989f,   0.024138335f,
+                                                   -0.011337001f,   0.035530265f,    -0.010912711f,  0.0706555f,
+                                                   -0.005894094f,   0.051841937f,    -0.1401738f,    -0.02351249f,
+                                                   0.0365468f,      0.07590991f,     0.08838724f,    0.021681072f,
+                                                   -0.10086113f,    0.019608743f,    -0.06195883f,   0.077335775f,
+                                                   0.023646897f,    -0.095322326f,   0.02233014f,    0.09756986f,
+                                                   -0.048691444f,   -0.009579111f,   0.07595467f,    0.11480546f,
+                                                   -0.09801813f,    0.019894179f,    0.08502348f,    0.004032281f,
+                                                   0.037211012f,    0.068537936f,    -0.048005626f,  -0.091520436f,
+                                                   -0.028379958f,   -0.01556313f,    0.06554592f,    -0.045599163f,
+                                                   -0.01672207f,    -0.020169014f,   -0.011877351f,  -0.20212261f,
+                                                   0.010889619f,    0.0047078193f,   0.038385306f,   0.08540671f,
+                                                   -0.017140968f,   -0.0035865551f,  0.016678626f,   0.005633034f,
+                                                   0.015963363f,    0.00871737f,     0.060130805f,   0.028611384f,
+                                                   0.10109069f,     -0.015060172f,   -0.07894427f,   0.06401885f,
+                                                   0.011584063f,    -0.024466386f,   0.0047652307f,  -0.09041358f,
+                                                   0.030737216f,    -0.0046374933f,  0.14215417f,    -0.11823516f,
+                                                   0.019899689f,    0.006106124f,    -0.027092824f,  0.0786356f,
+                                                   0.05052217f,     -0.058925f,      -0.011402121f,  -0.024987547f,
+                                                   -0.0013661642f,  -0.06832946f,    -0.015667673f,  -0.1083353f,
+                                                   -0.00096863037f, -0.06988685f,    -0.053350925f,  -0.027275559f,
+                                                   -0.033664223f,   -0.07978348f,    -0.025200296f,  -0.017207067f,
+                                                   -0.058403496f,   -0.055697463f,   0.005798788f,   0.12965427f,
+                                                   -0.062582195f,   0.0013350133f,   -0.10482091f,   0.0379771f,
+                                                   0.072521195f,    -0.0029455067f,  -0.13797039f,   -0.03628521f,
+                                                   0.013806405f,    -0.017858358f,   -0.01008298f,   -0.07700066f,
+                                                   -0.017081132f,   0.019358726f,    0.0027079724f,  0.004635139f,
+                                                   0.062634714f,    -0.02338735f,    -0.039547626f,  -0.02050681f,
+                                                   0.03385117f,     -0.083611414f,   0.002862572f,   -0.09421313f,
+                                                   0.058618143f,    -0.08598433f,    0.00972939f,    0.023867095f,
+                                                   -0.053934585f,   -0.023203006f,   0.07452513f,    -0.048767887f,
+                                                   -0.07314807f,    -0.056307215f,   -0.10433547f,   -0.06440842f,
+                                                   0.04328182f,     0.04389765f,     -0.020006588f,  -0.09076438f,
+                                                   -0.11652589f,    -0.021705797f,   0.03345259f,    -0.010329105f,
+                                                   -0.025767034f,   0.013057034f,    -0.07316461f,   -0.10145612f,
+                                                   0.06358255f,     0.18531723f,     0.07759293f,    0.12006465f,
+                                                   0.1305557f,      0.058638252f,    -0.03393652f,   0.09622831f,
+                                                   -0.16253184f,    -2.4580743e-06f, 0.079869635f,   -0.070196845f,
+                                                   -0.005644518f,   0.06857898f,     -0.12598175f,   -0.035084512f,
+                                                   0.03156317f,     -0.12794146f,    -0.031963028f,  0.04692781f,
+                                                   0.030070418f,    0.0071660685f,   -0.095516115f,  -0.004643372f,
+                                                   0.040170413f,    -0.062104587f,   -0.0037324072f, 0.0554317f,
+                                                   0.08184801f,     -0.019164372f,   0.06791302f,    0.034257166f,
+                                                   -0.10307039f,    0.021943003f,    0.046745934f,   0.0790918f,
+                                                   -0.0265588f,     -0.007824208f,   0.042546265f,   -0.00977924f,
+                                                   -0.0002440307f,  -0.017384544f,   -0.017990116f,  0.12252321f,
+                                                   -0.014512694f,   -0.08251313f,    0.08861942f,    0.13589665f,
+                                                   0.026351685f,    0.012641483f,    0.07466548f,    0.044301085f,
+                                                   -0.045414884f,   -0.051112458f,   0.03444247f,    -0.08502782f,
+                                                   -0.04106223f,    -0.028126027f,   0.028473156f,   0.10467447f
+            });
+
+    auto recurrentToForgetWeights =
+            MakeTensor<float, 2>(tensorInfo20x16, {-0.057784554f,  -0.026057621f,  -0.068447545f,   -0.022581743f,
+                                                   0.14811787f,    0.10826372f,    0.09471067f,     0.03987225f,
+                                                   -0.0039523416f, 0.00030638507f, 0.053185795f,    0.10572994f,
+                                                   0.08414449f,    -0.022036452f,  -0.00066928595f, -0.09203576f,
+                                                   0.032950465f,   -0.10985798f,   -0.023809856f,   0.0021431844f,
+                                                   -0.02196096f,   -0.00326074f,   0.00058621005f,  -0.074678116f,
+                                                   -0.06193199f,   0.055729095f,   0.03736828f,     0.020123724f,
+                                                   0.061878487f,   -0.04729229f,   0.034919553f,    -0.07585433f,
+                                                   -0.04421272f,   -0.044019096f,  0.085488975f,    0.04058006f,
+                                                   -0.06890133f,   -0.030951202f,  -0.024628663f,   -0.07672815f,
+                                                   0.034293607f,   0.08556707f,    -0.05293577f,    -0.033561368f,
+                                                   -0.04899627f,   0.0241671f,     0.015736353f,    -0.095442444f,
+                                                   -0.029564252f,  0.016493602f,   -0.035026584f,   0.022337519f,
+                                                   -0.026871363f,  0.004780428f,   0.0077918363f,   -0.03601621f,
+                                                   0.016435321f,   -0.03263031f,   -0.09543275f,    -0.047392778f,
+                                                   0.013454138f,   0.028934088f,   0.01685226f,     -0.086110644f,
+                                                   -0.046250615f,  -0.01847454f,   0.047608484f,    0.07339695f,
+                                                   0.034546845f,   -0.04881143f,   0.009128804f,    -0.08802852f,
+                                                   0.03761666f,    0.008096139f,   -0.014454086f,   0.014361001f,
+                                                   -0.023502491f,  -0.0011840804f, -0.07607001f,    0.001856849f,
+                                                   -0.06509276f,   -0.006021153f,  -0.08570962f,    -0.1451793f,
+                                                   0.060212336f,   0.055259194f,   0.06974018f,     0.049454916f,
+                                                   -0.027794661f,  -0.08077226f,   -0.016179763f,   0.1169753f,
+                                                   0.17213494f,    -0.0056326236f, -0.053934924f,   -0.0124349f,
+                                                   -0.11520337f,   0.05409887f,    0.088759385f,    0.0019655675f,
+                                                   0.0042065294f,  0.03881498f,    0.019844765f,    0.041858196f,
+                                                   -0.05695512f,   0.047233116f,   0.038937137f,    -0.06542224f,
+                                                   0.014429736f,   -0.09719407f,   0.13908425f,     -0.05379757f,
+                                                   0.012321099f,   0.082840554f,   -0.029899208f,   0.044217527f,
+                                                   0.059855383f,   0.07711018f,    -0.045319796f,   0.0948846f,
+                                                   -0.011724666f,  -0.0033288454f, -0.033542685f,   -0.04764985f,
+                                                   -0.13873616f,   0.040668588f,   0.034832682f,    -0.015319203f,
+                                                   -0.018715994f,  0.046002675f,   0.0599172f,      -0.043107376f,
+                                                   0.0294216f,     -0.002314414f,  -0.022424703f,   0.0030315618f,
+                                                   0.0014641669f,  0.0029166266f,  -0.11878115f,    0.013738511f,
+                                                   0.12375372f,    -0.0006038222f, 0.029104086f,    0.087442465f,
+                                                   0.052958444f,   0.07558703f,    0.04817258f,     0.044462286f,
+                                                   -0.015213451f,  -0.08783778f,   -0.0561384f,     -0.003008196f,
+                                                   0.047060397f,   -0.002058388f,  0.03429439f,     -0.018839769f,
+                                                   0.024734668f,   0.024614193f,   -0.042046934f,   0.09597743f,
+                                                   -0.0043254104f, 0.04320769f,    0.0064070094f,   -0.0019131786f,
+                                                   -0.02558259f,   -0.022822596f,  -0.023273505f,   -0.02464396f,
+                                                   -0.10991725f,   -0.006240552f,  0.0074488563f,   0.024044557f,
+                                                   0.04383914f,    -0.046476185f,  0.028658995f,    0.060410924f,
+                                                   0.050786525f,   0.009452605f,   -0.0073054377f,  -0.024810238f,
+                                                   0.0052906186f,  0.0066939713f,  -0.0020913032f,  0.014515517f,
+                                                   0.015898481f,   0.021362653f,   -0.030262267f,   0.016587038f,
+                                                   -0.011442813f,  0.041154444f,   -0.007631438f,   -0.03423484f,
+                                                   -0.010977775f,  0.036152758f,   0.0066366293f,   0.11915515f,
+                                                   0.02318443f,    -0.041350313f,  0.021485701f,    -0.10906167f,
+                                                   -0.028218046f,  -0.00954771f,   0.020531068f,    -0.11995105f,
+                                                   -0.03672871f,   0.024019798f,   0.014255957f,    -0.05221243f,
+                                                   -0.00661567f,   -0.04630967f,   0.033188973f,    0.10107534f,
+                                                   -0.014027541f,  0.030796422f,   -0.10270911f,    -0.035999842f,
+                                                   0.15443139f,    0.07684145f,    0.036571592f,    -0.035900835f,
+                                                   -0.0034699554f, 0.06209149f,    0.015920248f,    -0.031122351f,
+                                                   -0.03858649f,   0.01849943f,    0.13872518f,     0.01503974f,
+                                                   0.069941424f,   -0.06948533f,   -0.0088794185f,  0.061282158f,
+                                                   -0.047401894f,  0.03100163f,    -0.041533746f,   -0.10430945f,
+                                                   0.044574402f,   -0.01425562f,   -0.024290353f,   0.034563623f,
+                                                   0.05866852f,    0.023947537f,   -0.09445152f,    0.035450947f,
+                                                   0.02247216f,    -0.0042998926f, 0.061146557f,    -0.10250651f,
+                                                   0.020881841f,   -0.06747029f,   0.10062043f,     -0.0023941975f,
+                                                   0.03532124f,    -0.016341697f,  0.09685456f,     -0.016764693f,
+                                                   0.051808182f,   0.05875331f,    -0.04536488f,    0.001626336f,
+                                                   -0.028892258f,  -0.01048663f,   -0.009793449f,   -0.017093895f,
+                                                   0.010987891f,   0.02357273f,    -0.00010856845f, 0.0099760275f,
+                                                   -0.001845119f,  -0.03551521f,   0.0018358806f,   0.05763657f,
+                                                   -0.01769146f,   0.040995963f,   0.02235177f,     -0.060430344f,
+                                                   0.11475477f,    -0.023854522f,  0.10071741f,     0.0686208f,
+                                                   -0.014250481f,  0.034261297f,   0.047418304f,    0.08562733f,
+                                                   -0.030519066f,  0.0060542435f,  0.014653856f,    -0.038836084f,
+                                                   0.04096551f,    0.032249358f,   -0.08355519f,    -0.026823482f,
+                                                   0.056386515f,   -0.010401743f,  -0.028396193f,   0.08507674f,
+                                                   0.014410365f,   0.020995233f,   0.17040324f,     0.11511526f,
+                                                   0.02459721f,    0.0066619175f,  0.025853224f,    -0.023133837f,
+                                                   -0.081302024f,  0.017264642f,   -0.009585969f,   0.09491168f,
+                                                   -0.051313367f,  0.054532815f,   -0.014298593f,   0.10657464f,
+                                                   0.007076659f,   0.10964551f,    0.0409152f,      0.008275321f,
+                                                   -0.07283536f,   0.07937492f,    0.04192024f,     -0.1075027f
+            });
+
+    auto recurrentToCellWeights =
+            MakeTensor<float, 2>(tensorInfo20x16, {-0.037322544f,   0.018592842f,   0.0056175636f,  -0.06253426f,
+                                                   0.055647098f,    -0.05713207f,   -0.05626563f,   0.005559383f,
+                                                   0.03375411f,     -0.025757805f,  -0.088049285f,  0.06017052f,
+                                                   -0.06570978f,    0.007384076f,   0.035123326f,   -0.07920549f,
+                                                   0.053676967f,    0.044480428f,   -0.07663568f,   0.0071805613f,
+                                                   0.08089997f,     0.05143358f,    0.038261272f,   0.03339287f,
+                                                   -0.027673481f,   0.044746667f,   0.028349208f,   0.020090483f,
+                                                   -0.019443132f,   -0.030755889f,  -0.0040000007f, 0.04465846f,
+                                                   -0.021585021f,   0.0031670958f,  0.0053199246f,  -0.056117613f,
+                                                   -0.10893326f,    0.076739706f,   -0.08509834f,   -0.027997585f,
+                                                   0.037871376f,    0.01449768f,    -0.09002357f,   -0.06111149f,
+                                                   -0.046195522f,   0.0422062f,     -0.005683705f,  -0.1253618f,
+                                                   -0.012925729f,   -0.04890792f,   0.06985068f,    0.037654128f,
+                                                   0.03398274f,     -0.004781977f,  0.007032333f,   -0.031787455f,
+                                                   0.010868644f,    -0.031489216f,  0.09525667f,    0.013939797f,
+                                                   0.0058680447f,   0.0167067f,     0.02668468f,    -0.04797466f,
+                                                   -0.048885044f,   -0.12722108f,   0.035304096f,   0.06554885f,
+                                                   0.00972396f,     -0.039238118f,  -0.05159735f,   -0.11329045f,
+                                                   0.1613692f,      -0.03750952f,   0.06529313f,    -0.071974665f,
+                                                   -0.11769596f,    0.015524369f,   -0.0013754242f, -0.12446318f,
+                                                   0.02786344f,     -0.014179351f,  0.005264273f,   0.14376344f,
+                                                   0.015983658f,    0.03406988f,    -0.06939408f,   0.040699873f,
+                                                   0.02111075f,     0.09669095f,    0.041345075f,   -0.08316494f,
+                                                   -0.07684199f,    -0.045768797f,  0.032298047f,   -0.041805092f,
+                                                   0.0119405f,      0.0061010392f,  0.12652606f,    0.0064572375f,
+                                                   -0.024950314f,   0.11574242f,    0.04508852f,    -0.04335324f,
+                                                   0.06760663f,     -0.027437469f,  0.07216407f,    0.06977076f,
+                                                   -0.05438599f,    0.034033038f,   -0.028602652f,  0.05346137f,
+                                                   0.043184172f,    -0.037189785f,  0.10420091f,    0.00882477f,
+                                                   -0.054019816f,   -0.074273005f,  -0.030617684f,  -0.0028467078f,
+                                                   0.024302477f,    -0.0038869337f, 0.005332455f,   0.0013399826f,
+                                                   0.04361412f,     -0.007001822f,  0.09631092f,    -0.06702025f,
+                                                   -0.042049985f,   -0.035070654f,  -0.04103342f,   -0.10273396f,
+                                                   0.0544271f,      0.037184782f,   -0.13150354f,   -0.0058036847f,
+                                                   -0.008264958f,   0.042035464f,   0.05891794f,    0.029673764f,
+                                                   0.0063542654f,   0.044788733f,   0.054816857f,   0.062257513f,
+                                                   -0.00093483756f, 0.048938446f,   -0.004952862f,  -0.007730018f,
+                                                   -0.04043371f,    -0.017094059f,  0.07229206f,    -0.023670016f,
+                                                   -0.052195564f,   -0.025616996f,  -0.01520939f,   0.045104615f,
+                                                   -0.007376126f,   0.003533447f,   0.006570588f,   0.056037236f,
+                                                   0.12436656f,     0.051817212f,   0.028532185f,   -0.08686856f,
+                                                   0.11868599f,     0.07663395f,    -0.07323171f,   0.03463402f,
+                                                   -0.050708205f,   -0.04458982f,   -0.11590894f,   0.021273347f,
+                                                   0.1251325f,      -0.15313013f,   -0.12224372f,   0.17228661f,
+                                                   0.023029093f,    0.086124025f,   0.006445803f,   -0.03496501f,
+                                                   0.028332196f,    0.04449512f,    -0.042436164f,  -0.026587414f,
+                                                   -0.006041347f,   -0.09292539f,   -0.05678812f,   0.03897832f,
+                                                   0.09465633f,     0.008115513f,   -0.02171956f,   0.08304309f,
+                                                   0.071401566f,    0.019622514f,   0.032163795f,   -0.004167056f,
+                                                   0.02295182f,     0.030739572f,   0.056506045f,   0.004612461f,
+                                                   0.06524936f,     0.059999723f,   0.046395954f,   -0.0045512207f,
+                                                   -0.1335546f,     -0.030136576f,  0.11584653f,    -0.014678886f,
+                                                   0.0020118146f,   -0.09688814f,   -0.0790206f,    0.039770417f,
+                                                   -0.0329582f,     0.07922767f,    0.029322514f,   0.026405897f,
+                                                   0.04207835f,     -0.07073373f,   0.063781224f,   0.0859677f,
+                                                   -0.10925287f,    -0.07011058f,   0.048005477f,   0.03438226f,
+                                                   -0.09606514f,    -0.006669445f,  -0.043381985f,  0.04240257f,
+                                                   -0.06955775f,    -0.06769346f,   0.043903265f,   -0.026784198f,
+                                                   -0.017840602f,   0.024307009f,   -0.040079936f,  -0.019946516f,
+                                                   0.045318738f,    -0.12233574f,   0.026170589f,   0.0074471775f,
+                                                   0.15978073f,     0.10185836f,    0.10298046f,    -0.015476589f,
+                                                   -0.039390966f,   -0.072174534f,  0.0739445f,     -0.1211869f,
+                                                   -0.0347889f,     -0.07943156f,   0.014809798f,   -0.12412325f,
+                                                   -0.0030663363f,  0.039695457f,   0.0647603f,     -0.08291318f,
+                                                   -0.018529687f,   -0.004423833f,  0.0037507233f,  0.084633216f,
+                                                   -0.01514876f,    -0.056505352f,  -0.012800942f,  -0.06994386f,
+                                                   0.012962922f,    -0.031234352f,  0.07029052f,    0.016418684f,
+                                                   0.03618972f,     0.055686004f,   -0.08663945f,   -0.017404709f,
+                                                   -0.054761406f,   0.029065743f,   0.052404847f,   0.020238016f,
+                                                   0.0048197987f,   -0.0214882f,    0.07078733f,    0.013016777f,
+                                                   0.06262858f,     0.009184685f,   0.020785125f,   -0.043904778f,
+                                                   -0.0270329f,     -0.03299152f,   -0.060088247f,  -0.015162964f,
+                                                   -0.001828936f,   0.12642565f,    -0.056757294f,  0.013586685f,
+                                                   0.09232601f,     -0.035886683f,  0.06000002f,    0.05229691f,
+                                                   -0.052580316f,   -0.082029596f,  -0.010794592f,  0.012947712f,
+                                                   -0.036429964f,   -0.085508935f,  -0.13127148f,   -0.017744139f,
+                                                   0.031502828f,    0.036232427f,   -0.031581745f,  0.023051167f,
+                                                   -0.05325106f,    -0.03421577f,   0.028793324f,   -0.034633752f,
+                                                   -0.009881397f,   -0.043551125f,  -0.018609839f,  0.0019097115f,
+                                                   -0.008799762f,   0.056595087f,   0.0022273948f,  0.055752404f
+            });
+
+    auto recurrentToOutputWeights =
+            MakeTensor<float, 2>(tensorInfo20x16, {0.025825322f, -0.05813119f, 0.09495884f, -0.045984812f, -0.01255415f,
+                                                   -0.0026479573f, -0.08196161f, -0.054914974f, -0.0046604523f,
+                                                   -0.029587349f, -0.044576716f, -0.07480124f, -0.082868785f,
+                                                   0.023254942f, 0.027502948f, -0.0039728214f, -0.08683098f,
+                                                   -0.08116779f, -0.014675607f, -0.037924774f, -0.023314456f,
+                                                   -0.007401714f, -0.09255757f, 0.029460307f, -0.08829125f,
+                                                   -0.005139627f, -0.08989442f, -0.0555066f, 0.13596267f,
+                                                   -0.025062224f, -0.048351806f, -0.03850004f, 0.07266485f,
+                                                   -0.022414139f, 0.05940088f, 0.075114764f, 0.09597592f,
+                                                   -0.010211725f, -0.0049794707f, -0.011523867f, -0.025980417f,
+                                                   0.072999895f, 0.11091378f, -0.081685916f, 0.014416728f,
+                                                   0.043229222f, 0.034178585f, -0.07530371f, 0.035837382f,
+                                                   -0.085607f, -0.007721233f, -0.03287832f, -0.043848954f,
+                                                   -0.06404588f, -0.06632928f, -0.073643476f, 0.008214239f,
+                                                   -0.045984086f, 0.039764922f, 0.03474462f, 0.060612556f,
+                                                   -0.080590084f, 0.049127717f, 0.04151091f, -0.030063879f,
+                                                   0.008801774f, -0.023021035f, -0.019558564f, 0.05158114f,
+                                                   -0.010947698f, -0.011825728f, 0.0075720972f, 0.0699727f,
+                                                   -0.0039981045f, 0.069350146f, 0.08799282f, 0.016156472f,
+                                                   0.035502106f, 0.11695009f, 0.006217345f, 0.13392477f,
+                                                   -0.037875112f, 0.025745004f, 0.08940699f, -0.00924166f,
+                                                   0.0046702605f, -0.036598757f, -0.08811812f, 0.10522024f,
+                                                   -0.032441203f, 0.008176899f, -0.04454919f, 0.07058152f,
+                                                   0.0067963637f, 0.039206743f, 0.03259838f, 0.03725492f,
+                                                   -0.09515802f, 0.013326398f, -0.052055415f, -0.025676316f,
+                                                   0.03198509f, -0.015951829f, -0.058556724f, 0.036879618f,
+                                                   0.043357447f, 0.028362012f, -0.05908629f, 0.0059240665f,
+                                                   -0.04995891f, -0.019187413f, 0.0276265f, -0.01628143f, 0.0025863599f,
+                                                   0.08800015f, 0.035250366f, -0.022165963f, -0.07328642f,
+                                                   -0.009415526f, -0.07455109f, 0.11690406f, 0.0363299f,
+                                                   0.07411125f, 0.042103454f, -0.009660886f, 0.019076364f,
+                                                   0.018299393f, -0.046004917f, 0.08891175f, 0.0431396f, -0.026327137f,
+                                                   -0.051502608f, 0.08979574f, -0.051670972f, 0.04940282f,
+                                                   -0.07491107f, -0.021240504f, 0.022596184f, -0.034280192f,
+                                                   0.060163025f, -0.058211457f, -0.051837247f, -0.01349775f,
+                                                   -0.04639988f, -0.035936575f, -0.011681591f, 0.064818054f,
+                                                   0.0073146066f, -0.021745546f, -0.043124277f, -0.06471268f,
+                                                   -0.07053354f, -0.029321948f, -0.05330136f, 0.016933719f,
+                                                   -0.053782392f, 0.13747959f, -0.1361751f, -0.11569455f,
+                                                   0.0033329215f, 0.05693899f, -0.053219706f, 0.063698f,
+                                                   0.07977434f, -0.07924483f, 0.06936997f, 0.0034815092f,
+                                                   -0.007305279f, -0.037325785f, -0.07251102f, -0.033633437f,
+                                                   -0.08677009f, 0.091591336f, -0.14165086f, 0.021752775f,
+                                                   0.019683983f, 0.0011612234f, -0.058154266f, 0.049996935f,
+                                                   0.0288841f, -0.0024567875f, -0.14345716f, 0.010955264f, -0.10234828f,
+                                                   0.1183656f, -0.0010731248f, -0.023590032f, -0.072285876f, -0.0724771f,
+                                                   -0.026382286f, -0.0014920527f, 0.042667855f, 0.0018776858f,
+                                                   0.02986552f, 0.009814309f, 0.0733756f, 0.12289186f,
+                                                   0.018043943f, -0.0458958f, 0.049412545f, 0.033632483f,
+                                                   0.05495232f, 0.036686596f, -0.013781798f, -0.010036754f,
+                                                   0.02576849f, -0.08307328f, 0.010112348f, 0.042521734f,
+                                                   -0.05869831f, -0.071689695f, 0.03876447f, -0.13275425f, -0.0352966f,
+                                                   -0.023077697f, 0.10285965f, 0.084736146f, 0.15568255f,
+                                                   -0.00040734606f, 0.027835453f, -0.10292561f, -0.032401145f,
+                                                   0.10053256f, -0.026142767f, -0.08271222f, -0.0030240538f,
+                                                   -0.016368777f, 0.1070414f, 0.042672627f, 0.013456989f,
+                                                   -0.0437609f, -0.022309763f, 0.11576483f, 0.04108048f,
+                                                   0.061026827f, -0.0190714f, -0.0869359f, 0.037901703f, 0.0610107f,
+                                                   0.07202949f, 0.01675338f, 0.086139716f, -0.08795751f,
+                                                   -0.014898893f, -0.023771819f, -0.01965048f, 0.007955471f,
+                                                   -0.043740474f, 0.03346837f, -0.10549954f, 0.090567775f,
+                                                   0.042013682f, -0.03176985f, 0.12569028f, -0.02421228f,
+                                                   -0.029526481f, 0.023851605f, 0.031539805f, 0.05292009f,
+                                                   -0.02344001f, -0.07811758f, -0.08834428f, 0.10094801f,
+                                                   0.16594367f, -0.06861939f, -0.021256343f, -0.041093912f,
+                                                   -0.06669611f, 0.035498552f, 0.021757556f, -0.09302526f,
+                                                   -0.015403468f, -0.06614931f, -0.051798206f, -0.013874718f,
+                                                   0.03630673f, 0.010412845f, -0.08077351f, 0.046185967f,
+                                                   0.0035662893f, 0.03541868f, -0.094149634f, -0.034814864f,
+                                                   0.003128424f, -0.020674974f, -0.03944324f, -0.008110165f,
+                                                   -0.11113267f, 0.08484226f, 0.043586485f, 0.040582247f,
+                                                   0.0968012f, -0.065249965f, -0.028036479f, 0.0050708856f,
+                                                   0.0017462453f, 0.0326779f, 0.041296225f, 0.09164146f,
+                                                   -0.047743853f, -0.015952192f, -0.034451712f, 0.084197424f,
+                                                   -0.05347844f, -0.11768019f, 0.085926116f, -0.08251791f,
+                                                   -0.045081906f, 0.0948852f, 0.068401024f, 0.024856757f,
+                                                   0.06978981f, -0.057309967f, -0.012775832f, -0.0032452994f,
+                                                   0.01977615f, -0.041040014f, -0.024264973f, 0.063464895f, 0.05431621f
+            });
+
+    auto cellToInputWeights =
+            MakeTensor<float, 1>(tensorInfo20, {0.040369894f, 0.030746894f,  0.24704495f,  0.018586371f, -0.037586458f,
+                                                -0.15312155f, -0.11812848f,  -0.11465643f, 0.20259799f,   0.11418174f,
+                                                -0.10116027f, -0.011334949f, 0.12411352f, -0.076769054f, -0.052169047f,
+                                                0.21198851f,  -0.38871562f,  -0.09061183f, -0.09683246f,  -0.21929175f
+            });
+
+    auto cellToForgetWeights =
+            MakeTensor<float, 1>(tensorInfo20, {-0.01998659f, -0.15568835f, -0.24248174f, -0.012770197f, 0.041331276f,
+                                                -0.072311886f, -0.052123554f, -0.0066330447f, -0.043891653f, 0.036225766f,
+                                                -0.047248036f, 0.021479502f, 0.033189066f, 0.11952997f, -0.020432774f,
+                                                0.64658105f, -0.06650122f, -0.03467612f, 0.095340036f, 0.23647355f
+            });
+
+    auto cellToOutputWeights =
+            MakeTensor<float, 1>(tensorInfo20, {0.08286371f,  -0.08261836f, -0.51210177f, 0.002913762f, 0.17764764f,
+                                                -0.5495371f,  -0.08460716f, -0.24552552f, 0.030037103f, 0.04123544f,
+                                                -0.11940523f, 0.007358328f, 0.1890978f,   0.4833202f,   -0.34441817f,
+                                                0.36312827f,  -0.26375428f, 0.1457655f,   -0.19724406f, 0.15548733f
+            });
+
+    auto projectionWeights =
+            MakeTensor<float, 2>(tensorInfo16x20,
+                                 {-0.009802181f,  0.09401916f,    0.0717386f,     -0.13895074f,  0.09641832f,
+                                  0.060420845f,   0.08539281f,    0.054285463f,   0.061395317f,  0.034448683f,
+                                  -0.042991187f,  0.019801661f,   -0.16840284f,   -0.015726732f, -0.23041931f,
+                                  -0.024478018f,  -0.10959692f,   -0.013875541f,  0.18600968f,   -0.061274476f,
+                                  0.0138165f,     -0.08160894f,   -0.07661644f,   0.032372914f,  0.16169067f,
+                                  0.22465782f,    -0.03993472f,   -0.004017731f,  0.08633481f,   -0.28869787f,
+                                  0.08682067f,    0.17240396f,    0.014975425f,   0.056431185f,  0.031037588f,
+                                  0.16702051f,    0.0077946745f,  0.15140012f,    0.29405436f,   0.120285f,
+                                  -0.188994f,     -0.027265169f,  0.043389652f,   -0.022061434f, 0.014777949f,
+                                  -0.20203483f,   0.094781205f,   0.19100232f,    0.13987629f,   -0.036132768f,
+                                  -0.06426278f,   -0.05108664f,   0.13221376f,    0.009441198f,  -0.16715929f,
+                                  0.15859416f,    -0.040437475f,  0.050779544f,   -0.022187516f, 0.012166504f,
+                                  0.027685808f,   -0.07675938f,   -0.0055694645f, -0.09444123f,  0.0046453946f,
+                                  0.050794356f,   0.10770313f,    -0.20790008f,   -0.07149004f,  -0.11425117f,
+                                  0.008225835f,   -0.035802525f,  0.14374903f,    0.15262283f,   0.048710253f,
+                                  0.1847461f,     -0.007487823f,  0.11000021f,    -0.09542012f,  0.22619456f,
+                                  -0.029149994f,  0.08527916f,    0.009043713f,   0.0042746216f, 0.016261552f,
+                                  0.022461696f,   0.12689082f,    -0.043589946f,  -0.12035478f,  -0.08361797f,
+                                  -0.050666027f,  -0.1248618f,    -0.1275799f,    -0.071875185f, 0.07377272f,
+                                  0.09944291f,    -0.18897448f,   -0.1593054f,    -0.06526116f,  -0.040107165f,
+                                  -0.004618631f,  -0.067624845f,  -0.007576253f,  0.10727444f,   0.041546922f,
+                                  -0.20424393f,   0.06907816f,    0.050412357f,   0.00724631f,   0.039827548f,
+                                  0.12449835f,    0.10747581f,    0.13708383f,    0.09134148f,   -0.12617786f,
+                                  -0.06428341f,   0.09956831f,    0.1208086f,     -0.14676677f,  -0.0727722f,
+                                  0.1126304f,     0.010139365f,   0.015571211f,   -0.038128063f, 0.022913318f,
+                                  -0.042050496f,  0.16842307f,    -0.060597885f,  0.10531834f,   -0.06411776f,
+                                  -0.07451711f,   -0.03410368f,   -0.13393489f,   0.06534304f,   0.003620307f,
+                                  0.04490757f,    0.05970546f,    0.05197996f,    0.02839995f,   0.10434969f,
+                                  -0.013699693f,  -0.028353551f,  -0.07260381f,   0.047201227f,  -0.024575593f,
+                                  -0.036445823f,  0.07155557f,    0.009672501f,   -0.02328883f,  0.009533515f,
+                                  -0.03606021f,   -0.07421458f,   -0.028082801f,  -0.2678904f,   -0.13221288f,
+                                  0.18419984f,    -0.13012612f,   -0.014588381f,  -0.035059117f, -0.04824723f,
+                                  0.07830115f,    -0.056184657f,  0.03277091f,    0.025466874f,  0.14494097f,
+                                  -0.12522776f,   -0.098633975f,  -0.10766018f,   -0.08317623f,  0.08594209f,
+                                  0.07749552f,    0.039474737f,   0.1776665f,     -0.07409566f,  -0.0477268f,
+                                  0.29323658f,    0.10801441f,    0.1154011f,     0.013952499f,  0.10739139f,
+                                  0.10708251f,    -0.051456142f,  0.0074137426f,  -0.10430189f,  0.10034707f,
+                                  0.045594677f,   0.0635285f,     -0.0715442f,    -0.089667566f, -0.10811871f,
+                                  0.00026344223f, 0.08298446f,    -0.009525053f,  0.006585689f,  -0.24567553f,
+                                  -0.09450807f,   0.09648481f,    0.026996298f,   -0.06419476f,  -0.04752702f,
+                                  -0.11063944f,   -0.23441927f,   -0.17608605f,   -0.052156363f, 0.067035615f,
+                                  0.19271925f,    -0.0032889997f, -0.043264326f,  0.09663576f,   -0.057112187f,
+                                  -0.10100678f,   0.0628376f,     0.04447668f,    0.017961001f,  -0.10094388f,
+                                  -0.10190601f,   0.18335468f,    0.10494553f,    -0.052095775f, -0.0026118709f,
+                                  0.10539724f,    -0.04383912f,   -0.042349473f,  0.08438151f,   -0.1947263f,
+                                  0.02251204f,    0.11216432f,    -0.10307853f,   0.17351969f,   -0.039091777f,
+                                  0.08066188f,    -0.00561982f,   0.12633002f,    0.11335965f,   -0.0088127935f,
+                                  -0.019777594f,  0.06864014f,    -0.059751723f,  0.016233567f,  -0.06894641f,
+                                  -0.28651384f,   -0.004228674f,  0.019708522f,   -0.16305895f,  -0.07468996f,
+                                  -0.0855457f,    0.099339016f,   -0.07580735f,   -0.13775392f,  0.08434318f,
+                                  0.08330512f,    -0.12131499f,   0.031935584f,   0.09180414f,   -0.08876437f,
+                                  -0.08049874f,   0.008753825f,   0.03498998f,    0.030215185f,  0.03907079f,
+                                  0.089751154f,   0.029194152f,   -0.03337423f,   -0.019092513f, 0.04331237f,
+                                  0.04299654f,    -0.036394123f,  -0.12915532f,   0.09793732f,   0.07512415f,
+                                  -0.11319543f,   -0.032502122f,  0.15661901f,    0.07671967f,   -0.005491124f,
+                                  -0.19379048f,   -0.218606f,     0.21448623f,    0.017840758f,  0.1416943f,
+                                  -0.07051762f,   0.19488361f,    0.02664691f,    -0.18104725f,  -0.09334311f,
+                                  0.15026465f,    -0.15493552f,   -0.057762887f,  -0.11604192f,  -0.262013f,
+                                  -0.01391798f,   0.012185008f,   0.11156489f,    -0.07483202f,  0.06693364f,
+                                  -0.26151478f,   0.046425626f,   0.036540434f,   -0.16435726f,  0.17338543f,
+                                  -0.21401681f,   -0.11385144f,   -0.08283257f,   -0.069031075f, 0.030635102f,
+                                  0.010969227f,   0.11109743f,    0.010919218f,   0.027526086f,  0.13519906f,
+                                  0.01891392f,    -0.046839405f,  -0.040167913f,  0.017953383f,  -0.09700955f,
+                                  0.0061885654f,  -0.07000971f,   0.026893595f,   -0.038844477f, 0.14543656f
+                                 });
+
+    std::vector<float> projectionBiasVector(outputSize, 0.f);
+    auto projectionBias = MakeTensor<float,1>(tensorInfo16, projectionBiasVector);
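+    // The projection bias is all zeros here, so the projection step is effectively a plain matrix multiplication.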
+
+    armnn::ScopedCpuTensorHandle inputToInputWeightsTensor(tensorInfo20x5);
+    armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo20x5);
+    armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo20x5);
+    armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo20x5);
+    armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo20x16);
+    armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo20x16);
+    armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo20x16);
+    armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo20x16);
+    armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo20);
+    armnn::ScopedCpuTensorHandle inputGateBiasTensor(tensorInfo20);
+    armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfo20);
+    armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfo20);
+    armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfo20);
+    armnn::ScopedCpuTensorHandle cellToForgetWeightsTensor(tensorInfo20);
+    armnn::ScopedCpuTensorHandle cellToOutputWeightsTensor(tensorInfo20);
+    armnn::ScopedCpuTensorHandle projectionWeightsTensor(tensorInfo16x20);
+    armnn::ScopedCpuTensorHandle projectionBiasTensor(tensorInfo16);
+
+    AllocateAndCopyDataToITensorHandle(&inputToInputWeightsTensor, &inputToInputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToInputWeightsTensor, &recurrentToInputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&cellToInputWeightsTensor, &cellToInputWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&inputGateBiasTensor, &inputGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]);
+    AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&cellToForgetWeightsTensor, &cellToForgetWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&cellToOutputWeightsTensor, &cellToOutputWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&projectionWeightsTensor, &projectionWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&projectionBiasTensor, &projectionBias[0]);
+
+    data.m_InputToInputWeights = &inputToInputWeightsTensor;
+    data.m_InputToForgetWeights = &inputToForgetWeightsTensor;
+    data.m_InputToCellWeights = &inputToCellWeightsTensor;
+    data.m_InputToOutputWeights = &inputToOutputWeightsTensor;
+    data.m_RecurrentToInputWeights = &recurrentToInputWeightsTensor;
+    data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor;
+    data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor;
+    data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor;
+    data.m_CellToInputWeights = &cellToInputWeightsTensor;
+    data.m_InputGateBias = &inputGateBiasTensor;
+    data.m_ForgetGateBias = &forgetGateBiasTensor;
+    data.m_CellBias = &cellBiasTensor;
+    data.m_OutputGateBias = &outputGateBiasTensor;
+    data.m_CellToForgetWeights = &cellToForgetWeightsTensor;
+    data.m_CellToOutputWeights = &cellToOutputWeightsTensor;
+    data.m_ProjectionWeights = &projectionWeightsTensor;
+    data.m_ProjectionBias = &projectionBiasTensor;
+
+    // Flags to set test configuration
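+    // m_ActivationFunc follows the TfLite/Android NN activation encoding, where 4 selects TanH.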
+    data.m_Parameters.m_ActivationFunc = 4;
+    data.m_Parameters.m_CifgEnabled = false;
+    data.m_Parameters.m_PeepholeEnabled = true;
+    data.m_Parameters.m_ProjectionEnabled = true;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateLstm(data, info);
+    inputHandle->Allocate();
+    outputStateInHandle->Allocate();
+    cellStateInHandle->Allocate();
+
+    scratchHandle->Allocate();
+    outputStateOutHandle->Allocate();
+    cellStateOutHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+    CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]);
+    CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> LstmLayerWithCifgWithPeepholeNoProjectionTestImpl(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const boost::multi_array<T, 2>& input,
+        const boost::multi_array<T, 2>& outputExpected,
+        float qScale = 0.0f,
+        int32_t qOffset = 0,
+        armnn::DataType constantDataType = armnn::DataType::Float32)
+{
+    bool cifgEnabled = true;
+    bool peepholeEnabled = true;
+    bool projectionEnabled = false;
+    // Derive dimensions from the test data; the full set of LSTM inputs and outputs is constructed below.
+    unsigned int batchSize = boost::numeric_cast<unsigned int>(input.shape()[0]);
+    unsigned int inputSize = boost::numeric_cast<unsigned int>(input.shape()[1]);
+
+    unsigned int outputSize = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
+
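+    // No projection layer is used in this test, so the cell size matches the output size.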
+    const unsigned int cellSize = outputSize;
+
+    // Decide the shape of all input tensors
+    armnn::TensorInfo inputTensorInfo({batchSize, inputSize}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo cellStateInTensorInfo({batchSize, cellSize}, ArmnnType, qScale, qOffset);
+
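+    // With CIFG the input gate is coupled to the forget gate, so the scratch buffer holds 3 gate buffers instead of 4.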
+    unsigned int scratchBufferSize = cifgEnabled ? cellSize * 3 : cellSize * 4;
+    armnn::TensorInfo scratchBufferTensorInfo({batchSize, scratchBufferSize}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo cellStateOutTensorInfo({batchSize, cellSize}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+
+    // List of inputs
+    std::vector<float> inputData;
+    inputData.assign(input.data(), input.data() + batchSize * inputSize);
+    auto inputTensor = MakeTensor<float,2>(inputTensorInfo, inputData);
+
+    std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
+    auto outputStateInTensor = MakeTensor<float, 2>(outputStateInTensorInfo, outputStateInVector);
+
+    std::vector<float> cellStateInVector(batchSize * cellSize, 0.f);
+    auto cellStateInTensor = MakeTensor<float, 2>(cellStateInTensorInfo, cellStateInVector);
+
+    // Prepare all the weights in the descriptor for LSTM
+    armnn::LstmQueueDescriptor data;
+    armnn::TensorInfo tensorInfoInput({cellSize, inputSize}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfoOutput({cellSize, outputSize}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfoNumUnits({cellSize}, constantDataType, qScale, qOffset);
+
+    auto inputToCellWeights =
+            MakeTensor<float, 2>(tensorInfoInput, {-0.49770179f, -0.27711356f, -0.09624726f, 0.05100781f,
+                                                   0.04717243f, 0.48944736f, -0.38535351f, -0.17212132f});
+    auto inputToForgetWeights =
+            MakeTensor<float, 2>(tensorInfoInput, {-0.55291498f, -0.42866567f, 0.13056988f, -0.3633365f,
+                                                   -0.22755712f, 0.28253698f, 0.24407166f, 0.33826375f});
+    auto inputToOutputWeights =
+            MakeTensor<float, 2>(tensorInfoInput, {0.10725588f, -0.02335852f, -0.55932593f, -0.09426838f,
+                                                   -0.44257352f, 0.54939759f, 0.01533556f, 0.42751634f});
+    auto cellBias = MakeTensor<float, 1>(tensorInfoNumUnits, {0.f, 0.f, 0.f, 0.f});
+    auto forgetGateBias = MakeTensor<float, 1>(tensorInfoNumUnits, {1.f, 1.f, 1.f, 1.f});
+    auto outputGateBias = MakeTensor<float, 1>(tensorInfoNumUnits, {0.f, 0.f, 0.f, 0.f});
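+    // Note that the forget-gate bias is set to 1.0, the common LSTM initialisation that biases the gate towards retaining state.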
+
+    auto recurrentToCellWeights = MakeTensor<float, 2>(tensorInfoOutput,
+                {0.54066205f, -0.32668582f, -0.43562764f, -0.56094903f, 0.42957711f,
+                 0.01841056f, -0.32764608f, -0.33027974f, -0.10826075f, 0.20675004f,
+                 0.19069612f, -0.03026325f, -0.54532051f, 0.33003211f, 0.44901288f,
+                 0.21193194f});
+    auto recurrentToForgetWeights = MakeTensor<float, 2>(tensorInfoOutput,
+                 {-0.13832897f, -0.0515101f, -0.2359007f, -0.16661474f, -0.14340827f,
+                  0.36986142f, 0.23414481f, 0.55899f, 0.10798943f, -0.41174671f, 0.17751795f,
+                  -0.34484994f, -0.35874045f, -0.11352962f, 0.27268326f, 0.54058349f});
+
+    auto recurrentToOutputWeights = MakeTensor<float, 2>(tensorInfoOutput,
+                {0.41613156f, 0.42610586f, -0.16495961f, -0.5663873f, 0.30579174f, -0.05115908f,
+                 -0.33941799f, 0.23364776f, 0.11178309f, 0.09481031f, -0.26424935f, 0.46261835f,
+                 0.50248802f, 0.26114327f, -0.43736315f, 0.33149987f});
+
+    auto cellToForgetWeights = MakeTensor<float, 1>(tensorInfoNumUnits,
+                {0.47485286f, -0.51955009f, -0.24458408f, 0.31544167f});
+    auto cellToOutputWeights = MakeTensor<float, 1>(tensorInfoNumUnits,
+                {-0.17135078f, 0.82760304f, 0.85573703f, -0.77109635f});
+
+    armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfoInput);
+    armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfoInput);
+    armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfoInput);
+
+    armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfoNumUnits);
+    armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfoNumUnits);
+    armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfoNumUnits);
+
+    armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfoOutput);
+    armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfoOutput);
+    armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfoOutput);
+
+    armnn::ScopedCpuTensorHandle cellToForgetWeightsTensor(tensorInfoNumUnits);
+    armnn::ScopedCpuTensorHandle cellToOutputWeightsTensor(tensorInfoNumUnits);
+
+    AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]);
+
+    AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]);
+    AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]);
+
+    AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]);
+
+    AllocateAndCopyDataToITensorHandle(&cellToForgetWeightsTensor, &cellToForgetWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&cellToOutputWeightsTensor, &cellToOutputWeights[0]);
+
+    data.m_InputToCellWeights = &inputToCellWeightsTensor;
+    data.m_InputToForgetWeights = &inputToForgetWeightsTensor;
+    data.m_InputToOutputWeights = &inputToOutputWeightsTensor;
+
+    data.m_CellBias = &cellBiasTensor;
+    data.m_ForgetGateBias = &forgetGateBiasTensor;
+    data.m_OutputGateBias = &outputGateBiasTensor;
+
+    data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor;
+    data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor;
+    data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor;
+
+    data.m_CellToForgetWeights = &cellToForgetWeightsTensor;
+    data.m_CellToOutputWeights = &cellToOutputWeightsTensor;
+
+    // Other parameters for the descriptor
+    data.m_Parameters.m_CifgEnabled = cifgEnabled;
+    data.m_Parameters.m_ProjectionEnabled = projectionEnabled;
+    data.m_Parameters.m_PeepholeEnabled = peepholeEnabled;
+
+    data.m_Parameters.m_ActivationFunc = 4;
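+    // A clipping threshold of 0.0 leaves the cell state and the projection output unclipped.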
+    data.m_Parameters.m_ClippingThresProj = 0.0;
+    data.m_Parameters.m_ClippingThresCell = 0.0;
+
+    // List of outputs
+    std::vector<float> scratchBufferVector(batchSize * scratchBufferSize, 0.f);
+    auto scratchBufferTensor = MakeTensor<float,2>(scratchBufferTensorInfo, scratchBufferVector);
+    LayerTestResult<T, 2> ret0(scratchBufferTensorInfo);
+
+    // Output state for a certain time step
+    std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
+    auto outputStateOutTensor = MakeTensor<float,2>(outputStateOutTensorInfo, outputStateOutVector);
+    LayerTestResult<T, 2> ret1(outputStateOutTensorInfo);
+
+    // Cell state for a certain time step
+    std::vector<float> cellStateOutVector(batchSize * cellSize, 0.f);
+    auto cellStateOutTensor = MakeTensor<float,2>(cellStateOutTensorInfo, cellStateOutVector);
+    LayerTestResult<T, 2> ret2(cellStateOutTensorInfo);
+
+    // Output for a certain time step
+    std::vector<float> outputVector(batchSize * outputSize, 0.f);
+    auto outputTensor = MakeTensor<float, 2>(outputTensorInfo, outputVector);
+    std::vector<float> outputData;
+    outputData.assign(outputExpected.data(), outputExpected.data() + batchSize * outputSize);
+    LayerTestResult<T, 2> ret3(outputTensorInfo);
+    ret3.outputExpected = MakeTensor<float, 2>(outputTensorInfo, outputData);
+
+    // Prepare the inputs and outputs for the workload
+    std::unique_ptr<armnn::ITensorHandle> inputHandle =
+            workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputStateInHandle =
+            workloadFactory.CreateTensorHandle(outputStateInTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> cellStateInHandle =
+            workloadFactory.CreateTensorHandle(cellStateInTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> scratchBufferHandle =
+            workloadFactory.CreateTensorHandle(scratchBufferTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputStateOutHandle =
+            workloadFactory.CreateTensorHandle(outputStateOutTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle =
+            workloadFactory.CreateTensorHandle(cellStateOutTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle =
+            workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get());
+    AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get());
+
+    AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchBufferHandle.get());
+    AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get());
+    AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateLstm(data, info);
+
+    inputHandle->Allocate();
+    outputStateInHandle->Allocate();
+    cellStateInHandle->Allocate();
+
+    scratchBufferHandle->Allocate();
+    outputStateOutHandle->Allocate();
+    cellStateOutHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+    CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]);
+    CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]);
+
+    CopyDataToITensorHandle(scratchBufferHandle.get(), &scratchBufferTensor[0][0]);
+    CopyDataToITensorHandle(outputStateOutHandle.get(), &outputStateOutTensor[0][0]);
+    CopyDataToITensorHandle(cellStateOutHandle.get(), &cellStateOutTensor[0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret0.output[0][0], scratchBufferHandle.get());
+    CopyDataFromITensorHandle(&ret1.output[0][0], outputStateOutHandle.get());
+    CopyDataFromITensorHandle(&ret2.output[0][0], cellStateOutHandle.get());
+    CopyDataFromITensorHandle(&ret3.output[0][0], outputHandle.get());
+
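+    // Only the actual output (ret3) carries expected data; the scratch buffer and state outputs are produced but not validated.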
+    return ret3;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2>
+LstmLayerNoCifgWithPeepholeWithProjectionWithLayerNormTestImpl(armnn::IWorkloadFactory& workloadFactory,
+                                                  const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                                  const boost::multi_array<T, 2>& input,
+                                                  const boost::multi_array<T, 2>& outputExpected,
+                                                  float qScale = 0.0f,
+                                                  int32_t qOffset = 0,
+                                                  armnn::DataType constantDataType = armnn::DataType::Float32)
+{
+    unsigned int batchSize = 2;
+    unsigned int outputSize = 3;
+    unsigned int inputSize = 5;
+    unsigned int numUnits = 4;
+
+    armnn::TensorInfo inputTensorInfo({batchSize, inputSize}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo cellStateInTensorInfo({batchSize, numUnits}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+
+    // Scratch buffer size without CIFG [batchSize, numUnits * 4]
+    armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 4}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, ArmnnType, qScale, qOffset);
+
+    LayerTestResult<T, 2> ret(outputTensorInfo);
+
+    std::vector<float> inputVector;
+    inputVector.assign(input.data(), input.data() + (batchSize * inputSize));
+    auto inputTensor = MakeTensor<float,2>(inputTensorInfo, inputVector);
+
+    std::vector<float> cellStateInVector(batchSize * numUnits, 0.f);
+    auto cellStateInTensor = MakeTensor<float,2>(cellStateInTensorInfo, cellStateInVector);
+
+    std::vector<float> outputStateInVector(batchSize * outputSize, 0.f);
+    auto outputStateInTensor = MakeTensor<float,2>(outputStateInTensorInfo, outputStateInVector);
+
+    std::vector<float> scratchBufferVector(batchSize * numUnits * 4, 0.f);
+    auto scratchBufferTensor = MakeTensor<float,2>(scratchBufferTensorInfo, scratchBufferVector);
+
+    std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f);
+    auto outputStateOutTensor = MakeTensor<float,2>(outputStateOutTensorInfo, outputStateOutVector);
+
+    std::vector<float> cellStateOutVector(batchSize * numUnits, 0.f);
+    auto cellStateOutTensor = MakeTensor<float,2>(cellStateOutTensorInfo, cellStateOutVector);
+
+    std::vector<float> outputVector;
+    outputVector.assign(outputExpected.data(), outputExpected.data() + (batchSize * outputSize));
+    ret.outputExpected = MakeTensor<float, 2>(outputTensorInfo, outputVector);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> cellStateInHandle =
+            workloadFactory.CreateTensorHandle(cellStateInTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputStateInHandle =
+            workloadFactory.CreateTensorHandle(outputStateInTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> scratchHandle = workloadFactory.CreateTensorHandle(scratchBufferTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputStateOutHandle =
+            workloadFactory.CreateTensorHandle(outputStateOutTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle =
+            workloadFactory.CreateTensorHandle(cellStateOutTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::LstmQueueDescriptor data;
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get());
+    AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get());
+
+    AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchHandle.get());
+    AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get());
+    AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    armnn::TensorInfo tensorInfo3({outputSize}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfo4({numUnits}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfo4x5({numUnits, inputSize}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfo4x3({numUnits, outputSize}, constantDataType, qScale, qOffset);
+    armnn::TensorInfo tensorInfo3x4({outputSize, numUnits}, constantDataType, qScale, qOffset);
+
+    auto inputToInputWeights =
+            MakeTensor<float, 2>(tensorInfo4x5, { 0.5f,  0.6f,  0.7f, -0.8f, -0.9f,
+                                                  0.1f,  0.2f,  0.3f, -0.4f,  0.5f,
+                                                 -0.8f,  0.7f, -0.6f,  0.5f, -0.4f,
+                                                 -0.5f, -0.4f, -0.3f, -0.2f, -0.1f});  //{numUnits, inputSize}
+
+    auto inputToForgetWeights =
+            MakeTensor<float, 2>(tensorInfo4x5, {-0.6f, -0.1f,  0.3f,  0.2f,  0.9f,
+                                                 -0.5f, -0.2f, -0.4f,  0.3f, -0.8f,
+                                                 -0.4f,  0.3f, -0.5f, -0.4f, -0.6f,
+                                                  0.3f, -0.4f, -0.6f, -0.5f, -0.5f});  //{numUnits, inputSize}
+
+    auto inputToCellWeights =
+            MakeTensor<float, 2>(tensorInfo4x5, {-0.4f, -0.3f, -0.2f, -0.1f, -0.5f,
+                                                  0.5f, -0.2f, -0.3f, -0.2f, -0.6f,
+                                                  0.6f, -0.1f, -0.4f, -0.3f, -0.7f,
+                                                  0.7f, -0.9f, -0.5f,  0.8f,  0.6f});  //{numUnits, inputSize}
+
+    auto inputToOutputWeights =
+            MakeTensor<float, 2>(tensorInfo4x5, {-0.8f, -0.4f, -0.2f, -0.9f, -0.1f,
+                                                 -0.7f,  0.3f, -0.3f, -0.8f, -0.2f,
+                                                  0.6f, -0.2f,  0.4f, -0.7f, -0.3f,
+                                                 -0.5f,  0.1f,  0.5f, -0.6f, -0.4f}); //{numUnits, inputSize}
+
+    auto inputGateBias =
+            MakeTensor<float, 1>(tensorInfo4, {0.03f, 0.15f, 0.22f, 0.38f});  //{numUnits}
+
+    auto forgetGateBias =
+            MakeTensor<float, 1>(tensorInfo4, {0.1f, -0.3f, -0.2f, 0.1f});    //{numUnits}
+
+    auto cellBias =
+            MakeTensor<float, 1>(tensorInfo4, {-0.05f, 0.72f, 0.25f, 0.08f}); //{numUnits}
+
+    auto outputGateBias =
+            MakeTensor<float, 1>(tensorInfo4, {0.05f, -0.01f, 0.2f, 0.1f});   //{numUnits}
+
+    auto recurrentToInputWeights =
+            MakeTensor<float, 2>(tensorInfo4x3, {-0.2f, -0.3f,  0.4f,
+                                                  0.1f, -0.5f,  0.9f,
+                                                 -0.2f, -0.3f, -0.7f,
+                                                 0.05f, -0.2f, -0.6f});  //{numUnits, outputSize}
+
+    auto recurrentToCellWeights =
+            MakeTensor<float, 2>(tensorInfo4x3, {-0.3f,  0.2f,   0.1f,
+                                                 -0.3f,  0.8f, -0.08f,
+                                                 -0.2f,  0.3f,   0.8f,
+                                                 -0.6f, -0.1f,   0.2f}); //{numUnits, outputSize}
+
+    auto recurrentToForgetWeights =
+            MakeTensor<float, 2>(tensorInfo4x3, {-0.5f, -0.3f, -0.5f,
+                                                 -0.2f,  0.6f,  0.4f,
+                                                  0.9f,  0.3f, -0.1f,
+                                                  0.2f,  0.5f,  0.2f});  //{numUnits, outputSize}
+
+    auto recurrentToOutputWeights =
+            MakeTensor<float, 2>(tensorInfo4x3, { 0.3f, -0.1f,  0.1f,
+                                                 -0.2f, -0.5f, -0.7f,
+                                                 -0.2f, -0.6f, -0.1f,
+                                                 -0.4f, -0.7f, -0.2f});  //{numUnits, outputSize}
+
+    auto cellToInputWeights =
+            MakeTensor<float, 1>(tensorInfo4, {0.05f, 0.1f, 0.25f, 0.15f});      //{numUnits}
+
+    auto cellToForgetWeights =
+            MakeTensor<float, 1>(tensorInfo4, {-0.02f, -0.15f, -0.25f, -0.03f}); //{numUnits}
+
+    auto cellToOutputWeights =
+            MakeTensor<float, 1>(tensorInfo4, {0.1f, -0.1f, -0.5f, 0.05f});      //{numUnits}
+
+    auto projectionWeights =
+            MakeTensor<float, 2>(tensorInfo3x4,
+                                 {-0.1f, 0.2f, 0.01f, -0.2f,
+                                   0.1f, 0.5f,  0.3f, 0.08f,
+                                  0.07f, 0.2f, -0.4f,  0.2f}); //{outputSize, numUnits}
+
+    std::vector<float> projectionBiasVector(outputSize, 0.f);
+    auto projectionBias = MakeTensor<float,1>(tensorInfo3, projectionBiasVector); //{outputSize}
+
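+    // Layer normalization weights: per-gate scale factors applied to the normalized gate pre-activations.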
+    auto inputLayerNormWeights =
+            MakeTensor<float, 1>(tensorInfo4, {0.1f, 0.2f, 0.3f, 0.5f}); //{numUnits}
+
+    auto forgetLayerNormWeights =
+            MakeTensor<float, 1>(tensorInfo4, {0.2f, 0.2f, 0.4f, 0.3f}); //{numUnits}
+
+    auto cellLayerNormWeights =
+            MakeTensor<float, 1>(tensorInfo4, {0.7f, 0.2f, 0.3f, 0.8f}); //{numUnits}
+
+    auto outputLayerNormWeights =
+            MakeTensor<float, 1>(tensorInfo4, {0.6f, 0.2f, 0.2f, 0.5f}); //{numUnits}
+
+    armnn::ScopedCpuTensorHandle inputToInputWeightsTensor(tensorInfo4x5);
+    armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo4x5);
+    armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo4x5);
+    armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo4x5);
+    armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo4x3);
+    armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo4x3);
+    armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo4x3);
+    armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo4x3);
+    armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle inputGateBiasTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle cellToForgetWeightsTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle cellToOutputWeightsTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle projectionWeightsTensor(tensorInfo3x4);
+    armnn::ScopedCpuTensorHandle projectionBiasTensor(tensorInfo3);
+
+    armnn::ScopedCpuTensorHandle inputLayerNormWeightsTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle forgetLayerNormWeightsTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle cellLayerNormWeightsTensor(tensorInfo4);
+    armnn::ScopedCpuTensorHandle outputLayerNormWeightsTensor(tensorInfo4);
+
+    AllocateAndCopyDataToITensorHandle(&inputToInputWeightsTensor, &inputToInputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToInputWeightsTensor, &recurrentToInputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&cellToInputWeightsTensor, &cellToInputWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&inputGateBiasTensor, &inputGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]);
+    AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&cellToForgetWeightsTensor, &cellToForgetWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&cellToOutputWeightsTensor, &cellToOutputWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&projectionWeightsTensor, &projectionWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&projectionBiasTensor, &projectionBias[0]);
+
+    AllocateAndCopyDataToITensorHandle(&inputLayerNormWeightsTensor, &inputLayerNormWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&forgetLayerNormWeightsTensor, &forgetLayerNormWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&cellLayerNormWeightsTensor, &cellLayerNormWeights[0]);
+    AllocateAndCopyDataToITensorHandle(&outputLayerNormWeightsTensor, &outputLayerNormWeights[0]);
+
+    data.m_InputToInputWeights = &inputToInputWeightsTensor;
+    data.m_InputToForgetWeights = &inputToForgetWeightsTensor;
+    data.m_InputToCellWeights = &inputToCellWeightsTensor;
+    data.m_InputToOutputWeights = &inputToOutputWeightsTensor;
+    data.m_RecurrentToInputWeights = &recurrentToInputWeightsTensor;
+    data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor;
+    data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor;
+    data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor;
+    data.m_CellToInputWeights = &cellToInputWeightsTensor;
+    data.m_InputGateBias = &inputGateBiasTensor;
+    data.m_ForgetGateBias = &forgetGateBiasTensor;
+    data.m_CellBias = &cellBiasTensor;
+    data.m_OutputGateBias = &outputGateBiasTensor;
+    data.m_CellToForgetWeights = &cellToForgetWeightsTensor;
+    data.m_CellToOutputWeights = &cellToOutputWeightsTensor;
+    data.m_ProjectionWeights = &projectionWeightsTensor;
+    data.m_ProjectionBias = &projectionBiasTensor;
+
+    data.m_InputLayerNormWeights = &inputLayerNormWeightsTensor;
+    data.m_ForgetLayerNormWeights = &forgetLayerNormWeightsTensor;
+    data.m_CellLayerNormWeights = &cellLayerNormWeightsTensor;
+    data.m_OutputLayerNormWeights = &outputLayerNormWeightsTensor;
+
+    // Flags to set test configuration
+    data.m_Parameters.m_ActivationFunc = 4;
+    data.m_Parameters.m_CifgEnabled = false;
+    data.m_Parameters.m_PeepholeEnabled = true;
+    data.m_Parameters.m_ProjectionEnabled = true;
+    data.m_Parameters.m_LayerNormEnabled = true;
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateLstm(data, info);
+    inputHandle->Allocate();
+    outputStateInHandle->Allocate();
+    cellStateInHandle->Allocate();
+
+    scratchHandle->Allocate();
+    outputStateOutHandle->Allocate();
+    cellStateOutHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+    CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]);
+    CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get());
+
+    return ret;
+}
+
+LayerTestResult<uint8_t, 2> QuantizedLstmTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const boost::multi_array<uint8_t, 2>& input,
+    const boost::multi_array<uint8_t, 2>& outputExpected)
+{
+    auto numBatches = boost::numeric_cast<unsigned int>(input.shape()[0]);
+    auto inputSize = boost::numeric_cast<unsigned int>(input.shape()[1]);
+    auto outputSize = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
+
+    // Scale/Offset for input/output, cellState In/Out, weights, bias
+    float inputOutputScale = 0.0078125f;
+    int32_t inputOutputOffset = 128;
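+    // 0.0078125 = 1/128, so with a zero point of 128 the 8-bit input/output values cover approximately [-1, 1).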
+
+    float cellStateScale = 0.00048828125f;
+    int32_t cellStateOffset = 0;
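+    // 0.00048828125 = 2^-11; the cell state is held as QSymm16 with 11 fractional bits.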
+
+    float weightsScale = 0.00408021f;
+    int32_t weightsOffset = 100;
+
+    float biasScale = 3.1876640625e-05f;
+    int32_t biasOffset = 0;
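+    // biasScale equals inputOutputScale * weightsScale (0.0078125 * 0.00408021 = 3.1876640625e-05), the usual
+    // convention for 32-bit quantized bias data.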
+
+    // Input/Output tensor info
+    armnn::TensorInfo inputInfo({numBatches, inputSize},
+                                 armnn::DataType::QuantisedAsymm8,
+                                 inputOutputScale,
+                                 inputOutputOffset);
+
+    armnn::TensorInfo cellStateInfo({numBatches, outputSize},
+                                     armnn::DataType::QuantisedSymm16,
+                                     cellStateScale,
+                                     cellStateOffset);
+
+    armnn::TensorInfo outputStateInfo({numBatches, outputSize},
+                                       armnn::DataType::QuantisedAsymm8,
+                                       inputOutputScale,
+                                       inputOutputOffset);
+
+    LayerTestResult<uint8_t, 2> ret(outputStateInfo);
+
+    // Input0
+    std::vector<uint8_t> inputVector;
+    inputVector.assign(input.data(), input.data() + (numBatches * inputSize));
+    auto inputTensor = MakeTensor<uint8_t, 2>(inputInfo, inputVector);
+
+    // Input1
+    std::vector<int16_t> cellStateInVector   = {876, 1034, 955, -909, 761, 1029, 796, -1036}; // 13: previous cell state
+    auto cellStateInTensor   = MakeTensor<int16_t, 2>(cellStateInfo, cellStateInVector);
+
+    // Input2
+    std::vector<uint8_t> outputStateInVector = {136, 150, 140, 115, 135, 152, 138, 112}; // 14: previous output state
+    auto outputStateInTensor = MakeTensor<uint8_t, 2>(outputStateInfo, outputStateInVector);
+
+    // Output0
+    std::vector<int16_t> cellStateOutVector  = {1485, 1177, 1373, -1023, 1019, 1355, 1097, -1235}; // 0: cell state out
+    auto cellStateOutTensor  = MakeTensor<int16_t, 2>(cellStateInfo, cellStateOutVector);
+
+    // Output1
+    std::vector<uint8_t> outputVector; // 1: output
+    outputVector.assign(outputExpected.data(), outputExpected.data() + (numBatches * outputSize));
+    ret.outputExpected = MakeTensor<uint8_t, 2>(outputStateInfo, outputVector);
+
+    // Create tensor handles
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputInfo);
+    std::unique_ptr<armnn::ITensorHandle> cellStateInHandle =
+            workloadFactory.CreateTensorHandle(cellStateInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputStateInHandle =
+            workloadFactory.CreateTensorHandle(outputStateInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle =
+            workloadFactory.CreateTensorHandle(cellStateInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputStateInfo);
+
+    armnn::QuantizedLstmQueueDescriptor data;
+    armnn::WorkloadInfo info;
+
+    // Add inputs and outputs to workload
+    AddInputToWorkload(data, info, inputInfo, inputHandle.get());
+    AddInputToWorkload(data, info, cellStateInfo, cellStateInHandle.get());
+    AddInputToWorkload(data, info, outputStateInfo, outputStateInHandle.get());
+
+    AddOutputToWorkload(data, info, cellStateInfo, cellStateOutHandle.get());
+    AddOutputToWorkload(data, info, outputStateInfo, outputHandle.get());
+
+    // Weights and bias tensor and quantization info
+    armnn::TensorInfo inputWeightsInfo({outputSize, inputSize},
+                                        armnn::DataType::QuantisedAsymm8,
+                                        weightsScale,
+                                        weightsOffset);
+
+    armnn::TensorInfo recurrentWeightsInfo({outputSize, outputSize},
+                                            armnn::DataType::QuantisedAsymm8,
+                                            weightsScale,
+                                            weightsOffset);
+
+    armnn::TensorInfo biasInfo({outputSize}, armnn::DataType::Signed32, biasScale, biasOffset);
+
+    // Weights and bias tensor data
+    auto inputToInputWeights  = MakeTensor<uint8_t, 2>(inputWeightsInfo, {146, 250, 235, 171, 10, 218, 171, 108});
+    auto inputToForgetWeights = MakeTensor<uint8_t, 2>(inputWeightsInfo, {24, 50, 132, 179, 158, 110, 3, 169});
+    auto inputToCellWeights   = MakeTensor<uint8_t, 2>(inputWeightsInfo, {133, 34, 29, 49, 206, 109, 54, 183});
+    auto inputToOutputWeights = MakeTensor<uint8_t, 2>(inputWeightsInfo, {195, 187, 11, 99, 109, 10, 218, 48});
+
+    auto recurrentToInputWeights  = MakeTensor<uint8_t, 2>(recurrentWeightsInfo,
+            {254, 206, 77, 168, 71, 20, 215, 6, 223, 7, 118, 225, 59, 130, 174, 26});
+    auto recurrentToForgetWeights = MakeTensor<uint8_t, 2>(recurrentWeightsInfo,
+            {137, 240, 103, 52, 68, 51, 237, 112, 0, 220, 89, 23, 69, 4, 207, 253});
+    auto recurrentToCellWeights   = MakeTensor<uint8_t, 2>(recurrentWeightsInfo,
+            {172, 60, 205, 65, 14, 0, 140, 168, 240, 223, 133, 56, 142, 64, 246, 216});
+    auto recurrentToOutputWeights = MakeTensor<uint8_t, 2>(recurrentWeightsInfo,
+            {106, 214, 67, 23, 59, 158, 45, 3, 119, 132, 49, 205, 129, 218, 11, 98});
+
+    auto inputGateBias  = MakeTensor<int32_t, 1>(biasInfo, {-7876, 13488, -726, 32839});
+    auto forgetGateBias = MakeTensor<int32_t, 1>(biasInfo, {9206, -46884, -11693, -38724});
+    auto cellBias       = MakeTensor<int32_t, 1>(biasInfo, {39481, 48624, 48976, -21419});
+    auto outputGateBias = MakeTensor<int32_t, 1>(biasInfo, {-58999, -17050, -41852, -40538});
+
+    // ScopedCpuTensorHandles
+    armnn::ScopedCpuTensorHandle inputToInputWeightsTensor(inputWeightsInfo);
+    armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(inputWeightsInfo);
+    armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(inputWeightsInfo);
+    armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(inputWeightsInfo);
+
+    armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(recurrentWeightsInfo);
+    armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(recurrentWeightsInfo);
+    armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(recurrentWeightsInfo);
+    armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(recurrentWeightsInfo);
+
+    armnn::ScopedCpuTensorHandle inputGateBiasTensor(biasInfo);
+    armnn::ScopedCpuTensorHandle forgetGateBiasTensor(biasInfo);
+    armnn::ScopedCpuTensorHandle cellBiasTensor(biasInfo);
+    armnn::ScopedCpuTensorHandle outputGateBiasTensor(biasInfo);
+
+    // Allocate and copy data
+    AllocateAndCopyDataToITensorHandle(&inputToInputWeightsTensor, &inputToInputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]);
+
+    AllocateAndCopyDataToITensorHandle(&recurrentToInputWeightsTensor, &recurrentToInputWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]);
+    AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]);
+
+    AllocateAndCopyDataToITensorHandle(&inputGateBiasTensor, &inputGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]);
+    AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]);
+    AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]);
+
+    // Setup queue descriptor
+    data.m_InputToInputWeights = &inputToInputWeightsTensor;
+    data.m_InputToForgetWeights = &inputToForgetWeightsTensor;
+    data.m_InputToCellWeights = &inputToCellWeightsTensor;
+    data.m_InputToOutputWeights = &inputToOutputWeightsTensor;
+
+    data.m_RecurrentToInputWeights = &recurrentToInputWeightsTensor;
+    data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor;
+    data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor;
+    data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor;
+
+    data.m_InputGateBias = &inputGateBiasTensor;
+    data.m_ForgetGateBias = &forgetGateBiasTensor;
+    data.m_CellBias = &cellBiasTensor;
+    data.m_OutputGateBias = &outputGateBiasTensor;
+
+    // Create workload and allocate tensor handles
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateQuantizedLstm(data, info);
+    inputHandle->Allocate();
+    outputStateInHandle->Allocate();
+    cellStateInHandle->Allocate();
+
+    cellStateOutHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+    CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]);
+    CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get());
+
+    return ret;
+}
+
+} // anonymous namespace
+
+#if defined(ARMNNREF_ENABLED)
+
+// The LSTM test units are run only for the reference backend at the moment
+
+void LstmUtilsZeroVectorTest()
+{
+    armnn::TensorInfo inputDesc({4}, armnn::DataType::Float32);
+    boost::multi_array<float, 1> input = MakeTensor<float, 1>(inputDesc, std::vector<float>(
+            {2., 3., 3., 4.}));
+
+    boost::multi_array<float, 1> expectedOutput = MakeTensor<float, 1>(inputDesc, std::vector<float>(
+            {0., 0., 0., 0.}));
+
+    return LstmUtilsZeroVectorTestImpl<armnn::DataType::Float32>(input, 4, expectedOutput);
+}
+
+void LstmUtilsMeanStddevNormalizationNoneZeroInputTest()
+{
+    uint32_t batchSize = 2;
+    uint32_t vecSize = 4;
+    armnn::TensorInfo inputDesc({batchSize, vecSize}, armnn::DataType::Float32);
+    boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+            { 0.1f, 0.2f, 0.3f, 0.4f,      //batch 0
+              0.9f, 1.0f, 1.1f, 1.2f }));  //batch 1
+
+    boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+            { -1.34164071f, -0.447213531f, 0.44721365f,  1.34164071f,      //batch 0
+              -1.34163153f, -0.447210163f, 0.447211236f, 1.3416326f  }));  //batch 1
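+
+    // Expected values are (x - mean) / stddev per batch. For batch 0: mean = 0.25,
+    // variance = (0.15^2 + 0.05^2 + 0.05^2 + 0.15^2) / 4 = 0.0125, stddev ~= 0.1118,
+    // so the first element is (0.1 - 0.25) / 0.1118 ~= -1.3416.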
+
+    return LstmUtilsMeanStddevNormalizationTestImpl<armnn::DataType::Float32>(input,
+            vecSize, batchSize, expectedOutput);
+}
+
+void LstmUtilsMeanStddevNormalizationAllZeroInputTest()
+{
+    uint32_t batchSize = 2;
+    uint32_t vecSize = 4;
+    armnn::TensorInfo inputDesc({batchSize, vecSize}, armnn::DataType::Float32);
+    boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+            { 0.0f, 0.0f, 0.0f, 0.0f,      //batch 0
+              0.0f, 0.0f, 0.0f, 0.0f }));  //batch 1
+
+    boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+            { 0.0f, 0.0f, 0.0f, 0.0f,      //batch 0
+              0.0f, 0.0f, 0.0f, 0.0f }));  //batch 1
+
+    return LstmUtilsMeanStddevNormalizationTestImpl<armnn::DataType::Float32>(input,
+            vecSize, batchSize, expectedOutput);
+}
+
+void LstmUtilsMeanStddevNormalizationMixedZeroInputTest()
+{
+    uint32_t batchSize = 2;
+    uint32_t vecSize = 4;
+    armnn::TensorInfo inputDesc({batchSize, vecSize}, armnn::DataType::Float32);
+    boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+            { 0.0f, 0.0f, 0.0f, 0.0f,      //batch 0
+              0.1f, 0.2f, 0.3f, 0.4f }));  //batch 1
+
+    boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+            {         0.0f,          0.0f,        0.0f,        0.0f,      //batch 0
+              -1.34164071f, -0.447213531f, 0.44721365f, 1.34164071f }));  //batch 1
+
+    return LstmUtilsMeanStddevNormalizationTestImpl<armnn::DataType::Float32>(input,
+            vecSize, batchSize, expectedOutput);
+}
+
+void LstmUtilsVectorBatchVectorCwiseProductTest()
+{
+    uint32_t batchSize = 4;
+    uint32_t vecSize = 29;
+    armnn::TensorInfo vecDesc({vecSize}, armnn::DataType::Float32);
+    boost::multi_array<float, 1> vector = MakeTensor<float, 1>(vecDesc, std::vector<float>(
+            {   1.1f,   2.2f,   3.3f,   4.4f,   5.5f,   6.6f,   7.7f,   8.8f,   9.9f, 10.1f,
+              11.11f, 12.12f, 13.13f, 14.14f, 15.15f, 16.16f, 17.17f, 18.18f, 19.19f, 20.2f,
+              21.21f, 22.22f, 23.23f, 24.24f, 25.25f, 26.26f, 27.27f, 28.28f,     0.0f}));
+
+    armnn::TensorInfo batchVecDesc({batchSize, vecSize}, armnn::DataType::Float32);
+    boost::multi_array<float, 2> batchVector = MakeTensor<float, 2>(batchVecDesc, std::vector<float>(
+            { /* batch 0 */
+                1.1f,   2.2f,   3.3f,   4.4f,   5.5f,   6.6f,   7.7f,   8.8f,   9.9f,  10.1f,
+              11.11f, 12.12f, 13.13f, 14.14f, 15.15f, 16.16f, 17.17f, 18.18f, 19.19f,  20.2f,
+              21.21f, 22.22f, 23.23f, 24.24f, 25.25f, 26.26f, 27.27f, 28.28f,   0.0f,
+              /* batch 1 */
+                -1.1f,   -2.2f,   -3.3f,   -4.4f,   -5.5f,   -6.6f,   -7.7f,   -8.8f,   -9.9f, -10.1f,
+              -11.11f, -12.12f, -13.13f, -14.14f, -15.15f, -16.16f, -17.17f, -18.18f, -19.19f, -20.2f,
+              -21.21f, -22.22f, -23.23f, -24.24f, -25.25f, -26.26f, -27.27f, -28.28f,    0.0f,
+              /* batch 2 */
+                1.1f,   -2.2f,   3.3f,   -4.4f,   5.5f,   -6.6f,   7.7f,   -8.8f,   9.9f, -10.1f,
+              11.11f, -12.12f, 13.13f, -14.14f, 15.15f, -16.16f, 17.17f, -18.18f, 19.19f, -20.2f,
+              21.21f, -22.22f, 23.23f, -24.24f, 25.25f, -26.26f, 27.27f, -28.28f,   0.0f,
+              /* batch 3 */
+                -1.1f,   2.2f,   -3.3f,   4.4f,   -5.5f,   6.6f,   -7.7f,   8.8f,   -9.9f, 10.1f,
+              -11.11f, 12.12f, -13.13f, 14.14f, -15.15f, 16.16f, -17.17f, 18.18f, -19.19f, 20.2f,
+              -21.21f, 22.22f, -23.23f, 24.24f, -25.25f, 26.26f, -27.27f, 28.28f,    0.0f}));
+
+    // Expect output[b] = vector * batchVector[b] (element-wise product),
+    // e.g. batch 0, element 0: 1.1f * 1.1f = 1.21f.
+    boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(batchVecDesc, std::vector<float>(
+            { /* batch 0 */
+                 1.210000f,    4.840000f,   10.889999f,   19.360001f,   30.250000f,   43.559998f,
+                59.289997f,   77.440002f,   98.009995f,  102.010010f,  123.432091f,  146.894394f,
+               172.396896f,  199.939606f,  229.522491f,  261.145599f,  294.808899f,  330.512421f,
+               368.256134f,  408.040039f,  449.864075f,  493.728363f,  539.632874f,  587.577576f,
+               637.562500f,  689.587585f,  743.652954f,  799.758423f,    0.000000f,
+              /* batch 1 */
+                -1.210000f,   -4.840000f,  -10.889999f,  -19.360001f,  -30.250000f,  -43.559998f,
+               -59.289997f,  -77.440002f,  -98.009995f, -102.010010f, -123.432091f, -146.894394f,
+              -172.396896f, -199.939606f, -229.522491f, -261.145599f, -294.808899f, -330.512421f,
+              -368.256134f, -408.040039f, -449.864075f, -493.728363f, -539.632874f, -587.577576f,
+              -637.562500f, -689.587585f, -743.652954f, -799.758423f,    0.000000f,
+              /* batch 2 */
+                 1.210000f,   -4.840000f,  10.889999f,   -19.360001f,   30.250000f,  -43.559998f,
+                59.289997f,  -77.440002f,  98.009995f,  -102.010010f,  123.432091f, -146.894394f,
+               172.396896f, -199.939606f, 229.522491f,  -261.145599f,  294.808899f, -330.512421f,
+               368.256134f, -408.040039f, 449.864075f,  -493.728363f,  539.632874f, -587.577576f,
+               637.562500f, -689.587585f, 743.652954f,  -799.758423f,    0.000000f,
+              /* batch 3 */
+                -1.210000f,    4.840000f,  -10.889999f,   19.360001f,  -30.250000f,   43.559998f,
+               -59.289997f,   77.440002f,  -98.009995f,  102.010010f, -123.432091f,  146.894394f,
+              -172.396896f,  199.939606f, -229.522491f,  261.145599f, -294.808899f,  330.512421f,
+              -368.256134f,  408.040039f, -449.864075f,  493.728363f, -539.632874f,  587.577576f,
+              -637.562500f,  689.587585f, -743.652954f,  799.758423f,    0.000000f}));
+
+    return LstmUtilsVectorBatchVectorCwiseProductTestImpl<armnn::DataType::Float32>(vector, batchVector,
+            vecSize, batchSize, expectedOutput);
+}
+
+void LstmUtilsVectorBatchVectorAddTest()
+{
+    uint32_t batchSize = 2;
+    uint32_t vecSize = 3;
+    armnn::TensorInfo vecDesc({vecSize}, armnn::DataType::Float32);
+    boost::multi_array<float, 1> vector = MakeTensor<float, 1>(vecDesc, std::vector<float>(
+            { 0.0f, -0.5f, 1.0f}));
+
+    armnn::TensorInfo batchVecDesc({batchSize, vecSize}, armnn::DataType::Float32);
+    boost::multi_array<float, 2> batchVector = MakeTensor<float, 2>(batchVecDesc, std::vector<float>(
+            { 1.0f, 2.0f, 3.0f,    //batch 0
+              4.0f, 5.0f, 6.0f})); //batch 1
+
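+    // Expect output[b] = vector + batchVector[b],
+    // e.g. batch 0: {0 + 1, -0.5 + 2, 1 + 3} = {1, 1.5, 4}.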
+    boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(batchVecDesc, std::vector<float>(
+            { 1.0f, 1.5f, 4.0f,
+              4.0f, 4.5f, 7.0f}));
+
+    return LstmUtilsVectorBatchVectorAddTestImpl<armnn::DataType::Float32>(vector, batchVector,
+            vecSize, batchSize, expectedOutput);
+}
+
+#endif
+
+LayerTestResult<float, 2> LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputDesc({ 2, 2 }, armnn::DataType::Float32);
+    boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+            { 2., 3., 3., 4. }));
+
+    armnn::TensorInfo outputDesc({ 2, 4 }, armnn::DataType::Float32);
+    boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(outputDesc, std::vector<float>(
+            {-0.36444446f, -0.00352185f, 0.12886585f, -0.05163646f,
+             -0.42734814f, -0.00478661f,  0.13455015f, -0.03560682f}));
+    return LstmLayerWithCifgWithPeepholeNoProjectionTestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, input, expectedOutput);
+}
+
+LayerTestResult<float, 2> LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputDesc({ 2, 5 }, armnn::DataType::Float32);
+    boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+            {0.787926f, 0.151646f, 0.071352f, 0.118426f, 0.458058f,
+             0.295743f, 0.544053f, 0.690064f, 0.858138f, 0.497181f}));
+
+    armnn::TensorInfo outputDesc({ 2, 16 }, armnn::DataType::Float32);
+    boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(outputDesc, std::vector<float>(
+            {-0.00396806f,  0.029352f,   -0.00279226f,  0.0159977f,  -0.00835576f,
+             -0.0211779f,   0.0283512f,  -0.0114597f,   0.00907307f, -0.0244004f,
+             -0.0152191f,  -0.0259063f,   0.00914318f,  0.00415118f,  0.017147f,
+              0.0134203f,  -0.013869f,    0.0287268f,  -0.00334693f,  0.00733398f, -0.0287926f,
+             -0.0186926f,   0.0193662f,  -0.0115437f,   0.00422612f, -0.0345232f,
+              0.00223253f, -0.00957321f,  0.0210624f,   0.013331f,    0.0150954f,   0.02168f}));
+    return LstmLayerNoCifgWithPeepholeWithProjectionTestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, input, expectedOutput);
+}
+
+LayerTestResult<float, 2> LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputDesc({2, 2}, armnn::DataType::Float32);
+    boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+            {2., 3., 3., 4.}));
+
+    armnn::TensorInfo outputDesc({2, 4}, armnn::DataType::Float32);
+    boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(outputDesc, std::vector<float>(
+            {-0.02973187f, 0.1229473f,  0.20885126f, -0.15358765f,
+             -0.0185422f,  0.11281417f, 0.24466537f, -0.1826292f}));
+
+    return LstmNoCifgNoPeepholeNoProjectionTestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, input, expectedOutput);
+}
+
+LayerTestResult<float, 2> LstmLayerFloat32NoCifgWithPeepholeWithProjectionWithLayerNormTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputDesc({ 2, 5 }, armnn::DataType::Float32);
+    boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>(
+            {0.7f, 0.8f, 0.1f, 0.2f, 0.3f,     //batch 0
+             0.3f, 0.2f, 0.9f, 0.8f, 0.1f}));  //batch 1
+
+    armnn::TensorInfo outputDesc({ 2, 3 }, armnn::DataType::Float32);
+    boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(outputDesc, std::vector<float>(
+            {  0.0244077f,  0.128027f, -0.00170918f,    //batch 0
+             -0.00692428f, 0.0848741f,    0.063445f})); //batch 1
+    return LstmLayerNoCifgWithPeepholeWithProjectionWithLayerNormTestImpl<armnn::DataType::Float32>(
+            workloadFactory, memoryManager, input, expectedOutput);
+}
+
+LayerTestResult<int16_t, 2> LstmLayerInt16NoCifgNoPeepholeNoProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const float qScale = 1.0f;
+    const int32_t qOffset = 0;
+
+    const armnn::DataType datatype = armnn::DataType::QuantisedSymm16;
+    const armnn::DataType constantDatatype = armnn::DataType::QuantisedAsymm8;
+
+    armnn::TensorInfo inputDesc({2, 2}, datatype);
+    boost::multi_array<int16_t, 2> input = MakeTensor<int16_t, 2>(inputDesc, QuantizedVector<int16_t>(
+            qScale, qOffset, std::vector<float>{2., 3., 3., 4.}));
+
+    armnn::TensorInfo outputDesc({2, 4}, datatype);
+    boost::multi_array<int16_t, 2> expectedOutput = MakeTensor<int16_t, 2>(outputDesc, QuantizedVector<int16_t>(qScale,
+            qOffset, std::vector<float>({-0.02973187f, 0.1229473f,  0.20885126f, -0.15358765f,
+                                         -0.0185422f,  0.11281417f, 0.24466537f, -0.1826292f})));
+
+    return LstmNoCifgNoPeepholeNoProjectionTestImpl<datatype>(
+        workloadFactory, memoryManager, input, expectedOutput, qScale, qOffset, constantDatatype);
+}
+
+LayerTestResult<int16_t, 2> LstmLayerInt16WithCifgWithPeepholeNoProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const float qScale = 1.0f;
+    const int32_t qOffset = 0;
+
+    const armnn::DataType datatype = armnn::DataType::QuantisedSymm16;
+    const armnn::DataType constantDatatype = armnn::DataType::QuantisedAsymm8;
+
+    armnn::TensorInfo inputDesc({ 2, 2 }, datatype);
+    boost::multi_array<int16_t, 2> input = MakeTensor<int16_t, 2>(inputDesc, QuantizedVector<int16_t>(qScale, qOffset,
+            std::vector<float>({ 2., 3., 3., 4. })));
+
+    armnn::TensorInfo outputDesc({ 2, 4 }, datatype);
+    boost::multi_array<int16_t, 2> expectedOutput = MakeTensor<int16_t, 2>(outputDesc, QuantizedVector<int16_t>(qScale,
+            qOffset, std::vector<float>(
+            {-0.36444446f, -0.00352185f, 0.12886585f, -0.05163646f,
+             -0.42734814f, -0.00478661f, 0.13455015f, -0.03560682f})));
+
+    return LstmLayerWithCifgWithPeepholeNoProjectionTestImpl<datatype>(
+        workloadFactory, memoryManager, input, expectedOutput, qScale, qOffset, constantDatatype);
+}
+
+LayerTestResult<int16_t, 2> LstmLayerInt16NoCifgWithPeepholeWithProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const float qScale = 2.0f;
+    const int32_t qOffset = 0;
+
+    const armnn::DataType datatype = armnn::DataType::QuantisedSymm16;
+    const armnn::DataType constantDatatype = armnn::DataType::QuantisedAsymm8;
+
+    armnn::TensorInfo inputDesc({ 2, 5 }, datatype);
+    boost::multi_array<int16_t, 2> input = MakeTensor<int16_t, 2>(inputDesc, QuantizedVector<int16_t>(qScale,
+            qOffset, std::vector<float>(
+            {0.787926f, 0.151646f, 0.071352f, 0.118426f, 0.458058f,
+             0.295743f, 0.544053f, 0.690064f, 0.858138f, 0.497181f})));
+
+    armnn::TensorInfo outputDesc({ 2, 16 }, datatype);
+    boost::multi_array<int16_t, 2> expectedOutput = MakeTensor<int16_t, 2>(outputDesc, QuantizedVector<int16_t>(qScale,
+            qOffset, std::vector<float>(
+            {-0.00396806f,  0.029352f,   -0.00279226f, 0.0159977f,  -0.00835576f,
+             -0.0211779f,   0.0283512f,  -0.0114597f,  0.00907307f, -0.0244004f,
+             -0.0152191f,  -0.0259063f,   0.00914318f, 0.00415118f,  0.017147f,
+              0.0134203f,  -0.013869f,    0.0287268f, -0.00334693f,  0.00733398f, -0.0287926f,
+             -0.0186926f,   0.0193662f,  -0.0115437f,  0.00422612f, -0.0345232f,
+              0.00223253f, -0.00957321f,  0.0210624f,  0.013331f,    0.0150954f,   0.02168f})));
+
+    return LstmLayerNoCifgWithPeepholeWithProjectionTestImpl<datatype>(
+        workloadFactory, memoryManager, input, expectedOutput, qScale, qOffset, constantDatatype);
+}
+
+LayerTestResult<int16_t, 2> LstmLayerInt16NoCifgNoPeepholeNoProjectionInt16ConstantTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const float qScale = 1.0f;
+    const int32_t qOffset = 0;
+
+    const armnn::DataType datatype = armnn::DataType::QuantisedSymm16; // datatype & constants set to QSymm16
+
+    armnn::TensorInfo inputDesc({2, 2}, datatype);
+    boost::multi_array<int16_t, 2> input = MakeTensor<int16_t, 2>(inputDesc, QuantizedVector<int16_t>(qScale,
+            qOffset, std::vector<float>{2., 3., 3., 4.}));
+
+    armnn::TensorInfo outputDesc({2, 4}, datatype);
+    boost::multi_array<int16_t, 2> expectedOutput = MakeTensor<int16_t, 2>(outputDesc, QuantizedVector<int16_t>(qScale,
+            qOffset, std::vector<float>({-0.02973187f, 0.1229473f,  0.20885126f, -0.15358765f,
+                                         -0.0185422f,  0.11281417f, 0.24466537f, -0.1826292f})));
+
+    return LstmNoCifgNoPeepholeNoProjectionTestImpl<datatype>(
+        workloadFactory, memoryManager, input, expectedOutput, qScale, qOffset, datatype);
+}
+
+//
+// QuantizedLstm
+//
+
+LayerTestResult<uint8_t, 2> QuantizedLstmTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputDesc({2, 2}, armnn::DataType::QuantisedAsymm8);
+    boost::multi_array<uint8_t, 2> input = MakeTensor<uint8_t, 2>(inputDesc, std::vector<uint8_t>(
+        {166, 179, 50, 150}));
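+    // These are raw QuantisedAsymm8 values; assuming the usual asymmetric scheme,
+    // they dequantize as real = scale * (quantized - offset), with the input
+    // scale/offset set inside QuantizedLstmTestImpl.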
+
+    armnn::TensorInfo outputDesc({2, 4}, armnn::DataType::QuantisedAsymm8);
+    boost::multi_array<uint8_t, 2> expectedOutput = MakeTensor<uint8_t, 2>(outputDesc, std::vector<uint8_t>(
+        {140, 151, 146, 112, 136, 156, 142, 112}));
+
+    return QuantizedLstmTestImpl(workloadFactory, memoryManager, input, expectedOutput);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/LstmTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/LstmTestImpl.hpp
new file mode 100644
index 0000000..2779009
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/LstmTestImpl.hpp
@@ -0,0 +1,60 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#if defined(ARMNNREF_ENABLED)
+void LstmUtilsZeroVectorTest();
+void LstmUtilsMeanStddevNormalizationNoneZeroInputTest();
+void LstmUtilsMeanStddevNormalizationAllZeroInputTest();
+void LstmUtilsMeanStddevNormalizationMixedZeroInputTest();
+void LstmUtilsVectorBatchVectorCwiseProductTest();
+void LstmUtilsVectorBatchVectorAddTest();
+#endif
+
+LayerTestResult<float, 2> LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> LstmLayerFloat32NoCifgWithPeepholeWithProjectionWithLayerNormTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 2> LstmLayerInt16NoCifgNoPeepholeNoProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 2> LstmLayerInt16WithCifgWithPeepholeNoProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 2> LstmLayerInt16NoCifgWithPeepholeWithProjectionTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 2> LstmLayerInt16NoCifgNoPeepholeNoProjectionInt16ConstantTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+//
+// QuantizedLstm
+//
+
+LayerTestResult<uint8_t, 2> QuantizedLstmTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/MeanTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/MeanTestImpl.hpp
new file mode 100644
index 0000000..d0bdfa4
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/MeanTestImpl.hpp
@@ -0,0 +1,178 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+namespace
+{
+
+template<armnn::DataType ArmnnType, typename T, std::size_t InputDim, std::size_t OutputDim>
+LayerTestResult<T, OutputDim> MeanTestHelper(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const unsigned int* inputShape,
+        const std::vector<float>& inputData,
+        const std::vector<unsigned int>& axis,
+        bool keepDims,
+        const unsigned int* outputShape,
+        const std::vector<float>& outputData,
+        float scale = 1.0f,
+        int32_t offset = 0)
+{
+    armnn::TensorInfo inputTensorInfo(InputDim, inputShape, ArmnnType);
+    armnn::TensorInfo outputTensorInfo(OutputDim, outputShape, ArmnnType);
+
+    inputTensorInfo.SetQuantizationScale(scale);
+    inputTensorInfo.SetQuantizationOffset(offset);
+
+    outputTensorInfo.SetQuantizationScale(scale);
+    outputTensorInfo.SetQuantizationOffset(offset);
+
+    auto input = MakeTensor<T, InputDim>(inputTensorInfo, ConvertToDataType<ArmnnType>(inputData, inputTensorInfo));
+
+    LayerTestResult<T, OutputDim> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, OutputDim>(
+            outputTensorInfo, ConvertToDataType<ArmnnType>(outputData, outputTensorInfo));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::MeanQueueDescriptor data;
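+    // m_Axis lists the dimensions to reduce over (an empty list reduces over all of
+    // them); m_KeepDims keeps each reduced dimension as size 1 instead of dropping it.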
+    data.m_Parameters.m_Axis = axis;
+    data.m_Parameters.m_KeepDims = keepDims;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data,  info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateMean(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.origin());
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(result.output.origin(), outputHandle.get());
+
+    return result;
+}
+
+} // anonymous namespace
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 1> MeanSimpleTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = { 3, 2 };
+    const unsigned int outputShape[] = { 1 };
+
+    std::vector<float> input({ 1.5f, 1.5f, 2.5f, 2.5f, 3.5f, 3.5f });
+    std::vector<float> output({ 2.5f });
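+    // Reducing over every dimension: (1.5 + 1.5 + 2.5 + 2.5 + 3.5 + 3.5) / 6 = 2.5.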
+
+    return MeanTestHelper<ArmnnType, T, 2, 1>(
+            workloadFactory, memoryManager, inputShape, input, {}, false, outputShape, output);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> MeanSimpleAxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = { 2, 3, 1, 2 };
+    const unsigned int outputShape[] = { 3, 1, 2 };
+
+    std::vector<float> input({ 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f });
+    std::vector<float> output({ 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f });
+
+    return MeanTestHelper<ArmnnType, T, 4, 3>(
+            workloadFactory, memoryManager, inputShape, input, { 0 }, false, outputShape, output);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> MeanKeepDimsTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = { 1, 1, 3, 2 };
+    const unsigned int outputShape[] = { 1, 1, 1, 2 };
+
+    std::vector<float> input({ 1.5f, 1.5f, 2.5f, 2.5f, 3.5f, 3.5f });
+    std::vector<float> output({ 2.5f, 2.5f });
+
+    return MeanTestHelper<ArmnnType, T, 4, 4>(
+            workloadFactory, memoryManager, inputShape, input, { 2 }, true, outputShape, output);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> MeanMultipleDimsTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = { 2, 3, 1, 2 };
+    const unsigned int outputShape[] = { 1, 3, 1, 1 };
+
+    std::vector<float> input({ 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f });
+    std::vector<float> output({ 2.0f, 4.0f, 6.0f });
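+    // Axes {0, 3} average over batch and width; e.g. the channel-0 output averages
+    // {1.5, 2.5, 1.5, 2.5} = 2.0.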
+
+    return MeanTestHelper<ArmnnType, T, 4, 4>(
+            workloadFactory, memoryManager, inputShape, input, { 0, 3 }, true, outputShape, output);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 1> MeanVts1Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = { 4, 3, 2 };
+    const unsigned int outputShape[] = { 2 };
+
+    std::vector<float> input({ 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f,
+                               15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f });
+    std::vector<float> output({ 12.0f, 13.0f });
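+    // Axes {0, 1} leave only the innermost dimension: index 0 averages the odd
+    // values 1..23 (mean 12), index 1 the even values 2..24 (mean 13).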
+
+    return MeanTestHelper<ArmnnType, T, 3, 1>(
+            workloadFactory, memoryManager, inputShape, input, { 0, 1 }, false, outputShape, output);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> MeanVts2Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = { 4, 3, 2 };
+    const unsigned int outputShape[] = { 1, 3, 1 };
+
+    std::vector<float> input({ 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f,
+                               15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f });
+    std::vector<float> output({ 10.5f, 12.5f, 14.5f });
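+    // Axes {0, 2} with keepDims: e.g. the first output averages
+    // {1, 2, 7, 8, 13, 14, 19, 20} = 10.5.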
+
+    return MeanTestHelper<ArmnnType, T, 3, 3>(
+            workloadFactory, memoryManager, inputShape, input, { 0, 2 }, true, outputShape, output);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> MeanVts3Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const unsigned int inputShape[] = { 1, 2, 2, 1 };
+    const unsigned int outputShape[] = { 1, 2, 1 };
+
+    std::vector<float> input({ 1.0f, 2.0f, 3.0f, 4.0f });
+    std::vector<float> output({ 1.5f, 3.5f });
+
+    return MeanTestHelper<ArmnnType, T, 4, 3>(
+            workloadFactory, memoryManager, inputShape, input, { 2 }, false, outputShape, output);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/NormalizationTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/NormalizationTestImpl.cpp
new file mode 100644
index 0000000..f65960f
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/NormalizationTestImpl.cpp
@@ -0,0 +1,392 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NormalizationTestImpl.hpp"
+
+#include <armnn/Exceptions.hpp>
+#include <armnn/LayerSupport.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+LayerTestResult<float,4> SimpleNormalizationTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::NormalizationAlgorithmChannel normChannel,
+    armnn::NormalizationAlgorithmMethod normMethod)
+{
+    const unsigned int inputHeight = 2;
+    const unsigned int inputWidth = 2;
+    const unsigned int inputChannels = 1;
+    const unsigned int inputNum = 2;
+
+    unsigned int outputHeight = inputHeight;
+    unsigned int outputWidth = inputWidth;
+    unsigned int outputChannels = inputChannels;
+    unsigned int outputNum = inputNum;
+
+    unsigned int inputShape[] = { inputNum, inputChannels, inputHeight, inputWidth };
+    unsigned int outputShape[] = { outputNum, outputChannels, outputHeight, outputWidth };
+
+    auto inputTensorInfo = armnn::TensorInfo(4, inputShape, armnn::DataType::Float32);
+    auto outputTensorInfo = armnn::TensorInfo(4, outputShape, armnn::DataType::Float32);
+
+    LayerTestResult<float,4> ret(outputTensorInfo);
+
+    auto input = MakeTensor<float, 4>(inputTensorInfo, std::vector<float>({
+        // Batch #0
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        // Batch #1
+        5.0f, 6.0f,
+        7.0f, 8.0f
+    }));
+
+    float alpha = 1.f;
+    float beta = 1.f;
+    float kappa = 1.f;
+    uint32_t normSize = 3;
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::NormalizationQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Parameters.m_NormChannelType = normChannel;
+    data.m_Parameters.m_NormMethodType = normMethod;
+    data.m_Parameters.m_NormSize = normSize;
+    data.m_Parameters.m_Alpha = alpha;
+    data.m_Parameters.m_Beta = beta;
+    data.m_Parameters.m_K = kappa;
+    data.m_Parameters.m_DataLayout = armnn::DataLayout::NCHW;
+
+    armnn::PassthroughCpuTensorHandle refHandle(outputTensorInfo, &ret.outputExpected[0][0][0][0]);
+    armnn::NormalizationQueueDescriptor refData = data;
+    armnn::WorkloadInfo refInfo = info;
+    SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, &refHandle);
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateNormalization(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    switch (normMethod)
+    {
+        case armnn::NormalizationAlgorithmMethod::LocalBrightness:
+        {
+            switch (normChannel)
+            {
+                case armnn::NormalizationAlgorithmChannel::Within:
+                {
+                    // When normalising within channels, the 3x3 kernel covers the entire 2x2 input at every index.
+                    // Therefore, all output values should equal the inputs, but divided by:
+                    // pow((kappa + (accumulatedScale * alpha)), beta)
+                    // ...where accumulatedScale is the sum of every element squared.
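+                    // For example, batch 0 gives accumulatedScale = 1 + 4 + 9 + 16 = 30,
+                    // so the divisor is pow(1 + 30 * 1, 1) = 31.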
+                    float divisor[inputNum];
+                    for (int i = 0; i < boost::numeric_cast<int>(inputNum); i++)
+                    {
+                        float accumulatedScale = input[i][0][0][0]*input[i][0][0][0] +
+                                                 input[i][0][0][1]*input[i][0][0][1] +
+                                                 input[i][0][1][0]*input[i][0][1][0] +
+                                                 input[i][0][1][1]*input[i][0][1][1];
+                        divisor[i] = powf((kappa + accumulatedScale * alpha), beta);
+                    }
+                    ret.outputExpected = MakeTensor<float, 4>(outputTensorInfo,
+                                                              std::vector<float>({input[0][0][0][0]/divisor[0],
+                                                                                  input[0][0][0][1]/divisor[0],
+                                                                                  input[0][0][1][0]/divisor[0],
+                                                                                  input[0][0][1][1]/divisor[0],
+                                                                                  input[1][0][0][0]/divisor[1],
+                                                                                  input[1][0][0][1]/divisor[1],
+                                                                                  input[1][0][1][0]/divisor[1],
+                                                                                  input[1][0][1][1]/divisor[1]}));
+                    break;
+                }
+                case armnn::NormalizationAlgorithmChannel::Across:
+                {
+                    // When normalising across channels, all output values should equal the inputs, but multiplied by:
+                    // pow((kappa + (accumulatedScale * alpha)), -beta)
+                    // ...where accumulatedScale is the sum of the inputs for adjacent channels for this element squared
+                    // ...where adjacent channels means within half the normSize for the channel
+                    // The test data has only one channel, so this is simplified below.
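+                    // With one channel, accumulatedScale = x^2 for each element, so every
+                    // output is x * pow(1 + x^2, -1); e.g. x = 2 gives 2 / 5 = 0.4.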
+                    std::vector<float> outputVector;
+                    for (int n = 0; n < boost::numeric_cast<int>(inputNum); ++n)
+                    {
+                        for (int h = 0; h < boost::numeric_cast<int>(inputHeight); ++h)
+                        {
+                            for (int w = 0; w < boost::numeric_cast<int>(inputWidth); ++w)
+                            {
+                                float accumulatedScale = input[n][0][h][w]*input[n][0][h][w];
+                                float scale = powf((kappa + accumulatedScale * alpha), -beta);
+                                outputVector.push_back(input[n][0][h][w] * scale);
+                            }
+                        }
+                    }
+                    ret.outputExpected = MakeTensor<float, 4>(outputTensorInfo, outputVector);
+                    break;
+                }
+                default:
+                {
+                    throw armnn::UnimplementedException("Unsupported normalisation channel type, "
+                                                        "only Across and Within are supported");
+                }
+            }
+            break;
+        }
+        case armnn::NormalizationAlgorithmMethod::LocalContrast: // NOTE: intentional fallthrough.
+        default:
+        {
+            throw armnn::UnimplementedException("Unsupported normalisation method type, "
+                                                "only LocalBrightness is supported");
+        }
+    }
+
+    return ret;
+}
+
+LayerTestResult<float,4> SimpleNormalizationNhwcTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::NormalizationAlgorithmChannel normChannel,
+    armnn::NormalizationAlgorithmMethod normMethod)
+{
+    const unsigned int inputHeight = 2;
+    const unsigned int inputWidth = 2;
+    const unsigned int inputChannels = 1;
+    const unsigned int inputNum = 2;
+
+    unsigned int outputHeight = inputHeight;
+    unsigned int outputWidth = inputWidth;
+    unsigned int outputChannels = inputChannels;
+    unsigned int outputNum = inputNum;
+
+    unsigned int inputShape[] = { inputNum, inputHeight, inputWidth, inputChannels };
+    unsigned int outputShape[] = { outputNum, outputHeight, outputWidth, outputChannels };
+
+    auto inputTensorInfo = armnn::TensorInfo(4, inputShape, armnn::DataType::Float32);
+    auto outputTensorInfo = armnn::TensorInfo(4, outputShape, armnn::DataType::Float32);
+
+    LayerTestResult<float,4> ret(outputTensorInfo);
+
+    auto input = MakeTensor<float, 4>(inputTensorInfo, std::vector<float>({
+        // Batch #0
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+        // Batch #1
+        5.0f, 6.0f,
+        7.0f, 8.0f
+    }));
+
+    float alpha = 1.f;
+    float beta = 1.f;
+    float kappa = 1.f;
+    uint32_t normSize = 3;
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::NormalizationQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Parameters.m_NormChannelType = normChannel;
+    data.m_Parameters.m_NormMethodType = normMethod;
+    data.m_Parameters.m_NormSize = normSize;
+    data.m_Parameters.m_Alpha = alpha;
+    data.m_Parameters.m_Beta = beta;
+    data.m_Parameters.m_K = kappa;
+    data.m_Parameters.m_DataLayout = armnn::DataLayout::NHWC;
+
+    armnn::PassthroughCpuTensorHandle refHandle(outputTensorInfo, &ret.outputExpected[0][0][0][0]);
+    armnn::NormalizationQueueDescriptor refData = data;
+    armnn::WorkloadInfo refInfo = info;
+    SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, &refHandle);
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateNormalization(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    switch (normMethod)
+    {
+        case armnn::NormalizationAlgorithmMethod::LocalBrightness:
+        {
+            switch (normChannel)
+            {
+                case armnn::NormalizationAlgorithmChannel::Across:
+                {
+                    std::vector<float> expectedOutput{ 0.5f, 0.400000006f, 0.300000012f, 0.235294119f,
+                                                       0.192307696f, 0.16216217f, 0.140000001f, 0.123076923f };
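+                    // Each value is x / (1 + x^2) for x = 1..8, e.g. 2 / 5 = 0.4.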
+                    ret.outputExpected = MakeTensor<float, 4>(outputTensorInfo, expectedOutput);
+                    break;
+                }
+                default:
+                {
+                    throw armnn::UnimplementedException("Unsupported normalisation channel type, "
+                                                        "Only Cross-map is supported for NHWC layout");
+                }
+            }
+            break;
+        }
+        case armnn::NormalizationAlgorithmMethod::LocalContrast: // NOTE: intentional fallthrough.
+        default:
+        {
+            throw armnn::UnimplementedException("Unsupported normalisation method type, "
+                                                "only LocalBrightness is supported");
+        }
+    }
+
+    return ret;
+}
+
+LayerTestResult<float,4> CompareNormalizationTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::NormalizationAlgorithmChannel normChannel,
+    armnn::NormalizationAlgorithmMethod normMethod)
+{
+    constexpr unsigned int inputNum = 5;
+    constexpr unsigned int inputChannels = 3;
+    constexpr unsigned int inputHeight = 32;
+    constexpr unsigned int inputWidth = 24;
+
+    constexpr unsigned int outputNum = inputNum;
+    constexpr unsigned int outputChannels = inputChannels;
+    constexpr unsigned int outputHeight = inputHeight;
+    constexpr unsigned int outputWidth = inputWidth;
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = {inputNum, inputChannels, inputHeight, inputWidth};
+    unsigned int outputShape[] = {outputNum, outputChannels, outputHeight, outputWidth};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, armnn::DataType::Float32);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, armnn::DataType::Float32);
+
+    LayerTestResult<float,4> ret(outputTensorInfo);
+
+    auto input = MakeRandomTensor<float, 4>(inputTensorInfo, 111234);
+
+    constexpr float alpha = 1.f;
+    constexpr float beta = 1.f;
+    constexpr float kappa = 1.f;
+    constexpr uint32_t normSize = 5;
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::NormalizationQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Parameters.m_NormChannelType = normChannel;
+    data.m_Parameters.m_NormMethodType  = normMethod;
+    data.m_Parameters.m_NormSize        = normSize;
+    data.m_Parameters.m_Alpha           = alpha;
+    data.m_Parameters.m_Beta            = beta;
+    data.m_Parameters.m_K               = kappa;
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandleRef = refWorkloadFactory.CreateTensorHandle(outputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> inputHandleRef = refWorkloadFactory.CreateTensorHandle(inputTensorInfo);
+
+    armnn::NormalizationQueueDescriptor refData = data;
+    armnn::WorkloadInfo refInfo = info;
+    SetWorkloadInput(refData, refInfo, 0, inputTensorInfo, inputHandleRef.get());
+    SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, outputHandleRef.get());
+
+    // Don't execute if Normalization is not supported for the method and channel types, as an exception will be raised.
+    armnn::BackendId backend = workloadFactory.GetBackendId();
+    const size_t reasonIfUnsupportedMaxLen = 255;
+    char reasonIfUnsupported[reasonIfUnsupportedMaxLen+1];
+    ret.supported = armnn::IsNormalizationSupported(backend, inputTensorInfo, outputTensorInfo, data.m_Parameters,
+                                                    reasonIfUnsupported, reasonIfUnsupportedMaxLen);
+    if (!ret.supported)
+    {
+        return ret;
+    }
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateNormalization(data, info);
+    std::unique_ptr<armnn::IWorkload> workloadRef = refWorkloadFactory.CreateNormalization(refData, refInfo);
+
+    outputHandleRef->Allocate();
+    inputHandleRef->Allocate();
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+    CopyDataToITensorHandle(inputHandleRef.get(), &input[0][0][0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    workloadRef->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+    CopyDataFromITensorHandle(&ret.outputExpected[0][0][0][0], outputHandleRef.get());
+
+    return ret;
+}
+
+} // anonymous namespace
+
+LayerTestResult<float,4> SimpleNormalizationAcrossTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    auto normMethod = armnn::NormalizationAlgorithmMethod::LocalBrightness;
+    auto normChannel = armnn::NormalizationAlgorithmChannel::Across;
+    return SimpleNormalizationTestImpl(workloadFactory, memoryManager, normChannel, normMethod);
+}
+
+LayerTestResult<float,4> SimpleNormalizationWithinTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    auto normMethod = armnn::NormalizationAlgorithmMethod::LocalBrightness;
+    auto normChannel = armnn::NormalizationAlgorithmChannel::Within;
+    return SimpleNormalizationTestImpl(workloadFactory, memoryManager, normChannel, normMethod);
+}
+
+LayerTestResult<float,4> SimpleNormalizationAcrossNhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    auto normMethod = armnn::NormalizationAlgorithmMethod::LocalBrightness;
+    auto normChannel = armnn::NormalizationAlgorithmChannel::Across;
+    return SimpleNormalizationNhwcTestImpl(workloadFactory, memoryManager, normChannel, normMethod);
+}
+
+LayerTestResult<float,4> CompareNormalizationTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::NormalizationAlgorithmChannel normChannel,
+    armnn::NormalizationAlgorithmMethod normMethod)
+{
+    return CompareNormalizationTestImpl(workloadFactory, memoryManager, refWorkloadFactory, normChannel, normMethod);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/NormalizationTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/NormalizationTestImpl.hpp
new file mode 100644
index 0000000..be66f6c
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/NormalizationTestImpl.hpp
@@ -0,0 +1,32 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <armnn/Types.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 4> SimpleNormalizationAcrossTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SimpleNormalizationWithinTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SimpleNormalizationAcrossNhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> CompareNormalizationTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::NormalizationAlgorithmChannel normChannel,
+    armnn::NormalizationAlgorithmMethod normMethod);
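+
+// Typical usage from a backend unit test (a sketch; the concrete workload factory
+// and memory manager depend on the backend under test):
+//
+//     armnn::RefWorkloadFactory workloadFactory;
+//     LayerTestResult<float, 4> result = SimpleNormalizationAcrossTest(workloadFactory, nullptr);
+//     BOOST_TEST(CompareTensors(result.output, result.outputExpected));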
diff --git a/src/backends/backendsCommon/test/layerTests/PadTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/PadTestImpl.cpp
new file mode 100644
index 0000000..82b772e
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/PadTestImpl.cpp
@@ -0,0 +1,497 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "PadTestImpl.hpp"
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+//
+// Implementation templates
+//
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 2> Pad2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    const float customPaddingValue)
+{
+    const armnn::TensorShape inputShape{ 3, 3 };
+    const armnn::TensorShape outputShape{ 7, 7 };
+
+    const armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType, qScale, qOffset);
+    const armnn::TensorInfo outputTensorInfo(outputShape, ArmnnType, qScale, qOffset);
+
+    std::vector<T> inputValues(
+    QuantizedVector<T>(qScale, qOffset,
+    {
+      // Height (3) x Width (3)
+      4, 8, 6,
+      7, 4, 4,
+      3, 2, 4
+    }));
+
+    const float p = customPaddingValue;
+    std::vector<T> expectedOutputValues(
+    QuantizedVector<T>(qScale, qOffset,
+    {
+      p, p, p, p, p, p, p,
+      p, p, p, p, p, p, p,
+      p, p, 4, 8, 6, p, p,
+      p, p, 7, 4, 4, p, p,
+      p, p, 3, 2, 4, p, p,
+      p, p, p, p, p, p, p,
+      p, p, p, p, p, p, p
+    }));
+
+    auto inputTensor = MakeTensor<T, 2>(inputTensorInfo, std::vector<T>(inputValues));
+
+    LayerTestResult<T, 2> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 2>(outputTensorInfo, std::vector<T>(expectedOutputValues));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::PadQueueDescriptor descriptor;
+
+    std::vector<std::pair<unsigned int, unsigned int>> padList;
+    padList.push_back(std::pair<unsigned int, unsigned int>(2,2));
+    padList.push_back(std::pair<unsigned int, unsigned int>(2,2));
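+    // Each pair is (padding before, padding after) for one dimension:
+    // height and width both become 2 + 3 + 2 = 7, matching outputShape.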
+
+    descriptor.m_Parameters.m_PadList = padList;
+    descriptor.m_Parameters.m_PadValue = customPaddingValue;
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreatePad(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0], outputHandle.get());
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 3> Pad3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    const armnn::TensorShape inputShape{ 2, 2, 2 };
+    const armnn::TensorShape outputShape{ 3, 5, 6 };
+
+    const armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType, qScale, qOffset);
+    const armnn::TensorInfo outputTensorInfo(outputShape, ArmnnType, qScale, qOffset);
+
+    std::vector<T> inputValues(
+      QuantizedVector<T>(qScale, qOffset,
+    {
+        // Channel 0, Height (2) x Width (2)
+        0, 4,
+        2, 5,
+
+        // Channel 1, Height (2) x Width (2)
+        6, 1,
+        5, 2
+    }));
+
+    std::vector<T> expectedOutputValues(
+      QuantizedVector<T>(qScale, qOffset,
+    {
+        0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0,
+        0, 0, 0, 4, 0, 0,
+        0, 0, 2, 5, 0, 0,
+        0, 0, 0, 0, 0, 0,
+
+        0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0,
+        0, 0, 6, 1, 0, 0,
+        0, 0, 5, 2, 0, 0,
+        0, 0, 0, 0, 0, 0,
+
+        0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0,
+        0, 0, 0, 0, 0, 0
+    }));
+
+    auto inputTensor = MakeTensor<T, 3>(inputTensorInfo, std::vector<T>(inputValues));
+
+    LayerTestResult<T, 3> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 3>(outputTensorInfo, std::vector<T>(expectedOutputValues));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::PadQueueDescriptor descriptor;
+
+    std::vector<std::pair<unsigned int, unsigned int>> padList;
+    padList.push_back(std::pair<unsigned int, unsigned int>(0,1));
+    padList.push_back(std::pair<unsigned int, unsigned int>(2,1));
+    padList.push_back(std::pair<unsigned int, unsigned int>(2,2));
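+    // Each pair is (padding before, padding after): the 2x2x2 input becomes
+    // (0+2+1) x (2+2+1) x (2+2+2) = 3 x 5 x 6, matching outputShape.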
+
+    descriptor.m_Parameters.m_PadList = padList;
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreatePad(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0], outputHandle.get());
+
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 4> Pad4dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset)
+{
+    const armnn::TensorShape inputShape{ 2, 2, 3, 2 };
+    const armnn::TensorShape outputShape{ 4, 5, 7, 4 };
+
+    const armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType, qScale, qOffset);
+    const armnn::TensorInfo outputTensorInfo(outputShape, ArmnnType, qScale, qOffset);
+
+    std::vector<T> inputValues(
+      QuantizedVector<T>(qScale, qOffset,
+    {
+        // Batch 0, Channel 0, Height (3) x Width (2)
+        0, 1,
+        2, 3,
+        4, 5,
+
+        // Batch 0, Channel 1, Height (3) x Width (2)
+        6, 7,
+        8, 9,
+        10, 11,
+
+        // Batch 1, Channel 0, Height (3) x Width (2)
+        12, 13,
+        14, 15,
+        16, 17,
+
+        // Batch 1, Channel 1, Height (3) x Width (2)
+        18, 19,
+        20, 21,
+        22, 23
+    }));
+
+    std::vector<T> expectedOutputValues(
+      QuantizedVector<T>(qScale, qOffset,
+    {
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 1, 0,
+        0, 2, 3, 0,
+        0, 4, 5, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 6, 7, 0,
+        0, 8, 9, 0,
+        0, 10, 11, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 12, 13, 0,
+        0, 14, 15, 0,
+        0, 16, 17, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 18, 19, 0,
+        0, 20, 21, 0,
+        0, 22, 23, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0,
+        0, 0, 0, 0
+    }));
+
+    auto inputTensor = MakeTensor<T, 4>(inputTensorInfo, std::vector<T>(inputValues));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, std::vector<T>(expectedOutputValues));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::PadQueueDescriptor descriptor;
+
+    std::vector<std::pair<unsigned int, unsigned int>> padList;
+    padList.push_back(std::pair<unsigned int, unsigned int>(1,1));
+    padList.push_back(std::pair<unsigned int, unsigned int>(2,1));
+    padList.push_back(std::pair<unsigned int, unsigned int>(3,1));
+    padList.push_back(std::pair<unsigned int, unsigned int>(1,1));
+
+    descriptor.m_Parameters.m_PadList = padList;
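+    // For reference: applying the (before, after) pairs to the input shape { 2, 2, 3, 2 }
+    // gives { 2+1+1, 2+2+1, 3+3+1, 2+1+1 } = { 4, 5, 7, 4 }, matching outputShape above.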
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreatePad(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+
+    return result;
+}
+
+//
+// Explicit template specializations
+//
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 2>
+Pad2dTestCommon<armnn::DataType::QuantisedSymm16>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    const float customPaddingValue);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 3>
+Pad3dTestCommon<armnn::DataType::QuantisedSymm16>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 4>
+Pad4dTestCommon<armnn::DataType::QuantisedSymm16>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset);
+
+//
+// Implementation functions
+//
+
+LayerTestResult<uint8_t, 2> PadUint82dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Pad2dTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 1.0f, 0);
+}
+
+LayerTestResult<uint8_t, 2> PadUint82dCustomPaddingTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Pad2dTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 1.0f, 0, 1.0f);
+}
+
+LayerTestResult<uint8_t, 3> PadUint83dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Pad3dTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 1.0f, 0);
+}
+
+LayerTestResult<uint8_t, 4> PadUint84dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Pad4dTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 1.0f, 0);
+}
+
+LayerTestResult<float, 2> PadFloat322dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Pad2dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 2> PadFloat322dCustomPaddingTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Pad2dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0, 1.0f);
+}
+
+LayerTestResult<float, 3> PadFloat323dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Pad3dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<float, 4> PadFloat324dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Pad4dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/PadTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/PadTestImpl.hpp
new file mode 100644
index 0000000..156b861
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/PadTestImpl.hpp
@@ -0,0 +1,69 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/Types.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> Pad2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset,
+    const float customPaddingValue = 0.0f);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Pad3dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Pad4dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale,
+    int32_t qOffset);
+
+LayerTestResult<uint8_t, 2> PadUint82dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 2> PadUint82dCustomPaddingTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> PadUint83dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> PadUint84dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> PadFloat322dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> PadFloat322dCustomPaddingTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 3> PadFloat323dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> PadFloat324dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/PermuteTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/PermuteTestImpl.hpp
new file mode 100644
index 0000000..ef48c97
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/PermuteTestImpl.hpp
@@ -0,0 +1,241 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+template<typename T>
+LayerTestResult<T, 4> SimplePermuteTestImpl(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        armnn::PermuteDescriptor descriptor,
+        armnn::TensorInfo inputTensorInfo,
+        armnn::TensorInfo outputTensorInfo,
+        const std::vector<T>& inputData,
+        const std::vector<T>& outputExpectedData)
+{
+    auto input = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputExpectedData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::PermuteQueueDescriptor data;
+    data.m_Parameters = descriptor;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreatePermute(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimplePermuteTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = { 1, 2, 2, 2 };
+    unsigned int outputShape[] = { 1, 2, 2, 2 };
+
+    armnn::PermuteDescriptor descriptor;
+    descriptor.m_DimMappings = {0U, 3U, 1U, 2U};
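+    // In a PermuteDescriptor, m_DimMappings[i] gives the position that source
+    // dimension i takes in the destination tensor; here dimension 1 moves to
+    // position 3, dimension 2 to position 1 and dimension 3 to position 2.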
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.5f);
+        inputTensorInfo.SetQuantizationOffset(5);
+        outputTensorInfo.SetQuantizationScale(0.5f);
+        outputTensorInfo.SetQuantizationOffset(5);
+    }
+
+    std::vector<T> input = std::vector<T>(
+    {
+        1, 2,
+        3, 4,
+        5, 6,
+        7, 8
+    });
+
+    std::vector<T> outputExpected = std::vector<T>(
+    {
+        1, 5, 2, 6,
+        3, 7, 4, 8
+    });
+
+    return SimplePermuteTestImpl<T>(workloadFactory, memoryManager,
+                                    descriptor, inputTensorInfo,
+                                    outputTensorInfo, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> PermuteValueSet1Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = { 1, 2, 2, 3 };
+    unsigned int outputShape[] = { 1, 3, 2, 2 };
+
+    armnn::PermuteDescriptor descriptor;
+    descriptor.m_DimMappings = {0U, 2U, 3U, 1U};
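+    // With this mapping, destination dimension m_DimMappings[i] takes the size of
+    // source dimension i, so the input shape { 1, 2, 2, 3 } is rearranged into
+    // { 1, 3, 2, 2 } as per outputShape above.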
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.5f);
+        inputTensorInfo.SetQuantizationOffset(5);
+        outputTensorInfo.SetQuantizationScale(0.5f);
+        outputTensorInfo.SetQuantizationOffset(5);
+    }
+
+    std::vector<T> input = std::vector<T>(
+    {
+         1,  2,  3,
+        11, 12, 13,
+        21, 22, 23,
+        31, 32, 33
+    });
+
+    std::vector<T> outputExpected = std::vector<T>(
+    {
+        1, 11, 21, 31,
+        2, 12, 22, 32,
+        3, 13, 23, 33
+    });
+
+    return SimplePermuteTestImpl<T>(workloadFactory, memoryManager,
+                                    descriptor, inputTensorInfo,
+                                    outputTensorInfo, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> PermuteValueSet2Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = { 1, 3, 2, 2 };
+    unsigned int outputShape[] = { 1, 2, 2, 3 };
+
+    armnn::PermuteDescriptor descriptor;
+    descriptor.m_DimMappings = {0U, 3U, 1U, 2U};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.5f);
+        inputTensorInfo.SetQuantizationOffset(5);
+        outputTensorInfo.SetQuantizationScale(0.5f);
+        outputTensorInfo.SetQuantizationOffset(5);
+    }
+
+    std::vector<T> input = std::vector<T>(
+    {
+        1, 11, 21, 31,
+        2, 12, 22, 32,
+        3, 13, 23, 33
+    });
+
+    std::vector<T> outputExpected = std::vector<T>(
+    {
+         1,  2,  3,
+        11, 12, 13,
+        21, 22, 23,
+        31, 32, 33,
+    });
+
+    return SimplePermuteTestImpl<T>(workloadFactory, memoryManager,
+                                    descriptor, inputTensorInfo,
+                                    outputTensorInfo, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> PermuteValueSet3Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = { 1, 2, 3, 3 };
+    unsigned int outputShape[] = { 1, 3, 2, 3 };
+
+    armnn::PermuteDescriptor descriptor;
+    descriptor.m_DimMappings = {0U, 2U, 3U, 1U};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.5f);
+        inputTensorInfo.SetQuantizationOffset(5);
+        outputTensorInfo.SetQuantizationScale(0.5f);
+        outputTensorInfo.SetQuantizationOffset(5);
+    }
+
+    std::vector<T> input = std::vector<T>(
+    {
+         1,  2,  3,
+        11, 12, 13,
+        21, 22, 23,
+        31, 32, 33,
+        41, 42, 43,
+        51, 52, 53
+    });
+
+    std::vector<T> outputExpected = std::vector<T>(
+    {
+        1, 11, 21, 31, 41, 51,
+        2, 12, 22, 32, 42, 52,
+        3, 13, 23, 33, 43, 53
+    });
+
+    return SimplePermuteTestImpl<T>(workloadFactory, memoryManager,
+                                    descriptor, inputTensorInfo,
+                                    outputTensorInfo, input, outputExpected);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/Pooling2dTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/Pooling2dTestImpl.cpp
new file mode 100644
index 0000000..f250fa5
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/Pooling2dTestImpl.cpp
@@ -0,0 +1,1784 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "Pooling2dTestImpl.hpp"
+
+#include <armnn/LayerSupport.hpp>
+
+#include <DataLayoutIndexed.hpp>
+#include <Permute.hpp>
+#include <ResolveType.hpp>
+#include <TensorUtils.hpp>
+
+#include <backendsCommon/WorkloadInfo.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <boost/numeric/conversion/cast.hpp>
+
+namespace
+{
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimplePooling2dTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::Pooling2dDescriptor descriptor,
+    float qScale,
+    int32_t qOffset,
+    const boost::multi_array<T, 4>& input,
+    const boost::multi_array<T, 4>& outputExpected)
+{
+    const armnn::DataLayout dataLayout = descriptor.m_DataLayout;
+    const armnnUtils::DataLayoutIndexed dimensionIndices = dataLayout;
+    auto heightIndex = dimensionIndices.GetHeightIndex();
+    auto widthIndex = dimensionIndices.GetWidthIndex();
+    auto channelsIndex = dimensionIndices.GetChannelsIndex();
+
+    unsigned int inputHeight     = boost::numeric_cast<unsigned int>(input.shape()[heightIndex]);
+    unsigned int inputWidth      = boost::numeric_cast<unsigned int>(input.shape()[widthIndex]);
+    unsigned int inputChannels   = boost::numeric_cast<unsigned int>(input.shape()[channelsIndex]);
+    unsigned int inputBatchSize  = boost::numeric_cast<unsigned int>(input.shape()[0]);
+
+    unsigned int outputHeight    = boost::numeric_cast<unsigned int>(outputExpected.shape()[heightIndex]);
+    unsigned int outputWidth     = boost::numeric_cast<unsigned int>(outputExpected.shape()[widthIndex]);
+    unsigned int outputChannels  = boost::numeric_cast<unsigned int>(outputExpected.shape()[channelsIndex]);
+    unsigned int outputBatchSize = boost::numeric_cast<unsigned int>(outputExpected.shape()[0]);
+
+    armnn::TensorInfo inputTensorInfo  = armnnUtils::GetTensorInfo(
+        inputBatchSize, inputChannels, inputHeight, inputWidth, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(
+        outputBatchSize, outputChannels, outputHeight, outputWidth, dataLayout, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::Pooling2dQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters = descriptor;
+    queueDescriptor.m_Parameters.m_DataLayout = dataLayout;
+
+    armnn::WorkloadInfo workloadInfo;
+    AddInputToWorkload(queueDescriptor, workloadInfo, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(queueDescriptor, workloadInfo, outputTensorInfo, outputHandle.get());
+
+    // Don't execute if Pooling is not supported, as an exception will be raised.
+    armnn::BackendId backend = workloadFactory.GetBackendId();
+    const size_t reasonIfUnsupportedMaxLen = 255;
+    char reasonIfUnsupported[reasonIfUnsupportedMaxLen+1];
+    result.supported = armnn::IsPooling2dSupported(backend, inputTensorInfo, outputTensorInfo,
+                                                   queueDescriptor.m_Parameters,
+                                                   reasonIfUnsupported, reasonIfUnsupportedMaxLen);
+    if (!result.supported)
+    {
+        return result;
+    }
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreatePooling2d(queueDescriptor, workloadInfo);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+
+    result.outputExpected = outputExpected;
+
+    return result;
+}
+
+//
+// Tests max pooling with the following parameters:
+//
+//   Pooling size: 3x3
+//   Stride:       (2,4)
+//   input size:   8x13
+//   channels:     2
+//   batch size:   2
+//
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleMaxPooling2dSize3x3Stride2x4TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 3;
+    descriptor.m_StrideX = 2;
+    descriptor.m_StrideY = 4;
+    // forceNoPadding is mainly used for compatibility with ARM Compute.
+    // As of 16/05/2017, ARM Compute errors if padX or padY are equal to or greater
+    // than the pool size.
+    descriptor.m_PadLeft = descriptor.m_PadRight = forceNoPadding ? 0 : 3;
+    descriptor.m_PadTop = descriptor.m_PadBottom = 0;
+    descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    unsigned int inputWidth = 8;
+    unsigned int inputHeight = 13;
+    unsigned int outputWidth =
+        (inputWidth + descriptor.m_PadLeft + descriptor.m_PadRight + descriptor.m_StrideX - descriptor.m_PoolWidth) /
+        descriptor.m_StrideX;
+    unsigned int outputHeight =
+        (inputHeight + descriptor.m_PadTop + descriptor.m_PadBottom + descriptor.m_StrideY - descriptor.m_PoolHeight) /
+        descriptor.m_StrideY;
+    unsigned int channels = 2;
+    unsigned int batchSize = 2;
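+    // For reference, the output size formulae above use integer (floor) division, so:
+    //   with padding:    outputWidth = (8 + 3 + 3 + 2 - 3) / 2 = 6, outputHeight = (13 + 4 - 3) / 4 = 3
+    //   without padding: outputWidth = (8 + 2 - 3) / 2 = 3,         outputHeight = 3
+    // which matches the 3x6 and 3x3 expected outputs below.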
+
+    armnn::TensorInfo inputTensorInfo({ batchSize, channels, inputHeight, inputWidth }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ batchSize, channels, outputHeight, outputWidth }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<float> singleChannelData({
+        0.0f, 4.0f, 8.0f, 1.0f, 6.0f, 4.0f, 5.0f, 8.0f,
+        1.0f, 1.0f, 6.0f, 0.0f, 3.0f, 7.0f, 4.0f, 7.0f,
+        8.0f, 5.0f, 0.0f, 0.0f, 8.0f, 3.0f, 4.0f, 3.0f,
+        8.0f, 2.0f, 5.0f, 4.0f, 1.0f, 9.0f, 2.0f, 0.0f,
+        5.0f, 4.0f, 5.0f, 0.0f, 0.0f, 0.0f, 7.0f, 2.0f,
+        1.0f, 2.0f, 6.0f, 2.0f, 7.0f, 9.0f, 5.0f, 2.0f,
+        9.0f, 7.0f, 3.0f, 1.0f, 3.0f, 4.0f, 8.0f, 3.0f,
+        1.0f, 0.0f, 0.0f, 5.0f, 5.0f, 4.0f, 2.0f, 0.0f,
+        6.0f, 4.0f, 3.0f, 6.0f, 9.0f, 5.0f, 5.0f, 6.0f,
+        8.0f, 7.0f, 9.0f, 6.0f, 1.0f, 4.0f, 1.0f, 9.0f,
+        7.0f, 1.0f, 9.0f, 2.0f, 9.0f, 9.0f, 8.0f, 1.0f,
+        4.0f, 4.0f, 5.0f, 9.0f, 2.0f, 6.0f, 6.0f, 4.0f,
+        3.0f, 5.0f, 4.0f, 0.0f, 1.0f, 5.0f, 9.0f, 7.0f,
+    });
+
+    // Constructs input data.
+    std::vector<float> inputData;
+    auto negator = [](float f) { return -f; };
+
+    // First image (two channels where the second channel is the negative of the first one).
+    inputData.insert(inputData.end(), singleChannelData.begin(), singleChannelData.end());
+    std::transform(singleChannelData.begin(), singleChannelData.end(), std::back_inserter(inputData), negator);
+
+    // Second image (same as first image).
+    inputData.insert(inputData.end(), singleChannelData.begin(), singleChannelData.end());
+    std::transform(singleChannelData.begin(), singleChannelData.end(), std::back_inserter(inputData), negator);
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData));
+
+    // These were calculated manually.
+    auto shape(GetTensorShapeAsArray<4>(outputTensorInfo));
+    boost::multi_array<T, 4> outputExpected(shape);
+    if (forceNoPadding)
+    {
+        outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+            QuantizedVector<T>(qScale, qOffset, {
+                 8.0f,  8.0f,  8.0f,
+                 9.0f,  7.0f,  9.0f,
+                 9.0f,  9.0f,  9.0f,
+
+                 0.0f,  0.0f, -3.0f,
+                -1.0f,  0.0f,  0.0f,
+                -1.0f, -1.0f, -1.0f,
+
+                 8.0f,  8.0f,  8.0f,
+                 9.0f,  7.0f,  9.0f,
+                 9.0f,  9.0f,  9.0f,
+
+                 0.0f,  0.0f, -3.0f,
+                -1.0f,  0.0f,  0.0f,
+                -1.0f, -1.0f, -1.0f
+        }));
+    }
+    else
+    {
+        outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+            QuantizedVector<T>(qScale, qOffset, {
+                0.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f,
+                0.0f, 9.0f, 7.0f, 9.0f, 9.0f, 3.0f,
+                0.0f, 8.0f, 9.0f, 9.0f, 9.0f, 9.0f,
+
+                0.0f, 0.0f, 0.0f, 0.0f,-3.0f, 0.0f,
+                0.0f,-1.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+                0.0f,-1.0f,-1.0f,-1.0f,-1.0f, 0.0f,
+
+                0.0f, 8.0f, 8.0f, 8.0f, 8.0f, 8.0f,
+                0.0f, 9.0f, 7.0f, 9.0f, 9.0f, 3.0f,
+                0.0f, 8.0f, 9.0f, 9.0f, 9.0f, 9.0f,
+
+                0.0f, 0.0f, 0.0f, 0.0f,-3.0f, 0.0f,
+                0.0f,-1.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+                0.0f,-1.0f,-1.0f,-1.0f,-1.0f, 0.0f
+        }));
+    }
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleMaxPooling2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout = armnn::DataLayout::NCHW,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = 2;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+    descriptor.m_DataLayout = dataLayout;
+
+    armnn::TensorInfo inputTensorInfo  = armnnUtils::GetTensorInfo(1, 2, 4, 4, dataLayout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(1, 2, 2, 2, dataLayout, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<T> inputData(
+        QuantizedVector<T>(qScale, qOffset, {
+             1.0f,  2.0f,  5.0f,  6.0f,
+             3.0f,  4.0f,  7.0f,  8.0f,
+             9.0f, 10.0f, 13.0f, 14.0f,
+            11.0f, 12.0f, 15.0f, 16.0f,
+
+            17.0f, 18.0f, 21.0f, 22.0f,
+            19.0f, 20.0f, 23.0f, 24.0f,
+            25.0f, 26.0f, 29.0f, 30.0f,
+            27.0f, 28.0f, 31.0f, 32.0f,
+        }));
+
+    std::vector<T> outputData(
+        QuantizedVector<T>(qScale, qOffset, {
+             4.0f,  8.0f,
+            12.0f, 16.0f,
+
+            20.0f, 24.0f,
+            28.0f, 32.0f,
+        }));
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(T));
+        inputData = tmp;
+
+        std::vector<T> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(T));
+        outputData = tmp1;
+    }
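+    // The test data above is authored in NCHW order; when the requested layout is
+    // NHWC it is rearranged in place using the { 0, 3, 1, 2 } mapping, which moves
+    // the channel dimension to the innermost position.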
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputData);
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleAveragePooling2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout dataLayout = armnn::DataLayout::NCHW,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Average;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = 2;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+    descriptor.m_DataLayout = dataLayout;
+
+    armnn::TensorInfo inputTensorInfo  = armnnUtils::GetTensorInfo(1, 2, 4, 4, dataLayout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(1, 2, 2, 2, dataLayout, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<T> inputData(
+        QuantizedVector<T>(qScale, qOffset, {
+             2.0f,  2.0f,  6.0f,  6.0f,
+             4.0f,  4.0f,  8.0f,  8.0f,
+            10.0f, 12.0f, 14.0f, 16.0f,
+            10.0f, 12.0f, 16.0f, 14.0f,
+
+            18.0f, 20.0f, 24.0f, 22.0f,
+            20.0f, 18.0f, 22.0f, 24.0f,
+            26.0f, 28.0f,  0.0f,  0.0f,
+            26.0f, 28.0f,  0.0f,  0.0f,
+        }));
+
+    std::vector<T> outputData(
+        QuantizedVector<T>(qScale, qOffset, {
+             3.0f,  7.0f,
+            11.0f, 15.0f,
+
+            19.0f, 23.0f,
+            27.0f,  0.0f,
+        }));
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(T));
+        inputData = tmp;
+
+        std::vector<T> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(T));
+        outputData = tmp1;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputData);
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> LargeTensorsAveragePooling2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Average;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 100;
+    descriptor.m_StrideX = descriptor.m_StrideY = 5;
+    descriptor.m_PadLeft = 50;
+    descriptor.m_PadRight = 50;
+    descriptor.m_PadTop = 50;
+    descriptor.m_PadBottom = 50;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    armnn::TensorInfo inputTensorInfo({ 5, 3, 52, 60 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 5, 3, 11, 13 }, ArmnnType);
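+    // For reference: with a 100x100 pool, stride 5 and 50 elements of padding on each
+    // side, the output size works out as (52 + 100 + 5 - 100) / 5 = 11 rows and
+    // (60 + 100 + 5 - 100) / 5 = 13 columns. Since every input element is 1 and the
+    // Exclude padding method leaves padding out of the average, every output is 1 too.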
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<T> inputVec;
+
+    for (unsigned int i = 0 ; i < inputTensorInfo.GetShape().GetNumElements(); ++i)
+    {
+        inputVec.push_back(1);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, inputVec);
+
+    std::vector<T> outputVec;
+
+    for (unsigned int i = 0 ; i < outputTensorInfo.GetShape().GetNumElements(); ++i)
+    {
+        outputVec.push_back(1);
+    }
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputVec);
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleL2Pooling2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout dataLayout = armnn::DataLayout::NCHW,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = 2;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+    descriptor.m_DataLayout = dataLayout;
+
+    armnn::TensorInfo inputTensorInfo  = armnnUtils::GetTensorInfo(1, 2, 4, 4, dataLayout, ArmnnType);
+    armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo(1, 2, 2, 2, dataLayout, ArmnnType);
+
+    std::vector<T> inputData(
+        QuantizedVector<T>(qScale, qOffset, {
+            1.0f, 7.0f, 5.0f, 5.0f,
+            1.0f, 7.0f, 5.0f, 5.0f,
+            3.0f, 3.0f, 1.0f, 1.0f,
+            3.0f, 3.0f, 1.0f, 1.0f,
+
+            1.0f, 7.0f, 0.0f, 0.0f,
+            1.0f, 7.0f, 2.0f, 0.0f,
+            0.0f, 2.0f, 1.0f, 1.0f,
+            0.0f, 0.0f, 1.0f, 1.0f,
+        }));
+
+    std::vector<T> outputData(
+        QuantizedVector<T>(qScale, qOffset, {
+            5.0f, 5.0f,
+            3.0f, 1.0f,
+
+            5.0f, 1.0f,
+            1.0f, 1.0f,
+        }));
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<T> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(T));
+        inputData = tmp;
+
+        std::vector<T> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(T));
+        outputData = tmp1;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputData);
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2Pooling2dSize3Stride1TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 3;
+    descriptor.m_StrideX = descriptor.m_StrideY = 1;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4 }, ArmnnType);
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            2.0f, 1.0f, 5.0f, 2.0f,
+            1.0f, 2.0f, 2.0f, 1.0f,
+            5.0f, 4.0f, 1.0f, 5.0f,
+            2.0f, 1.0f, 5.0f, 2.0f,
+        }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 2, 2 }, ArmnnType);
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            3.0f, 3.0f,
+            3.0f, 3.0f,
+        }));
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2Pooling2dSize3Stride3TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 3;
+    descriptor.m_StrideX = descriptor.m_StrideY = 3;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 9, 9 }, ArmnnType);
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f,
+            1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f,
+            5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f,
+            2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f,
+            1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f,
+            5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f,
+            2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f,
+            1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f,
+            5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f,
+        }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 3, 3 }, ArmnnType);
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            3.0f, 3.0f, 3.0f,
+            3.0f, 3.0f, 3.0f,
+            3.0f, 3.0f, 3.0f,
+        }));
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2Pooling2dSize3Stride4TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 3;
+    descriptor.m_StrideX = descriptor.m_StrideY = 4;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 7, 7 }, ArmnnType);
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            2.0f, 1.0f, 5.0f, 0.0f, 2.0f, 1.0f, 5.0f,
+            1.0f, 2.0f, 2.0f, 0.0f, 1.0f, 2.0f, 2.0f,
+            5.0f, 4.0f, 1.0f, 0.0f, 5.0f, 4.0f, 1.0f,
+            0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f,
+            2.0f, 1.0f, 5.0f, 0.0f, 2.0f, 1.0f, 5.0f,
+            1.0f, 2.0f, 2.0f, 0.0f, 1.0f, 2.0f, 2.0f,
+            5.0f, 4.0f, 1.0f, 0.0f, 5.0f, 4.0f, 1.0f,
+        }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 2, 2 }, ArmnnType);
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            3.0f, 3.0f,
+            3.0f, 3.0f,
+        }));
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2Pooling2dSize7TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 7;
+    descriptor.m_StrideX = descriptor.m_StrideY = 7;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 7, 7 }, ArmnnType);
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            1.0f, 0.0f, 2.0f, 0.0f,  3.0f, 0.0f, 4.0f,
+            0.0f, 0.0f, 0.0f, 0.0f,  0.0f, 0.0f, 0.0f,
+            0.0f, 5.0f, 0.0f, 6.0f,  0.0f, 7.0f, 0.0f,
+            8.0f, 0.0f, 9.0f, 0.0f, 10.0f, 0.0f, 5.0f,
+            0.0f, 5.0f, 0.0f, 2.0f,  0.0f, 1.0f, 1.0f,
+            0.0f, 0.0f, 0.0f, 0.0f,  0.0f, 0.0f, 0.0f,
+            0.0f, 0.0f, 0.0f, 0.0f,  0.0f, 0.0f, 0.0f,
+        }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 1, 1 }, ArmnnType);
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            3.0f,
+        }));
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> L2Pooling2dSize9TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 9;
+    descriptor.m_StrideX = descriptor.m_StrideY = 9;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 9, 9 }, ArmnnType);
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f,
+            1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f,
+            5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f,
+            2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f,
+            1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f,
+            5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f,
+            2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f, 2.0f, 1.0f, 5.0f,
+            1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f, 1.0f, 2.0f, 2.0f,
+            5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f, 5.0f, 4.0f, 1.0f,
+        }));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 1, 1 }, ArmnnType);
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            3.0f,
+        }));
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> AsymmetricNonSquarePooling2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 1, 3 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 2, 2 }, ArmnnType);
+
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolWidth = 2;
+    descriptor.m_PoolHeight = 3;
+    descriptor.m_StrideX = 2;
+    descriptor.m_StrideY = 1;
+    descriptor.m_PadLeft = 2;
+    descriptor.m_PadRight = 0;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 2;
+    descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
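+    // For reference: the asymmetric padding gives an output of
+    // (3 + 2 + 0 + 2 - 2) / 2 = 2 columns and (1 + 1 + 2 + 1 - 3) / 1 = 2 rows.
+    // Pooling windows that cover only padding produce 0, hence the zeros in the
+    // expected output below.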
+
+    // Construct input data.
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            1.0f, 3.0f, 4.0f,
+        }));
+
+    // These were calculated manually.
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            0.0f, 3.0f, 0.0f, 3.0f,
+        }));
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ComparePooling2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::PoolingAlgorithm poolingType,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    const unsigned int inputWidth = 16;
+    const unsigned int inputHeight = 32;
+    const unsigned int channelCount = 2;
+    const unsigned int batchSize = 5;
+
+    const unsigned int poolSize = 3;
+    const unsigned int strideX = 2;
+    const unsigned int strideY = 4;
+    const unsigned int padX = 0;
+    const unsigned int padY = 0;
+
+    const unsigned int outputWidth = (inputWidth + 2 * padX + strideX - poolSize) / strideX;
+    const unsigned int outputHeight = (inputHeight + 2 * padY + strideY - poolSize) / strideY;
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = { batchSize, channelCount, inputHeight, inputWidth };
+    unsigned int outputShape[] = { batchSize, channelCount, outputHeight, outputWidth };
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    boost::multi_array<T, 4> input = MakeRandomTensor<T, 4>(inputTensorInfo, 81715);
+
+    LayerTestResult<T, 4> comparisonResult(outputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::Pooling2dQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+    data.m_Parameters.m_PoolType = poolingType;
+    data.m_Parameters.m_PoolWidth = poolSize;
+    data.m_Parameters.m_PoolHeight = poolSize;
+    data.m_Parameters.m_StrideX = strideX;
+    data.m_Parameters.m_StrideY = strideY;
+    data.m_Parameters.m_PadLeft = padX;
+    data.m_Parameters.m_PadRight = padX;
+    data.m_Parameters.m_PadTop = padY;
+    data.m_Parameters.m_PadBottom = padY;
+    data.m_Parameters.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandleRef = refWorkloadFactory.CreateTensorHandle(outputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> inputHandleRef = refWorkloadFactory.CreateTensorHandle(inputTensorInfo);
+
+    // Don't execute if Pooling is not supported, as an exception will be raised.
+    armnn::BackendId backend = workloadFactory.GetBackendId();
+    const size_t reasonIfUnsupportedMaxLen = 255;
+    char reasonIfUnsupported[reasonIfUnsupportedMaxLen+1];
+    comparisonResult.supported = armnn::IsPooling2dSupported(backend, inputTensorInfo, outputTensorInfo,
+                                                             data.m_Parameters,
+                                                             reasonIfUnsupported, reasonIfUnsupportedMaxLen);
+    if (!comparisonResult.supported)
+    {
+        return comparisonResult;
+    }
+
+    armnn::Pooling2dQueueDescriptor refData = data;
+    armnn::WorkloadInfo refInfo = info;
+    SetWorkloadInput(refData, refInfo, 0, inputTensorInfo, inputHandleRef.get());
+    SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, outputHandleRef.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreatePooling2d(data, info);
+    std::unique_ptr<armnn::IWorkload> workloadRef = refWorkloadFactory.CreatePooling2d(refData, refInfo);
+
+    outputHandleRef->Allocate();
+    inputHandleRef->Allocate();
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+    CopyDataToITensorHandle(inputHandleRef.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+    workloadRef->Execute();
+
+    CopyDataFromITensorHandle(&comparisonResult.output[0][0][0][0], outputHandle.get());
+    CopyDataFromITensorHandle(&comparisonResult.outputExpected[0][0][0][0], outputHandleRef.get());
+
+    return comparisonResult;
+}
+
+//
+// Tests max pooling with the following parameters:
+//
+//   Pooling size: 2x2
+//   Stride:       (2,2)
+//   input size:   4x4
+//   channels:     1
+//   batch size:   1
+//
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleMaxPooling2dSize2x2Stride2x2TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 2;
+    descriptor.m_StrideX = 2;
+    descriptor.m_StrideY = 2;
+    descriptor.m_PadLeft = descriptor.m_PadRight = forceNoPadding ? 0 : 3;
+    descriptor.m_PadTop = descriptor.m_PadBottom = 0;
+    descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude;
+
+    unsigned int inputWidth = 4;
+    unsigned int inputHeight = 4;
+
+    unsigned int outputWidth =
+        (inputWidth + descriptor.m_PadLeft + descriptor.m_PadRight + descriptor.m_StrideX - descriptor.m_PoolWidth) /
+        descriptor.m_StrideX;
+    unsigned int outputHeight =
+        (inputHeight + descriptor.m_PadTop + descriptor.m_PadBottom + descriptor.m_StrideY - descriptor.m_PoolHeight) /
+        descriptor.m_StrideY;
+    unsigned int channels = 1;
+    unsigned int batchSize = 1;
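+    // For reference: with padding the output is (4 + 3 + 3 + 2 - 2) / 2 = 5 columns by
+    // (4 + 2 - 2) / 2 = 2 rows; without padding it is 2x2, as in the expected data below.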
+
+    std::vector<float> inputData = {
+        510.0f, 222.0f, 780.0f, 654.0f,
+        141.0f, 276.0f,  15.0f, 546.0f,
+        303.0f, 618.0f, 582.0f, 339.0f,
+        438.0f, 564.0f, 573.0f, 402.0f
+    };
+
+    // Note that the left and right edges of the output will be 0.0f, because those
+    // 2x2 pooling windows only ever cover padding zeros.
+    std::vector<float> expectedOutputDataWithPadding = {
+        0.0f, 510.0f, 780.0f, 654.0f, 0.0f,
+        0.0f, 438.0f, 618.0f, 402.0f, 0.0f
+    };
+
+    std::vector<float> expectedOutputDataNoPadding = {
+        510.0f, 780.0f,
+        618.0f, 582.0f
+    };
+
+    armnn::TensorInfo inputTensorInfo({ batchSize, channels, inputHeight, inputWidth }, ArmnnType);
+
+    // Scale and offset should match input - we're just calculating maximum values.
+    armnn::TensorInfo outputTensorInfo({ batchSize, channels, outputHeight, outputWidth }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData));
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        forceNoPadding ? QuantizedVector<T>(qScale, qOffset, expectedOutputDataNoPadding) :
+                         QuantizedVector<T>(qScale, qOffset, expectedOutputDataWithPadding));
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+//
+// Tests average pooling with the following parameters:
+//
+//   Pooling size: 3x2
+//   Stride:       (2,2)
+//   Input size:   3x2
+//   Channels:     1
+//   Batch size:   1
+//
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> IgnorePaddingAveragePooling2dSize3x2Stride2x2TestCommon(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        bool forceNoPadding,
+        float qScale = 1.0f,
+        int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Average;
+    descriptor.m_PoolWidth = 3;
+    descriptor.m_PoolHeight = 2;
+    descriptor.m_StrideX = 2;
+    descriptor.m_StrideY = 2;
+    descriptor.m_PadLeft = (forceNoPadding) ? 0 : 1;
+    descriptor.m_PadRight = descriptor.m_PadLeft;
+    descriptor.m_PadTop = 0;
+    descriptor.m_PadBottom = 0;
+    descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+
+    unsigned int inputWidth = 3;
+    unsigned int inputHeight = 2;
+    unsigned int outputWidth =
+        (inputWidth + descriptor.m_PadLeft + descriptor.m_PadRight + descriptor.m_StrideX - descriptor.m_PoolWidth) /
+        descriptor.m_StrideX;
+    unsigned int outputHeight =
+        (inputHeight + descriptor.m_PadTop + descriptor.m_PadBottom + descriptor.m_StrideY - descriptor.m_PoolHeight) /
+        descriptor.m_StrideY;
+    unsigned int channels = 1;
+    unsigned int batchSize = 1;
+
+    std::vector<float> inputData = {
+        3.0f, 6.0f, 9.0f,
+        12.0f, 15.0f, 18.0f,
+    };
+
+    std::vector<float> expectedOutputDataWithPadding = {
+        6.0f, 8.0f,
+    };
+
+    std::vector<float> expectedOutputDataNoPadding = {
+        10.5f,
+    };
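+    // With IgnoreValue padding, the padded zeros count towards the divisor:
+    //   first window (one padded column):  (0 + 3 + 6 + 0 + 12 + 15) / 6 = 6
+    //   second window (one padded column): (6 + 9 + 0 + 15 + 18 + 0) / 6 = 8
+    // Without padding a single window covers the whole input: 63 / 6 = 10.5.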
+
+    armnn::TensorInfo inputTensorInfo({ batchSize, channels, inputHeight, inputWidth }, ArmnnType);
+
+    // Scale and offset should match input - we're just calculating average values.
+    armnn::TensorInfo outputTensorInfo({ batchSize, channels, outputHeight, outputWidth }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData));
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        forceNoPadding ? QuantizedVector<T>(qScale, qOffset, expectedOutputDataNoPadding) :
+                         QuantizedVector<T>(qScale, qOffset, expectedOutputDataWithPadding));
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> IgnorePaddingSimpleMaxPooling2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = 2;
+    descriptor.m_PadLeft = 1;
+    descriptor.m_PadRight = 1;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 1;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 3, 3 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            -1.0f, -2.0f,  3.0f,  4.0f,
+            -1.0f, -2.0f,  3.0f,  4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+        }));
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            -1.0f,  3.0f,  4.0f,
+             1.0f,  3.0f,  4.0f,
+             1.0f,  2.0f, -4.0f,
+        }));
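+    // Note that the padded elements are not max candidates: the top-left 2x2
+    // window covers three padded positions plus input[0][0] = -1.0f, and the
+    // expected result is -1.0f rather than 0.0f.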
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> IgnorePaddingMaxPooling2dSize3TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Max;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 3;
+    descriptor.m_StrideX = descriptor.m_StrideY = 1;
+    descriptor.m_PadLeft = 1;
+    descriptor.m_PadRight = 1;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 1;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 4, 4 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            -1.0f, -2.0f,  3.0f,  4.0f,
+            -1.0f, -2.0f,  3.0f,  4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+             1.0f,  2.0f, -3.0f, -4.0f,
+        }));
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            -1.0f,  3.0f,  4.0f,  4.0f,
+             2.0f,  3.0f,  4.0f,  4.0f,
+             2.0f,  3.0f,  4.0f,  4.0f,
+             2.0f,  2.0f,  2.0f, -3.0f,
+        }));
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> IgnorePaddingSimpleAveragePooling2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Average;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = 2;
+    descriptor.m_PadLeft = 1;
+    descriptor.m_PadRight = 1;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 1;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 3, 3 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            12.0f, 20.0f, 32.0f, 40.0f,
+            12.0f, 20.0f, 32.0f, 40.0f,
+            12.0f, 20.0f, 32.0f, 40.0f,
+            12.0f, 20.0f, 32.0f, 40.0f,
+        }));
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            3.0f,  13.0f,  10.0f,
+            6.0f,  26.0f,  20.0f,
+            3.0f,  13.0f,  10.0f,
+        }));
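+    // With IgnoreValue padding the divisor stays at the full pool size of 4,
+    // so the corner gives 12 / 4 = 3 and the centre (20 + 32 + 20 + 32) / 4 = 26.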
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> IgnorePaddingSimpleAveragePooling2dNoPaddingTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Average;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 3;
+    descriptor.m_StrideX = descriptor.m_StrideY = 2;
+    descriptor.m_PadLeft = 0;
+    descriptor.m_PadRight = 0;
+    descriptor.m_PadTop = 0;
+    descriptor.m_PadBottom = 0;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+    descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Ceiling;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4}, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 2, 2 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            1.0f, 2.0f, 3.0f, 4.0f,
+            1.0f, 2.0f, 3.0f, 4.0f,
+            1.0f, 2.0f, 3.0f, 4.0f,
+            1.0f, 2.0f, 3.0f, 4.0f,
+        }));
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            2.0f, 3.5f,
+            2.0f, 3.5f
+        }));
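+    // Ceiling rounding yields the 2x2 output: ceil((4 - 3) / 2) + 1 = 2 per axis.
+    // The second window on each axis starts at index 2 and is clipped to the
+    // input, so it averages six values, (3 + 4) * 3 / 6 = 3.5, while the first
+    // averages a full 3x3 window: (1 + 2 + 3) * 3 / 9 = 2.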
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> IgnorePaddingAveragePooling2dSize3TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::Average;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 3;
+    descriptor.m_StrideX = descriptor.m_StrideY = 1;
+    descriptor.m_PadLeft = 1;
+    descriptor.m_PadRight = 1;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 1;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 4, 4 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            9.0f,   27.0f,  18.0f,  36.0f,
+            18.0f,   9.0f,  18.0f,   9.0f,
+            27.0f,  18.0f,   9.0f,  27.0f,
+            9.0f,   27.0f,   9.0f,  18.0f,
+        }));
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+             7.0f,  11.0f,  13.0f, 9.0f,
+            12.0f,  17.0f,  19.0f, 13.0f,
+            12.0f,  16.0f,  16.0f, 10.0f,
+             9.0f,  11.0f,  12.0f, 7.0f,
+        }));
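+    // Example: the top-left 3x3 window covers four in-range values plus five
+    // padded zeros, all divided by 9: (9 + 27 + 18 + 9) / 9 = 7.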
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> IgnorePaddingSimpleL2Pooling2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 2;
+    descriptor.m_StrideX = descriptor.m_StrideY = 2;
+    descriptor.m_PadLeft = 1;
+    descriptor.m_PadRight = 1;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 1;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 3, 3 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            2.0f,  4.0f, 8.0f, 16.0f,
+            4.0f,  2.0f, 2.0f, 4.0f,
+            8.0f,  2.0f, 4.0f, 2.0f,
+            16.0f, 2.0f, 2.0f, 8.0f,
+        }));
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            1.0000f, 4.4721f, 8.0000f,
+            4.4721f, 2.6457f, 2.2360f,
+            8.0000f, 1.4142f, 4.0000f,
+        }));
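+    // L2 pooling takes the square root of the mean of squares over each window;
+    // under IgnoreValue the padded zeros contribute to the divisor, e.g.
+    //   corner: sqrt(2^2 / 4) = 1,  centre: sqrt((2^2 + 2^2 + 2^2 + 4^2) / 4) = sqrt(7) ~= 2.6457.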
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> IgnorePaddingL2Pooling2dSize3TestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 1.0f,
+    int32_t qOffset = 0)
+{
+    armnn::Pooling2dDescriptor descriptor;
+    descriptor.m_PoolType = armnn::PoolingAlgorithm::L2;
+    descriptor.m_PoolWidth = descriptor.m_PoolHeight = 3;
+    descriptor.m_StrideX = descriptor.m_StrideY = 1;
+    descriptor.m_PadLeft = 1;
+    descriptor.m_PadRight = 1;
+    descriptor.m_PadTop = 1;
+    descriptor.m_PadBottom = 1;
+    descriptor.m_PaddingMethod = armnn::PaddingMethod::IgnoreValue;
+
+    armnn::TensorInfo inputTensorInfo({ 1, 1, 4, 4 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 1, 4, 4 }, ArmnnType);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            1.0f, 2.0f, 3.0f, 4.0f,
+            1.0f, 2.0f, 3.0f, 4.0f,
+            1.0f, 2.0f, 3.0f, 4.0f,
+            1.0f, 2.0f, 3.0f, 4.0f,
+        }));
+
+    auto outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+        QuantizedVector<T>(qScale, qOffset, {
+            1.0540f, 1.7638f, 2.5385f, 2.3570f,
+            1.2909f, 2.1602f, 3.1091f, 2.8867f,
+            1.2909f, 2.1602f, 3.1091f, 2.8867f,
+            1.0540f, 1.7638f, 2.5385f, 2.3570f,
+        }));
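+    // Example: the top-left window covers four in-range values plus five padded
+    // zeros: sqrt((1^2 + 2^2 + 1^2 + 2^2) / 9) = sqrt(10 / 9) ~= 1.0540.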
+
+    return SimplePooling2dTestImpl<ArmnnType>(
+        workloadFactory, memoryManager, descriptor, qScale, qOffset, input, outputExpected);
+}
+
+} // anonymous namespace
+
+LayerTestResult<float, 4> SimpleMaxPooling2dSize2x2Stride2x2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding)
+{
+    return SimpleMaxPooling2dSize2x2Stride2x2TestCommon<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, forceNoPadding);
+}
+
+LayerTestResult<uint8_t, 4> SimpleMaxPooling2dSize2x2Stride2x2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding)
+{
+    return SimpleMaxPooling2dSize2x2Stride2x2TestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, forceNoPadding, 3.0f, -5);
+}
+
+LayerTestResult<int16_t, 4> SimpleMaxPooling2dSize2x2Stride2x2Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding)
+{
+    return SimpleMaxPooling2dSize2x2Stride2x2TestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager, forceNoPadding);
+}
+
+LayerTestResult<float, 4> SimpleMaxPooling2dSize3x3Stride2x4Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding)
+{
+    return SimpleMaxPooling2dSize3x3Stride2x4TestCommon<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, forceNoPadding);
+}
+
+LayerTestResult<uint8_t, 4> SimpleMaxPooling2dSize3x3Stride2x4Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding)
+{
+    return SimpleMaxPooling2dSize3x3Stride2x4TestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, forceNoPadding, 0.1f, 128);
+}
+
+LayerTestResult<int16_t, 4> SimpleMaxPooling2dSize3x3Stride2x4Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding)
+{
+    return SimpleMaxPooling2dSize3x3Stride2x4TestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager, forceNoPadding);
+}
+
+LayerTestResult<float, 4> SimpleMaxPooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleMaxPooling2dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, dataLayout);
+}
+
+LayerTestResult<uint8_t, 4> SimpleMaxPooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleMaxPooling2dTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, dataLayout);
+}
+
+LayerTestResult<int16_t, 4> SimpleMaxPooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleMaxPooling2dTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, dataLayout);
+}
+
+LayerTestResult<float, 4> IgnorePaddingSimpleMaxPooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleMaxPooling2dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> IgnorePaddingSimpleMaxPooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleMaxPooling2dTestCommon<armnn::DataType::QuantisedAsymm8>(
+            workloadFactory, memoryManager, 1.0f, -5);
+}
+
+LayerTestResult<int16_t, 4> IgnorePaddingSimpleMaxPooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleMaxPooling2dTestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> IgnorePaddingMaxPooling2dSize3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingMaxPooling2dSize3TestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> IgnorePaddingMaxPooling2dSize3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingMaxPooling2dSize3TestCommon<armnn::DataType::QuantisedAsymm8>(
+            workloadFactory, memoryManager, 1.0f, -5);
+}
+
+LayerTestResult<int16_t, 4> IgnorePaddingMaxPooling2dSize3Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingMaxPooling2dSize3TestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> SimpleAveragePooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleAveragePooling2dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, dataLayout);
+}
+
+LayerTestResult<uint8_t, 4> SimpleAveragePooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleAveragePooling2dTestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, dataLayout, 0.5f, -1);
+}
+
+LayerTestResult<int16_t, 4> SimpleAveragePooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleAveragePooling2dTestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager, dataLayout);
+}
+
+LayerTestResult<float, 4> IgnorePaddingAveragePooling2dSize3x2Stride2x2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding)
+{
+    return IgnorePaddingAveragePooling2dSize3x2Stride2x2TestCommon<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, forceNoPadding);
+}
+
+LayerTestResult<float, 4> LargeTensorsAveragePooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return LargeTensorsAveragePooling2dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> LargeTensorsAveragePooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return LargeTensorsAveragePooling2dTestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, 0.5f, -1);
+}
+
+LayerTestResult<int16_t, 4> LargeTensorsAveragePooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return LargeTensorsAveragePooling2dTestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> IgnorePaddingSimpleAveragePooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleAveragePooling2dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> IgnorePaddingSimpleAveragePooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleAveragePooling2dTestCommon<armnn::DataType::QuantisedAsymm8>(
+            workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> IgnorePaddingSimpleAveragePooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleAveragePooling2dTestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> IgnorePaddingSimpleAveragePooling2dNoPaddingTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleAveragePooling2dNoPaddingTestCommon<armnn::DataType::Float32>(
+            workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> IgnorePaddingSimpleAveragePooling2dNoPaddingUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleAveragePooling2dNoPaddingTestCommon<armnn::DataType::QuantisedAsymm8>(
+            workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> IgnorePaddingSimpleAveragePooling2dNoPaddingInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleAveragePooling2dNoPaddingTestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> IgnorePaddingAveragePooling2dSize3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingAveragePooling2dSize3TestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> IgnorePaddingAveragePooling2dSize3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingAveragePooling2dSize3TestCommon<armnn::DataType::QuantisedAsymm8>(
+            workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> IgnorePaddingAveragePooling2dSize3Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingAveragePooling2dSize3TestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> SimpleL2Pooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleL2Pooling2dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager, dataLayout);
+}
+
+LayerTestResult<uint8_t, 4> SimpleL2Pooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleL2Pooling2dTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, dataLayout);
+}
+
+LayerTestResult<int16_t, 4> SimpleL2Pooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    return SimpleL2Pooling2dTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, dataLayout);
+}
+
+LayerTestResult<float, 4> L2Pooling2dSize3Stride1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize3Stride1TestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> L2Pooling2dSize3Stride1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize3Stride1TestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> L2Pooling2dSize3Stride1Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize3Stride1TestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> L2Pooling2dSize3Stride3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize3Stride3TestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> L2Pooling2dSize3Stride3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize3Stride3TestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> L2Pooling2dSize3Stride3Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize3Stride3TestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> L2Pooling2dSize3Stride4Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize3Stride4TestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> L2Pooling2dSize3Stride4Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize3Stride4TestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> L2Pooling2dSize3Stride4Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize3Stride4TestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> L2Pooling2dSize7Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize7TestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> L2Pooling2dSize7Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize7TestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> L2Pooling2dSize7Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize7TestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> L2Pooling2dSize9Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize9TestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> L2Pooling2dSize9Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize9TestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> L2Pooling2dSize9Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return L2Pooling2dSize9TestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> IgnorePaddingSimpleL2Pooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleL2Pooling2dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> IgnorePaddingSimpleL2Pooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleL2Pooling2dTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> IgnorePaddingSimpleL2Pooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingSimpleL2Pooling2dTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> IgnorePaddingL2Pooling2dSize3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingL2Pooling2dSize3TestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> IgnorePaddingL2Pooling2dSize3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingL2Pooling2dSize3TestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> IgnorePaddingL2Pooling2dSize3Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return IgnorePaddingL2Pooling2dSize3TestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> AsymmetricNonSquarePooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return AsymmetricNonSquarePooling2dTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> AsymmetricNonSquarePooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return AsymmetricNonSquarePooling2dTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> AsymmetricNonSquarePooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return AsymmetricNonSquarePooling2dTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> ComparePooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::PoolingAlgorithm  poolingType)
+{
+    return ComparePooling2dTestCommon<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, refWorkloadFactory, poolingType);
+}
+
+LayerTestResult<uint8_t, 4> ComparePooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::PoolingAlgorithm  poolingType)
+{
+    return ComparePooling2dTestCommon<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, refWorkloadFactory, poolingType, 0.1f, 128);
+}
+
+LayerTestResult<int16_t, 4> ComparePooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::PoolingAlgorithm  poolingType)
+{
+    return ComparePooling2dTestCommon<armnn::DataType::QuantisedSymm16>(
+            workloadFactory, memoryManager, refWorkloadFactory, poolingType);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/Pooling2dTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/Pooling2dTestImpl.hpp
new file mode 100644
index 0000000..6f7a595
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/Pooling2dTestImpl.hpp
@@ -0,0 +1,279 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <armnn/Types.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float,   4> SimpleMaxPooling2dSize2x2Stride2x2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding);
+
+LayerTestResult<uint8_t, 4> SimpleMaxPooling2dSize2x2Stride2x2Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding);
+
+LayerTestResult<int16_t, 4> SimpleMaxPooling2dSize2x2Stride2x2Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding);
+
+LayerTestResult<float,   4> SimpleMaxPooling2dSize3x3Stride2x4Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding);
+
+LayerTestResult<uint8_t, 4> SimpleMaxPooling2dSize3x3Stride2x4Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding);
+
+LayerTestResult<int16_t, 4> SimpleMaxPooling2dSize3x3Stride2x4Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding);
+
+LayerTestResult<float,   4> SimpleMaxPooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<uint8_t, 4> SimpleMaxPooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<int16_t, 4> SimpleMaxPooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<float,   4> IgnorePaddingSimpleMaxPooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> IgnorePaddingSimpleMaxPooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> IgnorePaddingSimpleMaxPooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> IgnorePaddingMaxPooling2dSize3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> IgnorePaddingMaxPooling2dSize3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> IgnorePaddingMaxPooling2dSize3Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> SimpleAveragePooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<uint8_t, 4> SimpleAveragePooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<int16_t, 4> SimpleAveragePooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<float,   4> LargeTensorsAveragePooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> LargeTensorsAveragePooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> LargeTensorsAveragePooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> IgnorePaddingAveragePooling2dSize3x2Stride2x2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool forceNoPadding);
+
+LayerTestResult<float,   4> IgnorePaddingSimpleAveragePooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> IgnorePaddingSimpleAveragePooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> IgnorePaddingSimpleAveragePooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> IgnorePaddingSimpleAveragePooling2dNoPaddingTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> IgnorePaddingSimpleAveragePooling2dNoPaddingUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> IgnorePaddingSimpleAveragePooling2dNoPaddingInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> IgnorePaddingAveragePooling2dSize3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> IgnorePaddingAveragePooling2dSize3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> IgnorePaddingAveragePooling2dSize3Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> SimpleL2Pooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<uint8_t, 4> SimpleL2Pooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<int16_t, 4> SimpleL2Pooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout);
+
+LayerTestResult<float,   4> L2Pooling2dSize3Stride1Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> L2Pooling2dSize3Stride1Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> L2Pooling2dSize3Stride1Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> L2Pooling2dSize3Stride3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> L2Pooling2dSize3Stride3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> L2Pooling2dSize3Stride3Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> L2Pooling2dSize3Stride4Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> L2Pooling2dSize3Stride4Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> L2Pooling2dSize3Stride4Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> L2Pooling2dSize7Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> L2Pooling2dSize7Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> L2Pooling2dSize7Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> L2Pooling2dSize9Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> L2Pooling2dSize9Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> L2Pooling2dSize9Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> IgnorePaddingSimpleL2Pooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> IgnorePaddingSimpleL2Pooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> IgnorePaddingSimpleL2Pooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> IgnorePaddingL2Pooling2dSize3Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> IgnorePaddingL2Pooling2dSize3Uint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> IgnorePaddingL2Pooling2dSize3Int16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float,   4> AsymmetricNonSquarePooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> AsymmetricNonSquarePooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> AsymmetricNonSquarePooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> ComparePooling2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::PoolingAlgorithm  poolingType);
+
+LayerTestResult<uint8_t, 4> ComparePooling2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::PoolingAlgorithm  poolingType);
+
+LayerTestResult<int16_t, 4> ComparePooling2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    armnn::PoolingAlgorithm  poolingType);
diff --git a/src/backends/backendsCommon/test/layerTests/PreluTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/PreluTestImpl.hpp
new file mode 100644
index 0000000..18a5bd0
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/PreluTestImpl.hpp
@@ -0,0 +1,97 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> PreluTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo ({ 1, 2, 2, 3 }, ArmnnType);
+    armnn::TensorInfo alphaTensorInfo ({ 1, 1, 1, 3 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 1, 2, 2, 3 }, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.25f);
+        inputTensorInfo.SetQuantizationOffset(128);
+        alphaTensorInfo.SetQuantizationScale(0.25f);
+        alphaTensorInfo.SetQuantizationOffset(50);
+        outputTensorInfo.SetQuantizationScale(0.5f);
+        outputTensorInfo.SetQuantizationOffset(120);
+    }
+
+    std::vector<float> inputData
+    {
+        // Expected quantized values:
+        // 128, 128, 128, 132, 132, 132, 124, 124, 124, 120, 120, 120
+        0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f, -1.0f, -1.0f, -1.0f, -2.0f, -2.0f, -2.0f
+    };
+    std::vector<float> alphaData
+    {
+        // Expected quantized values:
+        // 50, 54, 58
+        0.0f, 1.0f, 2.0f
+    };
+    std::vector<float> outputExpectedData =
+    {
+        // Expected quantized values:
+        // 120, 120, 120, 122, 122, 122, 120, 118, 116, 120, 116, 112
+        0.0f, 0.0f, 0.0f, 1.0f, 1.0f, 1.0f, 0.0f, -1.0f, -2.0f, 0.0f, -2.0f, -4.0f
+    };
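+    // PReLU computes f(x) = x for x >= 0 and f(x) = alpha * x otherwise, with
+    // alpha broadcast per channel; e.g. x = -2.0f with alpha = 2.0f gives -4.0f.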
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+    auto alpha = MakeTensor<T, 4>(alphaTensorInfo, QuantizedVector<T>(alphaTensorInfo.GetQuantizationScale(),
+                                                                      alphaTensorInfo.GetQuantizationOffset(),
+                                                                      alphaData));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+                                             QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                outputTensorInfo.GetQuantizationOffset(),
+                                                                outputExpectedData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle  = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> alphaHandle  = workloadFactory.CreateTensorHandle(alphaTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::PreluQueueDescriptor descriptor;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload (descriptor, info, inputTensorInfo,  inputHandle.get());
+    AddInputToWorkload (descriptor, info, alphaTensorInfo,  alphaHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreatePrelu(descriptor, info);
+
+    inputHandle->Allocate();
+    alphaHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+    CopyDataToITensorHandle(alphaHandle.get(), &alpha[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+
+    return result;
+}
diff --git a/src/backends/backendsCommon/test/layerTests/QuantizeTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/QuantizeTestImpl.cpp
new file mode 100644
index 0000000..7d5d73b
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/QuantizeTestImpl.cpp
@@ -0,0 +1,147 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "QuantizeTestImpl.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+template<typename T, std::size_t Dim>
+LayerTestResult<T, Dim> QuantizeTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorInfo& inputTensorInfo,
+    const armnn::TensorInfo& outputTensorInfo,
+    const std::vector<float>& inputData,
+    const std::vector<T>& expectedOutputData,
+    armnn::QuantizeQueueDescriptor descriptor)
+{
+    boost::multi_array<float, Dim> input = MakeTensor<float, Dim>(inputTensorInfo, inputData);
+
+    LayerTestResult<T, Dim> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<T, Dim>(outputTensorInfo, expectedOutputData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateQuantize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.data());
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(ret.output.data(), outputHandle.get());
+
+    return ret;
+}
+
+template <armnn::DataType ArmnnOutputType, typename T = armnn::ResolveType<ArmnnOutputType>>
+LayerTestResult<T, 4> QuantizeSimpleTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::QuantizeQueueDescriptor desc;
+
+    const armnn::TensorInfo inputTensorInfo({1, 2, 2, 3}, armnn::DataType::Float32);
+    const armnn::TensorInfo outputTensorInfo({1, 2, 2, 3}, ArmnnOutputType, 0.5f, 1);
+
+    std::vector<float> inputData = std::vector<float>(
+    {
+        1.0f,   2.0f,  3.0f,
+        4.0f,   5.0f,  6.0f,
+        7.0f,   8.0f,  9.0f,
+        10.0f, 11.0f, 12.0f,
+    });
+
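+    // With scale 0.5 and offset 1, quantisation maps x to round(x / 0.5) + 1,
+    // i.e. 1.0f -> 3, 2.0f -> 5, ..., 12.0f -> 25.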
+    std::vector<T> expectedOutputData = std::vector<T>(
+    {
+         3,  5,  7,
+         9, 11, 13,
+        15, 17, 19,
+        21, 23, 25,
+    });
+
+    return QuantizeTestImpl<T, 4>(workloadFactory,
+                                  memoryManager,
+                                  inputTensorInfo,
+                                  outputTensorInfo,
+                                  inputData,
+                                  expectedOutputData,
+                                  desc);
+}
+
+template <armnn::DataType ArmnnOutputType, typename T = armnn::ResolveType<ArmnnOutputType>>
+LayerTestResult<T, 4> QuantizeClampTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::QuantizeQueueDescriptor desc;
+
+    const armnn::TensorInfo inputTensorInfo({1, 1, 2, 1}, armnn::DataType::Float32);
+    const armnn::TensorInfo outputTensorInfo({1, 1, 2, 1}, ArmnnOutputType, 0.0001f, 0);
+
+    const T max = std::numeric_limits<T>::max();
+    const T min = std::numeric_limits<T>::lowest();
+
+    std::vector<float> inputData = std::vector<float>(
+    {
+        -100.0f, 100.0f
+    });
+
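+    // With scale 0.0001, -100.0f and 100.0f map to -1,000,000 and 1,000,000
+    // respectively, far outside the target type's range, so quantisation clamps
+    // the results to its lowest and highest representable values.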
+    std::vector<T> expectedOutputData = std::vector<T>(
+    {
+        min, max
+    });
+
+    return QuantizeTestImpl<T, 4>(workloadFactory,
+                                  memoryManager,
+                                  inputTensorInfo,
+                                  outputTensorInfo,
+                                  inputData,
+                                  expectedOutputData,
+                                  desc);
+}
+
+} // anonymous namespace
+
+LayerTestResult<uint8_t, 4> QuantizeSimpleUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return QuantizeSimpleTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> QuantizeClampUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return QuantizeClampTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> QuantizeClampInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return QuantizeClampTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/QuantizeTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/QuantizeTestImpl.hpp
new file mode 100644
index 0000000..bac438e
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/QuantizeTestImpl.hpp
@@ -0,0 +1,23 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<uint8_t, 4> QuantizeSimpleUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> QuantizeClampUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> QuantizeClampInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/ReshapeTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/ReshapeTestImpl.cpp
new file mode 100644
index 0000000..bce24f0
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ReshapeTestImpl.cpp
@@ -0,0 +1,201 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ReshapeTestImpl.hpp"
+
+#include <backendsCommon/test/DataTypeUtils.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+template<typename T, size_t NumDims>
+LayerTestResult<T, NumDims> SimpleReshapeTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::TensorInfo inputTensorInfo,
+    armnn::TensorInfo outputTensorInfo,
+    const std::vector<T>& inputData,
+    const std::vector<T>& outputExpectedData)
+{
+    auto input = MakeTensor<T, NumDims>(inputTensorInfo, inputData);
+
+    LayerTestResult<T, NumDims> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<T, NumDims>(outputTensorInfo, outputExpectedData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ReshapeQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateReshape(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.origin());
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(ret.output.origin(), outputHandle.get());
+
+    return ret;
+}
+
+} // anonymous namespace
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 4> SimpleReshapeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = { 2, 2, 3, 3 };
+    unsigned int outputShape[] = { 2, 2, 9, 1 };
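+    // Reshape leaves the underlying data untouched: all 36 values keep their
+    // row-major order and only the tensor shape metadata changes.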
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    inputTensorInfo.SetQuantizationScale(1.0f);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+    outputTensorInfo.SetQuantizationScale(1.0f);
+
+    auto input = ConvertToDataType<ArmnnType>(
+        {
+            0.0f, 1.0f, 2.0f,
+            3.0f, 4.0f, 5.0f,
+            6.0f, 7.0f, 8.0f,
+
+            9.0f, 10.0f, 11.0f,
+            12.0f, 13.0f, 14.0f,
+            15.0f, 16.0f, 17.0f,
+
+            18.0f, 19.0f, 20.0f,
+            21.0f, 22.0f, 23.0f,
+            24.0f, 25.0f, 26.0f,
+
+            27.0f, 28.0f, 29.0f,
+            30.0f, 31.0f, 32.0f,
+            33.0f, 34.0f, 35.0f,
+        },
+        inputTensorInfo);
+
+    auto outputExpected = ConvertToDataType<ArmnnType>(
+        {
+            0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f,
+
+            9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f,
+
+            18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f,
+
+            27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f, 33.0f, 34.0f, 35.0f,
+        },
+        outputTensorInfo);
+
+    return SimpleReshapeTestImpl<T, 4>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected);
+}
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 5> Reshape5dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = { 2, 2, 8, 1, 1 };
+    unsigned int outputShape[] = { 2, 2, 2, 2, 2 };
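+    // As in the 4D case, the 32 values keep their row-major order; only the shape
+    // metadata changes from 2x2x8x1x1 to 2x2x2x2x2.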
+
+    inputTensorInfo = armnn::TensorInfo(5, inputShape, ArmnnType);
+    inputTensorInfo.SetQuantizationScale(1.0f);
+    outputTensorInfo = armnn::TensorInfo(5, outputShape, ArmnnType);
+    outputTensorInfo.SetQuantizationScale(1.0f);
+
+    auto input = ConvertToDataType<ArmnnType>(
+        {
+            0.0f, 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f,
+            8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
+
+            16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f,
+            24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f,
+        },
+        inputTensorInfo);
+
+    auto outputExpected = ConvertToDataType<ArmnnType>(
+        {
+            0.0f, 1.0f,
+            2.0f, 3.0f,
+
+            4.0f, 5.0f,
+            6.0f, 7.0f,
+
+
+            8.0f, 9.0f,
+            10.0f, 11.0f,
+
+            12.0f, 13.0f,
+            14.0f, 15.0f,
+
+
+
+            16.0f, 17.0f,
+            18.0f, 19.0f,
+
+            20.0f, 21.0f,
+            22.0f, 23.0f,
+
+
+            24.0f, 25.0f,
+            26.0f, 27.0f,
+
+            28.0f, 29.0f,
+            30.0f, 31.0f,
+        },
+        outputTensorInfo);
+
+    return SimpleReshapeTestImpl<T, 5>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected);
+}
+
+//
+// Explicit template instantiations
+//
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+SimpleReshapeTest<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 4>
+SimpleReshapeTest<armnn::DataType::QuantisedAsymm8>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 4>
+SimpleReshapeTest<armnn::DataType::QuantisedSymm16>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 5>
+Reshape5dTest<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 5>
+Reshape5dTest<armnn::DataType::QuantisedAsymm8>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 5>
+Reshape5dTest<armnn::DataType::QuantisedSymm16>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/ReshapeTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ReshapeTestImpl.hpp
new file mode 100644
index 0000000..fb0bb33
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ReshapeTestImpl.hpp
@@ -0,0 +1,23 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleReshapeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> Reshape5dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/ResizeTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/ResizeTestImpl.hpp
new file mode 100644
index 0000000..bb2392f
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/ResizeTestImpl.hpp
@@ -0,0 +1,1012 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <Permute.hpp>
+#include <ResolveType.hpp>
+#include <TensorUtils.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+//
+// ResizeBilinear
+//
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ResizeBilinearNopTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    armnn::TensorInfo inputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 4, 4, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 4, 4, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 4, 4, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 4, 4, dataLayout, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.5f);
+        inputTensorInfo.SetQuantizationOffset(-3);
+        outputTensorInfo.SetQuantizationScale(1.5f);
+        outputTensorInfo.SetQuantizationOffset(-3);
+    }
+
+    std::vector<float> inputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                1, 2, 3, 4,
+                2, 3, 4, 5,
+                3, 4, 5, 6,
+                4, 5, 6, 7
+            }
+        : std::initializer_list<float>
+            {
+                1.0f, 2.0f, 3.0f, 4.0f,
+                2.0f, 3.0f, 4.0f, 5.0f,
+                3.0f, 4.0f, 5.0f, 6.0f,
+                4.0f, 5.0f, 6.0f, 7.0f,
+
+                1.0f, 2.0f, 3.0f, 4.0f,
+                2.0f, 3.0f, 4.0f, 5.0f,
+                3.0f, 4.0f, 5.0f, 6.0f,
+                4.0f, 5.0f, 6.0f, 7.0f
+            };
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+
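+    // Input and output shapes are identical, so the resize is a no-op and the
+    // expected output is the input itself.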
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = input;
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ResizeQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_Method     = armnn::ResizeMethod::Bilinear;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateResize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleResizeBilinearTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout dataLayout)
+{
+    armnn::TensorInfo inputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 2, 2, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 2, 2, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 1, 1, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 1, 1, dataLayout, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.1567f);
+        inputTensorInfo.SetQuantizationOffset(1);
+        outputTensorInfo.SetQuantizationScale(0.1567f);
+        outputTensorInfo.SetQuantizationOffset(1);
+    }
+
+    std::vector<float> inputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                  1, 255,
+                200, 250
+            }
+        : std::initializer_list<float>
+            {
+                  1.0f, 255.0f,
+                200.0f, 250.0f,
+
+                250.0f, 200.0f,
+                250.0f,   1.0f
+            };
+
+    // The 'resize bilinear' operation projects the top-left corner of output texels into the input image,
+    // then figures out the interpolants and weights. Note this is different to projecting the centre of the
+    // output texel. Thus, for an input matrix of 2x2, we'll expect the output 1x1 matrix to contain, as
+    // its single element, the value that was at position (0,0) of the input matrix (rather than an average,
+    // which we would expect if projecting the centre).
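+    // Here the single output texel projects to source coordinate (0, 0), so the
+    // output samples input texel (0, 0) directly.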
+
+    std::vector<float> outputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                1
+            }
+        : std::initializer_list<float>
+            {
+                  1.0f,
+
+                250.0f
+            };
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+
+        std::vector<float> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(float));
+        outputData = tmp1;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+                                             QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                outputTensorInfo.GetQuantizationOffset(),
+                                                                outputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ResizeQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_Method     = armnn::ResizeMethod::Bilinear;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateResize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ResizeBilinearSqMinTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout dataLayout)
+{
+    armnn::TensorInfo inputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 4, 4, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 4, 4, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 2, 2, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 2, 2, dataLayout, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(3.141592f);
+        inputTensorInfo.SetQuantizationOffset(3);
+        outputTensorInfo.SetQuantizationScale(3.141592f);
+        outputTensorInfo.SetQuantizationOffset(3);
+    }
+
+    std::vector<float> inputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                1, 2, 3, 4,
+                2, 3, 4, 5,
+                3, 4, 5, 6,
+                4, 5, 6, 7
+            }
+        : std::initializer_list<float>
+            {
+                1.0f, 2.0f, 3.0f, 4.0f,
+                2.0f, 3.0f, 4.0f, 5.0f,
+                3.0f, 4.0f, 5.0f, 6.0f,
+                4.0f, 5.0f, 6.0f, 7.0f,
+
+                7.0f, 6.0f, 5.0f, 4.0f,
+                6.0f, 5.0f, 4.0f, 3.0f,
+                5.0f, 4.0f, 3.0f, 2.0f,
+                4.0f, 3.0f, 2.0f, 1.0f
+            };
+
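+    // Halving a 4x4 input projects output texel (y, x) exactly onto input texel
+    // (2y, 2x), so the expected output is a direct subsample of the input.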
+    std::vector<float> outputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                1, 3,
+                3, 5
+            }
+        : std::initializer_list<float>
+            {
+                1.0f, 3.0f,
+                3.0f, 5.0f,
+
+                7.0f, 5.0f,
+                5.0f, 3.0f
+            };
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+
+        std::vector<float> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(float));
+        outputData = tmp1;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+                                             QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                outputTensorInfo.GetQuantizationOffset(),
+                                                                outputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ResizeQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_Method     = armnn::ResizeMethod::Bilinear;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateResize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ResizeBilinearMinTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout dataLayout)
+{
+    armnn::TensorInfo inputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 2, 3, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 3, 5, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 1, 2, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 2, 3, dataLayout, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.5f);
+        inputTensorInfo.SetQuantizationOffset(-1);
+        outputTensorInfo.SetQuantizationScale(1.5f);
+        outputTensorInfo.SetQuantizationOffset(-1);
+    }
+
+    std::vector<float> inputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                3.0f,  4.5f,  6.0f, // 1,  2,  3, : Expected quantised values
+                9.0f, 13.5f, 21.0f  // 5,  8, 13
+            }
+        : std::initializer_list<float>
+            {
+                  1.0f,   2.0f,   3.0f,   5.0f,   8.0f,
+                 13.0f,  21.0f,  34.0f,  55.0f,  89.0f,
+                144.0f, 233.0f, 377.0f, 610.0f, 987.0f,
+
+                987.0f, 610.0f, 377.0f, 233.0f, 144.0f,
+                 89.0f,  55.0f,  34.0f,  21.0f,  13.0f,
+                  8.0f,   5.0f,   3.0f,   2.0f,   1.0f
+            };
+
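+    // In the Float32 branch the scale factors are 3/2 vertically and 5/3 horizontally:
+    // e.g. output (0,1) projects to srcX = 5/3 = 1.667 and interpolates between 2.0f
+    // and 3.0f to give 2.6666f, while output (1,0) projects to srcY = 1.5 and
+    // interpolates between 13.0f and 144.0f to give 78.5f.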
+    std::vector<float> outputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                3.0f, 5.25f // 1, 3
+            }
+        : std::initializer_list<float>
+            {
+                 1.0f,   2.6666f,   6.00f,
+                78.5f, 179.3333f, 401.00f,
+
+                987.0f, 454.6670f, 203.33f,
+                 48.5f,  22.3333f,  10.00f
+            };
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+
+        std::vector<float> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(float));
+        outputData = tmp1;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+                                             QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                outputTensorInfo.GetQuantizationOffset(),
+                                                                outputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ResizeQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_Method     = armnn::ResizeMethod::Bilinear;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateResize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ResizeBilinearMagTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout dataLayout)
+{
+    armnn::TensorInfo inputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 3, 2, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 3, 2, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 3, 5, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 3, 5, dataLayout, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.010765f);
+        inputTensorInfo.SetQuantizationOffset(7);
+        outputTensorInfo.SetQuantizationScale(0.010132f);
+        outputTensorInfo.SetQuantizationOffset(-18);
+    }
+
+    std::vector<float> inputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                0.183005f, 2.379065f, // 24, 228, : Expected quantised values
+                1.054970f, 1.302565f, // 105, 128,
+                2.400595f, 0.688960f  // 230, 71
+            }
+        : std::initializer_list<float>
+            {
+                  1.0f,   2.0f,
+                 13.0f,  21.0f,
+                144.0f, 233.0f,
+
+                233.0f, 144.0f,
+                 21.0f,  13.0f,
+                  2.0f,   1.0f
+            };
+
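+    // In the Float32 branch the horizontal scale factor is 2/5, so output (0,1)
+    // projects to srcX = 0.4 and interpolates between 1.0f and 2.0f to give 1.4f.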
+    std::vector<float> outputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                0.18300501f, 1.06142902f, 1.93985295f, 2.37906504f, 2.37906504f,
+                1.05497003f, 1.15400803f, 1.25304604f, 1.30256498f, 1.30256498f,
+                2.40059495f, 1.71594095f, 1.03128707f, 0.68896002f, 0.68896002f
+                // 0, 87, 173, 217, 217, : Expected quantised values
+                // 86, 96, 106, 111, 111,
+                // 219, 151, 84, 50, 50
+            }
+        : std::initializer_list<float>
+            {
+                  1.0f,   1.4f,   1.8f,   2.0f,   2.0f,
+                 13.0f,  16.2f,  19.4f,  21.0f,  21.0f,
+                144.0f, 179.6f, 215.2f, 233.0f, 233.0f,
+
+                233.0f, 197.4f, 161.8f, 144.0f, 144.0f,
+                 21.0f,  17.8f,  14.6f,  13.0f,  13.0f,
+                  2.0f,   1.6f,   1.2f,   1.0f,   1.0f
+            };
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+
+        std::vector<float> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(float));
+        outputData = tmp1;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+                                             QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                outputTensorInfo.GetQuantizationOffset(),
+                                                                outputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle  = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ResizeQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_Method     = armnn::ResizeMethod::Bilinear;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateResize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
+
+//
+// ResizeNearestNeighbor
+//
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ResizeNearestNeighborNopTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout dataLayout)
+{
+    armnn::TensorInfo inputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 4, 4, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 4, 4, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 4, 4, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 4, 4, dataLayout, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.5f);
+        inputTensorInfo.SetQuantizationOffset(-3);
+        outputTensorInfo.SetQuantizationScale(1.5f);
+        outputTensorInfo.SetQuantizationOffset(-3);
+    }
+
+    std::vector<float> inputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                1, 2, 3, 4,
+                2, 3, 4, 5,
+                3, 4, 5, 6,
+                4, 5, 6, 7
+            }
+        : std::initializer_list<float>
+            {
+                1.0f, 2.0f, 3.0f, 4.0f,
+                2.0f, 3.0f, 4.0f, 5.0f,
+                3.0f, 4.0f, 5.0f, 6.0f,
+                4.0f, 5.0f, 6.0f, 7.0f,
+
+                1.0f, 2.0f, 3.0f, 4.0f,
+                2.0f, 3.0f, 4.0f, 5.0f,
+                3.0f, 4.0f, 5.0f, 6.0f,
+                4.0f, 5.0f, 6.0f, 7.0f
+            };
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+
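+    // Input and output shapes are identical, so the resize is a no-op and the
+    // expected output equals the input.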
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = input;
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ResizeQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_Method = armnn::ResizeMethod::NearestNeighbor;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateResize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleResizeNearestNeighborTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout dataLayout)
+{
+    armnn::TensorInfo inputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 2, 2, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 2, 2, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 1, 1, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 1, 1, dataLayout, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(0.1567f);
+        inputTensorInfo.SetQuantizationOffset(1);
+        outputTensorInfo.SetQuantizationScale(0.1567f);
+        outputTensorInfo.SetQuantizationOffset(1);
+    }
+
+    std::vector<float> inputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                  1, 255,
+                200, 250
+            }
+        : std::initializer_list<float>
+            {
+                  1.0f, 255.0f,
+                200.0f, 250.0f,
+
+                250.0f, 200.0f,
+                250.0f,   1.0f
+            };
+
+    // The 'resize nearest neighbour' operation projects the top-left corner of output texels into the
+    // input image and selects the input texel at the rounded-down projected coordinate; no interpolation
+    // weights are involved. Thus, for an input matrix of 2x2, we'll expect the output 1x1 matrix to
+    // contain, as its single element, the value that was at position (0,0) of the input matrix.
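+    // Here the single output texel projects to source coordinate (0, 0) exactly, i.e. input texel (0, 0).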
+
+    std::vector<float> outputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                1
+            }
+        : std::initializer_list<float>
+            {
+                  1.0f,
+
+                250.0f
+            };
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+
+        std::vector<float> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(float));
+        outputData = tmp1;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+                                             QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                outputTensorInfo.GetQuantizationOffset(),
+                                                                outputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ResizeQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+    descriptor.m_Parameters.m_Method     = armnn::ResizeMethod::NearestNeighbor;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateResize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ResizeNearestNeighborSqMinTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout dataLayout)
+{
+    armnn::TensorInfo inputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 4, 4, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 4, 4, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 2, 2, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 2, 2, dataLayout, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(3.141592f);
+        inputTensorInfo.SetQuantizationOffset(3);
+        outputTensorInfo.SetQuantizationScale(3.141592f);
+        outputTensorInfo.SetQuantizationOffset(3);
+    }
+
+    std::vector<float> inputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                1, 2, 3, 4,
+                2, 3, 4, 5,
+                3, 4, 5, 6,
+                4, 5, 6, 7
+            }
+        : std::initializer_list<float>
+            {
+                1.0f, 2.0f, 3.0f, 4.0f,
+                2.0f, 3.0f, 4.0f, 5.0f,
+                3.0f, 4.0f, 5.0f, 6.0f,
+                4.0f, 5.0f, 6.0f, 7.0f,
+
+                7.0f, 6.0f, 5.0f, 4.0f,
+                6.0f, 5.0f, 4.0f, 3.0f,
+                5.0f, 4.0f, 3.0f, 2.0f,
+                4.0f, 3.0f, 2.0f, 1.0f
+            };
+
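+    // Halving maps output texel (y, x) exactly onto input texel (2y, 2x), so the
+    // expected output is a direct subsample of the input.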
+    std::vector<float> outputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                1, 3,
+                3, 5
+            }
+        : std::initializer_list<float>
+            {
+                1.0f, 3.0f,
+                3.0f, 5.0f,
+
+                7.0f, 5.0f,
+                5.0f, 3.0f
+            };
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+
+        std::vector<float> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(float));
+        outputData = tmp1;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+                                             QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                outputTensorInfo.GetQuantizationOffset(),
+                                                                outputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ResizeQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+    descriptor.m_Parameters.m_Method     = armnn::ResizeMethod::NearestNeighbor;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateResize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ResizeNearestNeighborMinTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout dataLayout)
+{
+    armnn::TensorInfo inputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 2, 3, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 3, 5, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 1, 2, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 2, 3, dataLayout, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(1.5f);
+        inputTensorInfo.SetQuantizationOffset(-1);
+        outputTensorInfo.SetQuantizationScale(1.5f);
+        outputTensorInfo.SetQuantizationOffset(-1);
+    }
+
+    std::vector<float> inputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                3.0f,  4.5f,  6.0f, // 1,  2,  3, : Expected quantised values
+                9.0f, 13.5f, 21.0f  // 5,  8, 13
+            }
+        : std::initializer_list<float>
+            {
+                  1.0f,   2.0f,   3.0f,   5.0f,   8.0f,
+                 13.0f,  21.0f,  34.0f,  55.0f,  89.0f,
+                144.0f, 233.0f, 377.0f, 610.0f, 987.0f,
+
+                987.0f, 610.0f, 377.0f, 233.0f, 144.0f,
+                 89.0f,  55.0f,  34.0f,  21.0f,  13.0f,
+                  8.0f,   5.0f,   3.0f,   2.0f,   1.0f
+            };
+
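+    // Consistent with the expected values below, nearest-neighbour rounds the
+    // projected source coordinate down: in the Float32 branch, output (0,1) projects
+    // to srcX = 5/3 = 1.667 and takes input (0,1) = 2.0f, while output (1,0)
+    // projects to srcY = 1.5 and takes input (1,0) = 13.0f.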
+    std::vector<float> outputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                3.0f, 4.5f // 1, 3
+            }
+        : std::initializer_list<float>
+            {
+                  1.f,   2.f,   5.f,
+                 13.f,  21.f,  55.f,
+
+                987.f, 610.f, 233.f,
+                 89.f,  55.f,  21.f
+            };
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+
+        std::vector<float> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(float));
+        outputData = tmp1;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+                                             QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                outputTensorInfo.GetQuantizationOffset(),
+                                                                outputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ResizeQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+    descriptor.m_Parameters.m_Method = armnn::ResizeMethod::NearestNeighbor;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateResize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> ResizeNearestNeighborMagTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::DataLayout dataLayout,
+        float inQuantScale,
+        int32_t inQuantOffset,
+        float outQuantScale,
+        int32_t outQuantOffset)
+{
+    armnn::TensorInfo inputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 3, 2, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 3, 2, dataLayout, ArmnnType);
+
+    armnn::TensorInfo outputTensorInfo = armnn::IsQuantizedType<T>()
+        ?  armnnUtils::GetTensorInfo(1, 1, 3, 5, dataLayout, ArmnnType)
+        :  armnnUtils::GetTensorInfo(1, 2, 3, 5, dataLayout, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(inQuantScale);
+        inputTensorInfo.SetQuantizationOffset(inQuantOffset);
+        outputTensorInfo.SetQuantizationScale(outQuantScale);
+        outputTensorInfo.SetQuantizationOffset(outQuantOffset);
+    }
+
+    std::vector<float> inputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                0.183005f, 2.379065f, //  24, 228, : Expected quantised values
+                1.054970f, 1.302565f, // 105, 128,
+                2.400595f, 0.688960f  // 230, 71
+            }
+        : std::initializer_list<float>
+            {
+                  1.0f,   2.0f,
+                 13.0f,  21.0f,
+                144.0f, 233.0f,
+
+                233.0f, 144.0f,
+                 21.0f,  13.0f,
+                  2.0f,   1.0f
+            };
+
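+    // With a horizontal scale factor of 2/5, output columns 0-2 project to srcX < 1
+    // and take the left input texel, while columns 3-4 take the right one.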
+    std::vector<float> outputData = armnn::IsQuantizedType<T>()
+        ? std::initializer_list<float>
+            {
+                0.183005f, 0.183005f, 0.183005f, 2.379065f, 2.379065f,
+                1.054970f, 1.054970f, 1.054970f, 1.302565f, 1.302565f,
+                2.400595f, 2.400595f, 2.400595f, 0.688960f, 0.688960f
+            }
+        : std::initializer_list<float>
+            {
+                  1.f,   1.f,   1.f,   2.f,   2.f,
+                 13.f,  13.f,  13.f,  21.f,  21.f,
+                144.f, 144.f, 144.f, 233.f, 233.f,
+
+                233.f, 233.f, 233.f, 144.f, 144.f,
+                 21.f,  21.f,  21.f,  13.f,  13.f,
+                  2.f,   2.f,   2.f,   1.f,   1.f
+            };
+
+    const armnn::PermutationVector NCHWToNHWC = { 0, 3, 1, 2 };
+    if (dataLayout == armnn::DataLayout::NHWC)
+    {
+        std::vector<float> tmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), tmp.data(), sizeof(float));
+        inputData = tmp;
+
+        std::vector<float> tmp1(outputData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputData.data(), tmp1.data(), sizeof(float));
+        outputData = tmp1;
+    }
+
+    auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(),
+                                                                      inputTensorInfo.GetQuantizationOffset(),
+                                                                      inputData));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 4>(outputTensorInfo,
+                                             QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(),
+                                                                outputTensorInfo.GetQuantizationOffset(),
+                                                                outputData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle  = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ResizeQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_DataLayout = dataLayout;
+    descriptor.m_Parameters.m_Method = armnn::ResizeMethod::NearestNeighbor;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateResize(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get());
+    return result;
+}
diff --git a/src/backends/backendsCommon/test/layerTests/RsqrtTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/RsqrtTestImpl.cpp
new file mode 100644
index 0000000..c835ff2
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/RsqrtTestImpl.cpp
@@ -0,0 +1,256 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ReshapeTestImpl.hpp"
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/DataTypeUtils.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <cmath>
+
+namespace
+{
+
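+// Shared helper: runs an Rsqrt (reciprocal square root, 1 / sqrt(x)) workload
+// over a rank-2 tensor and returns the actual and expected outputs for comparison.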
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> Rsqrt2dTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TensorInfo& inputTensorInfo,
+    const armnn::TensorInfo& outputTensorInfo,
+    const std::vector<float>& inputValues,
+    const std::vector<float>& expectedOutputValues)
+{
+    auto inputTensor = MakeTensor<T, 2>(inputTensorInfo, ConvertToDataType<ArmnnType>(inputValues, inputTensorInfo));
+
+    LayerTestResult<T, 2> result(outputTensorInfo);
+
+    result.outputExpected = MakeTensor<T, 2>(outputTensorInfo,
+                                             ConvertToDataType<ArmnnType>(expectedOutputValues, outputTensorInfo));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::RsqrtQueueDescriptor descriptor;
+
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateRsqrt(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0], outputHandle.get());
+
+    return result;
+}
+
+} // anonymous namespace
+
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 2> Rsqrt2dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorShape inputShape{ 2, 2 };
+    const armnn::TensorShape outputShape{ 2, 2 };
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+    inputTensorInfo.SetQuantizationScale(0.1f);
+    inputTensorInfo.SetQuantizationOffset(0);
+
+    armnn::TensorInfo outputTensorInfo(outputShape, ArmnnType);
+    outputTensorInfo.SetQuantizationScale(0.1f);
+    outputTensorInfo.SetQuantizationOffset(0);
+
+    std::vector<float> inputValues
+    {
+        1.f, 4.f,
+        16.f, 25.f
+    };
+
+    std::vector<float> expectedOutputValues
+    {
+        1.f, 0.5f,
+        0.25f, 0.2f
+    };
+
+    return Rsqrt2dTestCommon<ArmnnType>(workloadFactory, memoryManager,
+                                        inputTensorInfo, outputTensorInfo,
+                                        inputValues, expectedOutputValues);
+}
+
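+// The rank-3 test cannot reuse Rsqrt2dTestCommon (which is fixed to rank 2),
+// so it sets up the workload inline.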
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 3> Rsqrt3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorShape inputShape{ 3, 1, 2 };
+    const armnn::TensorShape outputShape{ 3, 1, 2 };
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+    inputTensorInfo.SetQuantizationScale(0.1f);
+    inputTensorInfo.SetQuantizationOffset(0);
+
+    armnn::TensorInfo outputTensorInfo(outputShape, ArmnnType);
+    outputTensorInfo.SetQuantizationScale(0.1f);
+    outputTensorInfo.SetQuantizationOffset(0);
+
+    std::vector<float> inputValues
+    {
+        1.f, 4.f, 16.f,
+        25.f, 64.f, 100.f
+    };
+
+    std::vector<float> expectedOutputValues
+    {
+        1.f, 0.5f, 0.25f,
+        0.2f, 0.125f, 0.1f
+    };
+
+    auto inputTensor = MakeTensor<T, 3>(inputTensorInfo, ConvertToDataType<ArmnnType>(inputValues, inputTensorInfo));
+
+    LayerTestResult<T, 3> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, 3>(outputTensorInfo,
+                                             ConvertToDataType<ArmnnType>(expectedOutputValues, outputTensorInfo));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::RsqrtQueueDescriptor descriptor;
+
+    armnn::WorkloadInfo info;
+
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateRsqrt(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0][0]);
+
+    workload->PostAllocationConfigure();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&result.output[0][0][0], outputHandle.get());
+
+    return result;
+}
+
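+// Under IEEE 754, sqrt(-0) == -0, so rsqrt(+0) == +inf and rsqrt(-0) == -inf;
+// the expected values below encode exactly that.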
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 2> RsqrtZeroTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorShape inputShape{ 1, 2 };
+    const armnn::TensorShape outputShape{ 1, 2 };
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+    inputTensorInfo.SetQuantizationScale(0.1f);
+
+    armnn::TensorInfo outputTensorInfo(outputShape, ArmnnType);
+    outputTensorInfo.SetQuantizationScale(0.1f);
+
+    std::vector<float> inputValues
+    {
+        0.f, -0.f
+    };
+
+    std::vector<float> expectedOutputValues
+    {
+        INFINITY, -INFINITY
+    };
+
+    return Rsqrt2dTestCommon<ArmnnType>(workloadFactory, memoryManager,
+                                        inputTensorInfo, outputTensorInfo,
+                                        inputValues, expectedOutputValues);
+}
+
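+// The square root of a negative number is undefined over the reals, so rsqrt
+// of a negative input is expected to produce NaN.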
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 2> RsqrtNegativeTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    const armnn::TensorShape inputShape{ 1, 2 };
+    const armnn::TensorShape outputShape{ 1, 2 };
+
+    armnn::TensorInfo inputTensorInfo(inputShape, ArmnnType);
+    inputTensorInfo.SetQuantizationScale(0.1f);
+    inputTensorInfo.SetQuantizationOffset(0);
+
+    armnn::TensorInfo outputTensorInfo(outputShape, ArmnnType);
+    outputTensorInfo.SetQuantizationScale(0.1f);
+    outputTensorInfo.SetQuantizationOffset(0);
+
+    std::vector<float> inputValues
+    {
+        -25.f, -16.f
+    };
+
+    std::vector<float> expectedOutputValues
+    {
+        -NAN, -NAN
+    };
+
+    return Rsqrt2dTestCommon<ArmnnType>(workloadFactory, memoryManager,
+                                        inputTensorInfo, outputTensorInfo,
+                                        inputValues, expectedOutputValues);
+}
+
+//
+// Explicit template specializations
+//
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 2>
+Rsqrt2dTest<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 2>
+Rsqrt2dTest<armnn::DataType::QuantisedAsymm8>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 2>
+Rsqrt2dTest<armnn::DataType::QuantisedSymm16>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 3>
+Rsqrt3dTest<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedAsymm8>, 3>
+Rsqrt3dTest<armnn::DataType::QuantisedAsymm8>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QuantisedSymm16>, 3>
+Rsqrt3dTest<armnn::DataType::QuantisedSymm16>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 2>
+RsqrtZeroTest<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 2>
+RsqrtNegativeTest<armnn::DataType::Float32>(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/RsqrtTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/RsqrtTestImpl.hpp
new file mode 100644
index 0000000..e5a5340
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/RsqrtTestImpl.hpp
@@ -0,0 +1,33 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> Rsqrt2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Rsqrt3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> RsqrtZeroTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> RsqrtNegativeTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/SoftmaxTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/SoftmaxTestImpl.cpp
new file mode 100644
index 0000000..49184ed
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/SoftmaxTestImpl.cpp
@@ -0,0 +1,682 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "SoftmaxTestImpl.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+#include <algorithm>
+#include <cmath>
+
+namespace
+{
+
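+// Expected values are softmax over the eight elements along axis 1 (assuming
+// beta == 1): the denominator is 6*e^0 + e^1 + e^0.5 ~= 10.367, giving
+// e^0 / 10.367 ~= 0.09646, e^1 / 10.367 ~= 0.26221 and e^0.5 / 10.367 ~= 0.15904.
+// Simple4dSoftmaxData below reuses the same values with a rank-4 shape.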
+struct Simple3dSoftmaxOutputData
+{
+    const std::vector<float> outputData =
+    {
+        0.0964599f, 0.26220518f, 0.0964599f, 0.0964599f,
+        0.15903549f, 0.0964599f, 0.0964599f, 0.0964599f
+    };
+
+    const armnn::TensorShape inputShape{ 1, 8, 1 };
+
+    const std::vector<float> inputData =
+    {
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.5f, 0.0f, 0.0f, 0.0f,
+    };
+};
+
+struct Simple4dSoftmaxData
+{
+    const armnn::TensorShape inputShape{ 1, 8, 1, 1 };
+
+    const std::vector<float> outputData =
+    {
+        0.0964599f, 0.26220518f, 0.0964599f, 0.0964599f,
+        0.15903549f, 0.0964599f, 0.0964599f, 0.0964599f
+    };
+
+    const std::vector<float> inputData =
+    {
+         0.0f, 1.0f, 0.0f, 0.0f,
+         0.5f, 0.0f, 0.0f, 0.0f
+    };
+};
+
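+// Generic driver shared by the 2d/3d/4d softmax tests: quantizes the reference
+// data, runs a Softmax workload with the given beta and axis, and returns the
+// actual and expected outputs.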
+template<armnn::DataType ArmnnType, std::size_t n, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, n> SimpleSoftmaxBaseTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float beta,
+    const armnn::TensorShape& inputShape,
+    const std::vector<float>& outputData,
+    const std::vector<float>& inputData,
+    int axis = 1)
+{
+
+    const float qScale = 1.f / 256.f;
+    const int qOffset = 0;
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    inputTensorInfo = armnn::TensorInfo(inputShape, ArmnnType);
+    inputTensorInfo.SetQuantizationScale(qScale);
+    inputTensorInfo.SetQuantizationOffset(qOffset);
+
+    outputTensorInfo = armnn::TensorInfo(inputShape, ArmnnType);
+    outputTensorInfo.SetQuantizationScale(qScale);
+    outputTensorInfo.SetQuantizationOffset(qOffset);
+
+    LayerTestResult<T, n> ret(outputTensorInfo);
+
+    // Each row is independently softmax'd.
+    auto input = MakeTensor<T, n>(inputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, inputData)));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::SoftmaxQueueDescriptor data;
+    data.m_Parameters.m_Beta = beta;
+    data.m_Parameters.m_Axis = axis;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateSoftmax(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), input.origin());
+
+    BOOST_ASSERT(workload);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(ret.output.origin(), outputHandle.get());
+
+    std::vector<T> expectedOutput = std::vector<T>(
+            QuantizedVector<T>(qScale, qOffset, outputData));
+    ret.outputExpected = MakeTensor<T, n>(outputTensorInfo, expectedOutput);
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> SimpleSoftmaxTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float beta)
+{
+    using std::exp;
+    const armnn::TensorShape inputShape{ 2, 4 };
+
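+    // Compute the expected outputs directly. Softmax is shift-invariant, so the
+    // exponents are taken relative to each row's maximum (1.0 and 0.5) for
+    // numerical stability; the result is unchanged.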
+    float x0[4] = { exp((0.f - 1.0f) * beta), exp((1.0f - 1.0f) * beta),
+                    exp((0.0f - 1.0f) * beta), exp((0.0f - 1.0f) * beta) };
+    float sum0 = x0[0] + x0[1] + x0[2] + x0[3];
+    float x1[4] = { exp((0.5f - 0.5f) * beta), exp((0.0f - 0.5f) * beta),
+                    exp((0.0f - 0.5f) * beta), exp((0.0f - 0.5f) * beta) };
+    float sum1 = x1[0] + x1[1] + x1[2] + x1[3];
+
+    const std::vector<float> outputData = { x0[0] / sum0, x0[1] / sum0, x0[2] / sum0, x0[3] / sum0,
+                                            x1[0] / sum1, x1[1] / sum1, x1[2] / sum1, x1[3] / sum1 };
+
+    const std::vector<float> inputData =
+    {
+        0.f, 1.f, 0.f, 0.f,
+        0.5f, 0.f, 0.f, 0.f
+    };
+
+    return SimpleSoftmaxBaseTestImpl<ArmnnType, 2>(workloadFactory, memoryManager, beta,
+                                                   inputShape, outputData, inputData);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> SimpleSoftmaxTestImpl(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta,
+        int axis)
+{
+    armnn::TensorShape inputShape;
+    std::vector<float> inputData;
+    std::vector<float> outputData;
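+    // A negative axis counts back from the last dimension, so for a rank-2
+    // tensor axis -2 aliases axis 0 and axis -1 aliases axis 1; each pair of
+    // aliases shares a case.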
+    switch (axis)
+    {
+    case -2:
+    case 0:
+        {
+        inputShape = {5, 2};
+
+        inputData =
+                {
+                        17.0f, -1.0f, 16.0f, -2.0f, 15.0f, -3.0f, 14.0f, -4.0f, 1.0f, -17.0f
+                };
+
+        outputData =
+                {
+                        0.643914213228014f, 0.643914213228014f, 0.236882800924671f, 0.236882800924671f,
+                        0.087144312427294f,
+                        0.087144312427294f, 0.032058600957022f, 0.032058600957022f, 7.246299848982885e-08f,
+                        7.246299848982885e-08f
+                };
+        break;
+        }
+    case -1:
+    case 1:
+        {
+        inputShape = {2, 5};
+
+        inputData =
+                {
+                        17.0f, 16.0f, 15.0f, 14.0f, 1.0f, -1.0f, -2.0f, -3.0f, -4.0f, -17.0f
+                };
+
+        outputData =
+                {
+                        0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                        7.246299848982885e-08f,
+                        0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                        7.246299848982885e-08f
+                };
+        break;
+        }
+    }
+    return SimpleSoftmaxBaseTestImpl<ArmnnType, 2>(workloadFactory, memoryManager, beta,
+                                                   inputShape, outputData, inputData, axis);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Simple3dSoftmaxTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float beta,
+    const armnn::TensorShape& inputShape,
+    const std::vector<float>& outputData,
+    const std::vector<float>& inputData,
+    int axis = 1)
+{
+    return SimpleSoftmaxBaseTestImpl<ArmnnType, 3>(workloadFactory, memoryManager, beta,
+                                                   inputShape, outputData, inputData, axis);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Simple4dSoftmaxTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float beta,
+    const armnn::TensorShape& inputShape,
+    const std::vector<float>& outputData,
+    const std::vector<float>& inputData,
+    int axis = 1)
+{
+    return SimpleSoftmaxBaseTestImpl<ArmnnType, 4>(workloadFactory, memoryManager, beta,
+                                                   inputShape, outputData, inputData, axis);
+}
+
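+// Runs the same softmax workload on the factory under test and on a reference
+// factory over identical random input, so callers can compare the two outputs.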
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> CompareSoftmaxTestImpl(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        armnn::IWorkloadFactory& refWorkloadFactory,
+        float beta)
+{
+    const int batchSize = 20;
+    const int channels = 30;
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = { batchSize, channels };
+
+    inputTensorInfo = armnn::TensorInfo(2, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(2, inputShape, ArmnnType);
+    float qScale = 1.f / 256.f;
+    int qOffset = 0;
+    inputTensorInfo.SetQuantizationScale(qScale);
+    inputTensorInfo.SetQuantizationOffset(qOffset);
+    outputTensorInfo.SetQuantizationScale(qScale);
+    outputTensorInfo.SetQuantizationOffset(qOffset);
+
+    LayerTestResult<T, 2> ret(outputTensorInfo);
+    auto input = MakeRandomTensor<T, 2>(inputTensorInfo, 0xF00D, 0.0f, 1.0f);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::SoftmaxQueueDescriptor data;
+    data.m_Parameters.m_Beta = beta;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandleRef = refWorkloadFactory.CreateTensorHandle(outputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> inputHandleRef = refWorkloadFactory.CreateTensorHandle(inputTensorInfo);
+
+    armnn::SoftmaxQueueDescriptor refData = data;
+    armnn::WorkloadInfo refInfo = info;
+    SetWorkloadInput(refData, refInfo, 0, inputTensorInfo, inputHandleRef.get());
+    SetWorkloadOutput(refData, refInfo, 0, outputTensorInfo, outputHandleRef.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateSoftmax(data, info);
+    std::unique_ptr<armnn::IWorkload> workloadRef = refWorkloadFactory.CreateSoftmax(refData, refInfo);
+
+    outputHandleRef->Allocate();
+    inputHandleRef->Allocate();
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0]);
+    CopyDataToITensorHandle(inputHandleRef.get(), &input[0][0]);
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    workloadRef->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get());
+    CopyDataFromITensorHandle(&ret.outputExpected[0][0], outputHandleRef.get());
+
+    return ret;
+}
+
+} // anonymous namespace
+
+LayerTestResult<float,2> SimpleSoftmaxTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float beta)
+{
+    return SimpleSoftmaxTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, beta);
+}
+
+LayerTestResult<float,2> SimpleAxisSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta,
+        int axis)
+{
+    return SimpleSoftmaxTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, beta, axis);
+}
+
+LayerTestResult<float,3> Simple3dSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    Simple3dSoftmaxOutputData data;
+    return Simple3dSoftmaxTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, beta,
+                                                             data.inputShape, data.outputData, data.inputData);
+}
+
+LayerTestResult<float,3> Simple3dAxisSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta,
+        int axis)
+{
+    armnn::TensorShape inputShape;
+    std::vector<float> inputData;
+    std::vector<float> outputData;
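+    // As in the rank-2 case, negative axes alias positive ones: for rank 3,
+    // axes -3/-2/-1 map to 0/1/2. The rank-4 variant below follows the same pattern.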
+    switch (axis)
+    {
+    case -3:
+    case 0:
+        {
+            inputShape = {5, 2, 2};
+
+            inputData =
+                    {
+                            17.0f, -1.0f, 17.0f, -1.0f, 16.0f, -2.0f, 16.0f, -2.0f, 15.0f, -3.0f,
+
+                            15.0f, -3.0f, 14.0f, -4.0f, 14.0f, -4.0f, 1.0f, -17.0f, 1.0f, -17.0f
+                    };
+
+            outputData =
+                    {
+                            0.643914213228014f, 0.643914213228014f, 0.643914213228014f, 0.643914213228014f,
+                            0.236882800924671f,
+                            0.236882800924671f, 0.236882800924671f, 0.236882800924671f, 0.087144312427294f,
+                            0.087144312427294f,
+
+                            0.087144312427294f, 0.087144312427294f, 0.032058600957022f, 0.032058600957022f,
+                            0.032058600957022f,
+                            0.032058600957022f, 7.246299848982885e-08f, 7.246299848982885e-08f, 7.246299848982885e-08f,
+                            7.246299848982885e-08f
+                    };
+            break;
+        }
+    case -2:
+    case 1:
+        {
+            inputShape = {2, 5, 2};
+
+            inputData =
+                    {
+                            17.0f, -1.0f, 16.0f, -2.0f, 15.0f, -3.0f, 14.0f, -4.0f, 1.0f, -17.0f,
+
+                            17.0f, -1.0f, 16.0f, -2.0f, 15.0f, -3.0f, 14.0f, -4.0f, 1.0f, -17.0f
+                    };
+
+            outputData =
+                    {
+                            0.643914213228014f, 0.643914213228014f, 0.236882800924671f, 0.236882800924671f,
+                            0.087144312427294f,
+                            0.087144312427294f, 0.032058600957022f, 0.032058600957022f, 7.246299848982885e-08f,
+                            7.246299848982885e-08f,
+
+                            0.643914213228014f, 0.643914213228014f, 0.236882800924671f, 0.236882800924671f,
+                            0.087144312427294f,
+                            0.087144312427294f, 0.032058600957022f, 0.032058600957022f, 7.246299848982885e-08f,
+                            7.246299848982885e-08f
+                    };
+        break;
+        }
+    case -1:
+    case 2:
+        {
+            inputShape = {2, 2, 5};
+
+            inputData =
+                    {
+                            17.0f, 16.0f, 15.0f, 14.0f, 1.0f, -1.0f, -2.0f, -3.0f, -4.0f, -17.0f,
+                            17.0f, 16.0f, 15.0f, 14.0f, 1.0f, -1.0f, -2.0f, -3.0f, -4.0f, -17.0f
+                    };
+
+            outputData =
+                    {
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f,
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f,
+
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f,
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f
+                    };
+            break;
+        }
+    }
+
+    return Simple3dSoftmaxTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, beta,
+                                                             inputShape, outputData, inputData, axis);
+}
+
+LayerTestResult<float,4> Simple4dSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    Simple4dSoftmaxData data;
+    return Simple4dSoftmaxTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, beta, data.inputShape,
+                                                             data.outputData, data.inputData);
+}
+
+LayerTestResult<float,4> Simple4dAxisSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta,
+        int axis)
+{
+    armnn::TensorShape inputShape;
+    std::vector<float> inputData;
+    std::vector<float> outputData;
+    switch (axis)
+    {
+    case -4:
+    case 0:
+        {
+            inputShape = {5, 2, 2, 2};
+
+            inputData =
+                    {
+                            17.0f, -1.0f, 17.0f, -1.0f, 17.0f, -1.0f, 17.0f, -1.0f, 16.0f, -2.0f,
+                            16.0f, -2.0f, 16.0f, -2.0f, 16.0f, -2.0f, 15.0f, -3.0f, 15.0f, -3.0f,
+                            15.0f, -3.0f, 15.0f, -3.0f, 14.0f, -4.0f, 14.0f, -4.0f, 14.0f, -4.0f,
+                            14.0f, -4.0f, 1.0f, -17.0f, 1.0f, -17.0f, 1.0f, -17.0f, 1.0f, -17.0f
+                    };
+
+            outputData =
+                    {
+                            0.643914213228014f, 0.643914213228014f, 0.643914213228014f, 0.643914213228014f,
+                            0.643914213228014f,
+                            0.643914213228014f, 0.643914213228014f, 0.643914213228014f, 0.236882800924671f,
+                            0.236882800924671f,
+                            0.236882800924671f, 0.236882800924671f, 0.236882800924671f, 0.236882800924671f,
+                            0.236882800924671f,
+                            0.236882800924671f, 0.087144312427294f, 0.087144312427294f, 0.087144312427294f,
+                            0.087144312427294f,
+
+                            0.087144312427294f, 0.087144312427294f, 0.087144312427294f, 0.087144312427294f,
+                            0.032058600957022f,
+                            0.032058600957022f, 0.032058600957022f, 0.032058600957022f, 0.032058600957022f,
+                            0.032058600957022f,
+                            0.032058600957022f, 0.032058600957022f, 7.246299848982885e-08f, 7.246299848982885e-08f,
+                            7.246299848982885e-08f,
+                            7.246299848982885e-08f, 7.246299848982885e-08f, 7.246299848982885e-08f,
+                            7.246299848982885e-08f, 7.246299848982885e-08f
+                    };
+            break;
+        }
+    case -3:
+    case 1:
+        {
+            inputShape = {2, 5, 2, 2};
+
+            inputData =
+                    {
+                            17.0f, -1.0f, 17.0f, -1.0f, 16.0f, -2.0f, 16.0f, -2.0f, 15.0f, -3.0f,
+                            15.0f, -3.0f, 14.0f, -4.0f, 14.0f, -4.0f, 1.0f, -17.0f, 1.0f, -17.0f,
+                            17.0f, -1.0f, 17.0f, -1.0f, 16.0f, -2.0f, 16.0f, -2.0f, 15.0f, -3.0f,
+                            15.0f, -3.0f, 14.0f, -4.0f, 14.0f, -4.0f, 1.0f, -17.0f, 1.0f, -17.0f
+                    };
+
+            outputData =
+                    {
+                            0.643914213228014f, 0.643914213228014f, 0.643914213228014f, 0.643914213228014f,
+                            0.236882800924671f,
+                            0.236882800924671f, 0.236882800924671f, 0.236882800924671f, 0.087144312427294f,
+                            0.087144312427294f,
+                            0.087144312427294f, 0.087144312427294f, 0.032058600957022f, 0.032058600957022f,
+                            0.032058600957022f,
+                            0.032058600957022f, 7.246299848982885e-08f, 7.246299848982885e-08f, 7.246299848982885e-08f,
+                            7.246299848982885e-08f,
+
+
+                            0.643914213228014f, 0.643914213228014f, 0.643914213228014f, 0.643914213228014f,
+                            0.236882800924671f,
+                            0.236882800924671f, 0.236882800924671f, 0.236882800924671f, 0.087144312427294f,
+                            0.087144312427294f,
+                            0.087144312427294f, 0.087144312427294f, 0.032058600957022f, 0.032058600957022f,
+                            0.032058600957022f,
+                            0.032058600957022f, 7.246299848982885e-08f, 7.246299848982885e-08f, 7.246299848982885e-08f,
+                            7.246299848982885e-08f
+                    };
+            break;
+        }
+    case -2:
+    case 2:
+        {
+        inputShape = {2, 2, 5, 2};
+
+        inputData =
+                {
+                        17.0f, -1.0f, 16.0f, -2.0f, 15.0f, -3.0f, 14.0f, -4.0f, 1.0f, -17.0f,
+                        17.0f, -1.0f, 16.0f, -2.0f, 15.0f, -3.0f, 14.0f, -4.0f, 1.0f, -17.0f,
+                        17.0f, -1.0f, 16.0f, -2.0f, 15.0f, -3.0f, 14.0f, -4.0f, 1.0f, -17.0f,
+                        17.0f, -1.0f, 16.0f, -2.0f, 15.0f, -3.0f, 14.0f, -4.0f, 1.0f, -17.0f
+                };
+
+        outputData =
+                {
+                        0.643914213228014f, 0.643914213228014f, 0.236882800924671f, 0.236882800924671f,
+                        0.087144312427294f,
+                        0.087144312427294f, 0.032058600957022f, 0.032058600957022f, 7.246299848982885e-08f,
+                        7.246299848982885e-08f,
+                        0.643914213228014f, 0.643914213228014f, 0.236882800924671f, 0.236882800924671f,
+                        0.087144312427294f,
+                        0.087144312427294f, 0.032058600957022f, 0.032058600957022f, 7.246299848982885e-08f,
+                        7.246299848982885e-08f,
+
+                        0.643914213228014f, 0.643914213228014f, 0.236882800924671f, 0.236882800924671f,
+                        0.087144312427294f,
+                        0.087144312427294f, 0.032058600957022f, 0.032058600957022f, 7.246299848982885e-08f,
+                        7.246299848982885e-08f,
+                        0.643914213228014f, 0.643914213228014f, 0.236882800924671f, 0.236882800924671f,
+                        0.087144312427294f,
+                        0.087144312427294f, 0.032058600957022f, 0.032058600957022f, 7.246299848982885e-08f,
+                        7.246299848982885e-08f
+                };
+        break;
+        }
+    case -1:
+    case 3:
+        {
+            inputShape = {2, 2, 2, 5};
+
+            inputData =
+                    {
+                            17.0f, 16.0f, 15.0f, 14.0f, 1.0f, -1.0f, -2.0f, -3.0f, -4.0f, -17.0f,
+                            17.0f, 16.0f, 15.0f, 14.0f, 1.0f, -1.0f, -2.0f, -3.0f, -4.0f, -17.0f,
+                            17.0f, 16.0f, 15.0f, 14.0f, 1.0f, -1.0f, -2.0f, -3.0f, -4.0f, -17.0f,
+                            17.0f, 16.0f, 15.0f, 14.0f, 1.0f, -1.0f, -2.0f, -3.0f, -4.0f, -17.0f
+                    };
+
+            outputData =
+                    {
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f,
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f,
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f,
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f,
+
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f,
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f,
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f,
+                            0.643914213228014f, 0.236882800924671f, 0.087144312427294f, 0.032058600957022f,
+                            7.246299848982885e-08f
+                    };
+            break;
+        }
+    }
+
+    return Simple4dSoftmaxTestImpl<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        beta,
+        inputShape,
+        outputData,
+        inputData,
+        axis);
+}
+
+LayerTestResult<uint8_t,2> SimpleSoftmaxUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float beta)
+{
+    return SimpleSoftmaxTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, beta);
+}
+
+LayerTestResult<uint8_t,3> Simple3dSoftmaxUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    Simple3dSoftmaxOutputData data;
+    return Simple3dSoftmaxTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory,
+        memoryManager,
+        beta,
+        data.inputShape,
+        data.outputData,
+        data.inputData);
+}
+
+LayerTestResult<uint8_t,4> Simple4dSoftmaxUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    Simple4dSoftmaxData data;
+
+    return Simple4dSoftmaxTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, beta,
+                                                                     data.inputShape, data.outputData, data.inputData);
+}
+
+LayerTestResult<int16_t,2> SimpleSoftmaxUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    return SimpleSoftmaxTestImpl<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, beta);
+}
+
+LayerTestResult<int16_t,3> Simple3dSoftmaxUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    Simple3dSoftmaxOutputData data;
+    return Simple3dSoftmaxTestImpl<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, beta,
+                                                                     data.inputShape, data.outputData, data.inputData);
+}
+
+LayerTestResult<int16_t,4> Simple4dSoftmaxUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta)
+{
+    Simple4dSoftmaxData data;
+
+    return Simple4dSoftmaxTestImpl<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, beta,
+                                                                     data.inputShape, data.outputData, data.inputData);
+}
+
+LayerTestResult<float,2> CompareSoftmaxTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    float beta)
+{
+    return CompareSoftmaxTestImpl<armnn::DataType::Float32>(
+        workloadFactory, memoryManager, refWorkloadFactory, beta);
+}
+
+LayerTestResult<uint8_t,2> CompareSoftmaxUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    float beta)
+{
+    return CompareSoftmaxTestImpl<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory, memoryManager, refWorkloadFactory, beta);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/SoftmaxTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/SoftmaxTestImpl.hpp
new file mode 100644
index 0000000..96f5fb9
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/SoftmaxTestImpl.hpp
@@ -0,0 +1,86 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 2> SimpleSoftmaxTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float beta);
+
+LayerTestResult<float, 2> SimpleAxisSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta,
+        int axis);
+
+LayerTestResult<float, 3> Simple3dSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
+LayerTestResult<float, 3> Simple3dAxisSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta,
+        int axis);
+
+LayerTestResult<float, 4> Simple4dSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
+LayerTestResult<float, 4> Simple4dAxisSoftmaxTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta,
+        int axis);
+
+LayerTestResult<uint8_t, 2> SimpleSoftmaxUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float beta);
+
+LayerTestResult<uint8_t,3> Simple3dSoftmaxUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
+LayerTestResult<uint8_t,4> Simple4dSoftmaxUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
+LayerTestResult<int16_t,2> SimpleSoftmaxUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
+LayerTestResult<int16_t,3> Simple3dSoftmaxUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
+LayerTestResult<int16_t,4> Simple4dSoftmaxUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        float beta);
+
+LayerTestResult<float, 2> CompareSoftmaxTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    float beta);
+
+LayerTestResult<uint8_t, 2> CompareSoftmaxUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::IWorkloadFactory& refWorkloadFactory,
+    float beta);
diff --git a/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.cpp
new file mode 100644
index 0000000..152ce2c
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.cpp
@@ -0,0 +1,441 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "SpaceToBatchNdTestImpl.hpp"
+
+#include <Permute.hpp>
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
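+// Shared helper: optionally permutes the NCHW reference data to NHWC, quantizes
+// it, runs a SpaceToBatchNd workload, and returns the actual and expected outputs.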
+template<typename T>
+LayerTestResult<T, 4> SpaceToBatchNdTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::TensorInfo& inputTensorInfo,
+    armnn::TensorInfo& outputTensorInfo,
+    std::vector<float>& inputData,
+    std::vector<float>& outputExpectedData,
+    armnn::SpaceToBatchNdQueueDescriptor descriptor,
+    const float qScale = 1.0f,
+    const int32_t qOffset = 0)
+{
+    const armnn::PermutationVector NCHWToNHWC = {0, 3, 1, 2};
+    if (descriptor.m_Parameters.m_DataLayout == armnn::DataLayout::NHWC)
+    {
+        inputTensorInfo = armnnUtils::Permuted(inputTensorInfo, NCHWToNHWC);
+        outputTensorInfo = armnnUtils::Permuted(outputTensorInfo, NCHWToNHWC);
+
+        std::vector<float> inputTmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC,
+                            inputData.data(), inputTmp.data(), sizeof(float));
+        inputData = inputTmp;
+
+        std::vector<float> outputTmp(outputExpectedData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC,
+                            outputExpectedData.data(), outputTmp.data(), sizeof(float));
+        outputExpectedData = outputTmp;
+    }
+
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData));
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, outputExpectedData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateSpaceToBatchNd(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SpaceToBatchNdSimpleTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout dataLayout = armnn::DataLayout::NCHW)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
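+    // With block shape {2, 2} the batch grows by a factor of 2*2 = 4 while each
+    // spatial dimension shrinks by its block factor: {1, 1, 2, 2} -> {4, 1, 1, 1}.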
+    unsigned int inputShape[] = {1, 1, 2, 2};
+    unsigned int outputShape[] = {4, 1, 1, 1};
+
+    armnn::SpaceToBatchNdQueueDescriptor desc;
+    desc.m_Parameters.m_DataLayout = dataLayout;
+    desc.m_Parameters.m_BlockShape = {2, 2};
+    desc.m_Parameters.m_PadList = {{0, 0}, {0, 0}};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f
+    });
+
+    return SpaceToBatchNdTestImpl<T>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SpaceToBatchNdMultiChannelsTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout dataLayout = armnn::DataLayout::NCHW)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = {1, 3, 2, 2};
+    unsigned int outputShape[] = {4, 3, 1, 1};
+
+    armnn::SpaceToBatchNdQueueDescriptor desc;
+    desc.m_Parameters.m_DataLayout = dataLayout;
+    desc.m_Parameters.m_BlockShape = {2, 2};
+    desc.m_Parameters.m_PadList = {{0, 0}, {0, 0}};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 4.0f, 7.0f, 10.0f,
+        2.0f, 5.0f, 8.0f, 11.0f,
+        3.0f, 6.0f, 9.0f, 12.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f,
+        4.0f, 5.0f, 6.0f,
+        7.0f, 8.0f, 9.0f,
+        10.0f, 11.0f, 12.0f
+    });
+
+    return SpaceToBatchNdTestImpl<T>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SpaceToBatchNdMultiBlockTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout dataLayout = armnn::DataLayout::NCHW)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = {1, 1, 4, 4};
+    unsigned int outputShape[] = {4, 1, 2, 2};
+
+    armnn::SpaceToBatchNdQueueDescriptor desc;
+    desc.m_Parameters.m_DataLayout = dataLayout;
+    desc.m_Parameters.m_BlockShape = {2, 2};
+    desc.m_Parameters.m_PadList = {{0, 0}, {0, 0}};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f,
+        5.0f, 6.0f, 7.0f, 8.0f,
+        9.0f, 10.0f, 11.0f, 12.0f,
+        13.0f, 14.0f, 15.0f, 16.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 3.0f, 9.0f, 11.0f,
+        2.0f, 4.0f, 10.0f, 12.0f,
+        5.0f, 7.0f, 13.0f, 15.0f,
+        6.0f, 8.0f, 14.0f, 16.0f
+    });
+
+    return SpaceToBatchNdTestImpl<T>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SpaceToBatchNdPaddingTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout dataLayout = armnn::DataLayout::NCHW)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
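+    // The width is first padded by {2, 0} to 6, then the block is applied:
+    // batch 2 * (2*2) = 8, height 2/2 = 1, width 6/2 = 3 -> {8, 1, 1, 3}.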
+    unsigned int inputShape[] = {2, 1, 2, 4};
+    unsigned int outputShape[] = {8, 1, 1, 3};
+
+    armnn::SpaceToBatchNdQueueDescriptor desc;
+    desc.m_Parameters.m_DataLayout = dataLayout;
+    desc.m_Parameters.m_BlockShape = {2, 2};
+    desc.m_Parameters.m_PadList = {{0, 0}, {2, 0}};
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f,
+        5.0f, 6.0f, 7.0f, 8.0f,
+        9.0f, 10.0f, 11.0f, 12.0f,
+        13.0f, 14.0f, 15.0f, 16.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        0.0f, 1.0f, 3.0f,
+        0.0f, 9.0f, 11.0f,
+        0.0f, 2.0f, 4.0f,
+        0.0f, 10.0f, 12.0f,
+        0.0f, 5.0f, 7.0f,
+        0.0f, 13.0f, 15.0f,
+        0.0f, 6.0f, 8.0f,
+        0.0f, 14.0f, 16.0f
+    });
+
+    return SpaceToBatchNdTestImpl<T>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SpaceToBatchNdSimpleNhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdSimpleTest<ArmnnType>(workloadFactory, memoryManager, armnn::DataLayout::NHWC);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SpaceToBatchNdMultiChannelsNhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiChannelsTest<ArmnnType>(workloadFactory, memoryManager, armnn::DataLayout::NHWC);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SpaceToBatchNdMultiBlockNhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiBlockTest<ArmnnType>(workloadFactory, memoryManager, armnn::DataLayout::NHWC);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SpaceToBatchNdPaddingNhwcTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdPaddingTest<ArmnnType>(workloadFactory, memoryManager, armnn::DataLayout::NHWC);
+}
+
+} // anonymous namespace
+
+LayerTestResult<float, 4> SpaceToBatchNdSimpleFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdSimpleTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiChannelsFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiChannelsTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiBlockFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiBlockTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdPaddingFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdPaddingTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdSimpleUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdSimpleTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiChannelsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiChannelsTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiBlockUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiBlockTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdPaddingUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdPaddingTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdSimpleNhwcFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdSimpleNhwcTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiChannelsNhwcFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiChannelsNhwcTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiBlockNhwcFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiBlockNhwcTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdPaddingNhwcFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdPaddingNhwcTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdSimpleNhwcUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdSimpleNhwcTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiChannelsNhwcUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiChannelsNhwcTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiBlockNhwcUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiBlockNhwcTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdPaddingNhwcUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdPaddingNhwcTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdSimpleUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdSimpleTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdMultiChannelsUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiChannelsTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdMultiBlockUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiBlockTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdPaddingUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdPaddingTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdSimpleNhwcUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdSimpleNhwcTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdMultiChannelsNhwcUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiChannelsNhwcTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdMultiBlockNhwcUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdMultiBlockNhwcTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdPaddingNhwcUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToBatchNdPaddingNhwcTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.hpp
new file mode 100644
index 0000000..0af99c5
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.hpp
@@ -0,0 +1,106 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 4> SpaceToBatchNdSimpleFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiChannelsFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiBlockFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToBatchNdPaddingFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdSimpleUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiChannelsUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiBlockUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdPaddingUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToBatchNdSimpleNhwcFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiChannelsNhwcFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiBlockNhwcFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToBatchNdPaddingNhwcFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdSimpleNhwcUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiChannelsNhwcUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiBlockNhwcUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdPaddingNhwcUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdSimpleUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdMultiChannelsUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdMultiBlockUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdPaddingUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdSimpleNhwcUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdMultiChannelsNhwcUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdMultiBlockNhwcUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SpaceToBatchNdPaddingNhwcUint16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/SpaceToDepthTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/SpaceToDepthTestImpl.cpp
new file mode 100644
index 0000000..92dfd97
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/SpaceToDepthTestImpl.cpp
@@ -0,0 +1,227 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "SpaceToDepthTestImpl.hpp"
+
+#include <Permute.hpp>
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+template<typename T>
+LayerTestResult<T, 4> SpaceToDepthTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::TensorInfo& inputTensorInfo,
+    armnn::TensorInfo& outputTensorInfo,
+    std::vector<float>& inputData,
+    std::vector<float>& outputExpectedData,
+    armnn::SpaceToDepthQueueDescriptor descriptor,
+    const float qScale = 1.0f,
+    const int32_t qOffset = 0)
+{
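+    // The reference data passed in by the callers is laid out in NHWC; when the test runs
+    // in NCHW, both the tensor infos and the input/expected data are permuted to match
+    // before the workload is created.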
+    const armnn::PermutationVector NHWCToNCHW = {0, 2, 3, 1};
+
+    if (descriptor.m_Parameters.m_DataLayout == armnn::DataLayout::NCHW)
+    {
+        inputTensorInfo = armnnUtils::Permuted(inputTensorInfo, NHWCToNCHW);
+        outputTensorInfo = armnnUtils::Permuted(outputTensorInfo, NHWCToNCHW);
+
+        std::vector<float> inputTmp(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NHWCToNCHW,
+            inputData.data(), inputTmp.data(), sizeof(float));
+        inputData = inputTmp;
+
+        std::vector<float> outputTmp(outputExpectedData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NHWCToNCHW,
+            outputExpectedData.data(), outputTmp.data(), sizeof(float));
+        outputExpectedData = outputTmp;
+    }
+
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData));
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, outputExpectedData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateSpaceToDepth(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SpaceToDepthSimpleTest1(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout dataLayout = armnn::DataLayout::NHWC)
+{
+    unsigned int inputShape[] = {1, 2, 2, 1};
+    unsigned int outputShape[] = {1, 1, 1, 4};
+
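+    // With a block size of 2, SpaceToDepth moves each 2x2 spatial block into the channel
+    // dimension: NHWC {1, 2, 2, 1} -> {1, 1, 1, 4}. With a single input channel the element
+    // order is unchanged, so the expected output equals the input.
+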
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f
+    });
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    armnn::SpaceToDepthQueueDescriptor desc;
+    desc.m_Parameters.m_DataLayout = dataLayout;
+    desc.m_Parameters.m_BlockSize = 2;
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    return SpaceToDepthTestImpl<T>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SpaceToDepthSimpleTest2(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::DataLayout dataLayout = armnn::DataLayout::NHWC)
+{
+    unsigned int inputShape[]  = {1, 2, 2, 2};
+    unsigned int outputShape[] = {1, 1, 1, 8};
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.4f, 2.3f, 3.2f, 4.1f, 5.4f, 6.3f, 7.2f, 8.1f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.4f, 2.3f, 3.2f, 4.1f, 5.4f, 6.3f, 7.2f, 8.1f
+    });
+
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    armnn::SpaceToDepthQueueDescriptor desc;
+    desc.m_Parameters.m_DataLayout = dataLayout;
+    desc.m_Parameters.m_BlockSize = 2;
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    return SpaceToDepthTestImpl<T>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+} // anonymous namespace
+
+LayerTestResult<uint8_t, 4> SpaceToDepthNhwcAsymmQ8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToDepthSimpleTest1<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory,
+        memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToDepthNchwAsymmQ8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToDepthSimpleTest1<armnn::DataType::QuantisedAsymm8>(
+        workloadFactory,
+        memoryManager,
+        armnn::DataLayout::NCHW);
+}
+
+LayerTestResult<float, 4> SpaceToDepthNhwcFloat32Test1(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToDepthSimpleTest1<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager);
+}
+
+LayerTestResult<float, 4> SpaceToDepthNchwFloat32Test1(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToDepthSimpleTest1<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        armnn::DataLayout::NCHW);
+}
+
+LayerTestResult<float, 4> SpaceToDepthNhwcFloat32Test2(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToDepthSimpleTest2<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager);
+}
+
+LayerTestResult<float, 4> SpaceToDepthNchwFloat32Test2(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToDepthSimpleTest2<armnn::DataType::Float32>(
+        workloadFactory,
+        memoryManager,
+        armnn::DataLayout::NCHW);
+}
+
+LayerTestResult<int16_t, 4> SpaceToDepthNhwcQSymm16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToDepthSimpleTest2<armnn::DataType::QuantisedSymm16>(
+        workloadFactory,
+        memoryManager);
+}
+
+LayerTestResult<int16_t, 4> SpaceToDepthNchwQSymm16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SpaceToDepthSimpleTest2<armnn::DataType::QuantisedSymm16>(
+        workloadFactory,
+        memoryManager,
+        armnn::DataLayout::NCHW);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/SpaceToDepthTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/SpaceToDepthTestImpl.hpp
new file mode 100644
index 0000000..ef86829
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/SpaceToDepthTestImpl.hpp
@@ -0,0 +1,42 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<uint8_t, 4> SpaceToDepthNchwAsymmQ8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> SpaceToDepthNhwcAsymmQ8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToDepthNhwcFloat32Test1(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToDepthNchwFloat32Test1(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToDepthNhwcFloat32Test2(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> SpaceToDepthNchwFloat32Test2(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SpaceToDepthNhwcQSymm16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> SpaceToDepthNchwQSymm16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/SplitterTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/SplitterTestImpl.cpp
new file mode 100644
index 0000000..0278bbe
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/SplitterTestImpl.cpp
@@ -0,0 +1,357 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "SplitterTestImpl.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+std::vector<LayerTestResult<T,3>> SplitterTestCommon(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale = 0.0f,
+    int32_t qOffset = 0)
+{
+    unsigned int inputWidth = 5;
+    unsigned int inputHeight = 6;
+    unsigned int inputChannels = 3;
+
+    // NOTE: Compute Library imposes a restriction that the x and y dimensions (input height and width)
+    //       cannot be split.
+    //       For the reasons behind this, see the first comment on https://jira.arm.com/browse/IVGCVSW-1239
+    //
+    // This test has therefore been recast to split the channels, then split the resulting subtensor.
+
+    // Dimensions used to take channel 0 of the original input,
+    // and channels 0 and 1 of the split subtensor.
+    unsigned int outputWidth1 = inputWidth;
+    unsigned int outputHeight1 = inputHeight;
+    unsigned int outputChannels1 = 1;
+
+    // Dimensions used to take channels 1 and 2 of the original input.
+    unsigned int outputWidth2 = inputWidth;
+    unsigned int outputHeight2 = inputHeight;
+    unsigned int outputChannels2 = 2;
+
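+    // So the {3, 6, 5} input is first split into a {1, 6, 5} and a {2, 6, 5} output,
+    // and the {2, 6, 5} subtensor is then split again into two {1, 6, 5} outputs.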
+
+    // Define the tensor descriptors.
+    armnn::TensorInfo inputTensorInfo({ inputChannels, inputHeight, inputWidth }, ArmnnType, qScale, qOffset);
+
+    // Outputs of the original split.
+    armnn::TensorInfo outputTensorInfo1({ outputChannels1, outputHeight1, outputWidth1 }, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputTensorInfo2({ outputChannels2, outputHeight2, outputWidth2 }, ArmnnType, qScale, qOffset);
+
+    // Outputs of the subsequent subtensor split.
+    armnn::TensorInfo outputTensorInfo3({ outputChannels1, outputHeight1, outputWidth1 }, ArmnnType, qScale, qOffset);
+    armnn::TensorInfo outputTensorInfo4({ outputChannels1, outputHeight1, outputWidth1 }, ArmnnType, qScale, qOffset);
+
+    // Set quantization parameters if the requested type is a quantized type.
+    // The quantization doesn't really matter as the splitter operator doesn't dequantize/quantize.
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo1.SetQuantizationScale(qScale);
+        outputTensorInfo1.SetQuantizationOffset(qOffset);
+        outputTensorInfo2.SetQuantizationScale(qScale);
+        outputTensorInfo2.SetQuantizationOffset(qOffset);
+        outputTensorInfo3.SetQuantizationScale(qScale);
+        outputTensorInfo3.SetQuantizationOffset(qOffset);
+        outputTensorInfo4.SetQuantizationScale(qScale);
+        outputTensorInfo4.SetQuantizationOffset(qOffset);
+    }
+
+    LayerTestResult<T,3> ret1(outputTensorInfo1);
+    LayerTestResult<T,3> ret2(outputTensorInfo2);
+    LayerTestResult<T,3> ret3(outputTensorInfo3);
+    LayerTestResult<T,3> ret4(outputTensorInfo4);
+
+    auto input = MakeTensor<T, 3>(inputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+            6.0f, 7.0f, 8.0f, 9.0f, 10.0f,
+            11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
+            16.0f, 17.0f, 18.0f, 19.0f, 20.0f,
+            21.0f, 22.0f, 23.0f, 24.0f, 25.0f,
+            26.0f, 27.0f, 28.0f, 29.0f, 30.0f,
+
+            31.0f, 32.0f, 33.0f, 34.0f, 35.0f,
+            36.0f, 37.0f, 38.0f, 39.0f, 40.0f,
+            41.0f, 42.0f, 43.0f, 44.0f, 45.0f,
+            46.0f, 47.0f, 48.0f, 49.0f, 50.0f,
+            51.0f, 52.0f, 53.0f, 54.0f, 55.0f,
+            56.0f, 57.0f, 58.0f, 59.0f, 60.0f,
+
+            61.0f, 62.0f, 63.0f, 64.0f, 65.0f,
+            66.0f, 67.0f, 68.0f, 69.0f, 70.0f,
+            71.0f, 72.0f, 73.0f, 74.0f, 75.0f,
+            76.0f, 77.0f, 78.0f, 79.0f, 80.0f,
+            81.0f, 82.0f, 83.0f, 84.0f, 85.0f,
+            86.0f, 87.0f, 88.0f, 89.0f, 90.0f,
+        })
+    ));
+
+    // Channel 0 of the original input.
+    ret1.outputExpected = MakeTensor<T, 3>(outputTensorInfo1, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+            6.0f, 7.0f, 8.0f, 9.0f, 10.0f,
+            11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
+            16.0f, 17.0f, 18.0f, 19.0f, 20.0f,
+            21.0f, 22.0f, 23.0f, 24.0f, 25.0f,
+            26.0f, 27.0f, 28.0f, 29.0f, 30.0f,
+        })
+    ));
+
+    // Channels 1 & 2 of the original input.
+    ret2.outputExpected = MakeTensor<T, 3>(outputTensorInfo2, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            31.0f, 32.0f, 33.0f, 34.0f, 35.0f,
+            36.0f, 37.0f, 38.0f, 39.0f, 40.0f,
+            41.0f, 42.0f, 43.0f, 44.0f, 45.0f,
+            46.0f, 47.0f, 48.0f, 49.0f, 50.0f,
+            51.0f, 52.0f, 53.0f, 54.0f, 55.0f,
+            56.0f, 57.0f, 58.0f, 59.0f, 60.0f,
+
+            61.0f, 62.0f, 63.0f, 64.0f, 65.0f,
+            66.0f, 67.0f, 68.0f, 69.0f, 70.0f,
+            71.0f, 72.0f, 73.0f, 74.0f, 75.0f,
+            76.0f, 77.0f, 78.0f, 79.0f, 80.0f,
+            81.0f, 82.0f, 83.0f, 84.0f, 85.0f,
+            86.0f, 87.0f, 88.0f, 89.0f, 90.0f,
+        })
+    ));
+
+    // Channel 0 of return 2 (i.e. channel 1 of the original input).
+    ret3.outputExpected = MakeTensor<T, 3>(outputTensorInfo3, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            31.0f, 32.0f, 33.0f, 34.0f, 35.0f,
+            36.0f, 37.0f, 38.0f, 39.0f, 40.0f,
+            41.0f, 42.0f, 43.0f, 44.0f, 45.0f,
+            46.0f, 47.0f, 48.0f, 49.0f, 50.0f,
+            51.0f, 52.0f, 53.0f, 54.0f, 55.0f,
+            56.0f, 57.0f, 58.0f, 59.0f, 60.0f,
+        })
+    ));
+
+    // Channel 1 of return 2.
+    ret4.outputExpected = MakeTensor<T, 3>(outputTensorInfo4, std::vector<T>(
+        QuantizedVector<T>(qScale, qOffset, {
+            61.0f, 62.0f, 63.0f, 64.0f, 65.0f,
+            66.0f, 67.0f, 68.0f, 69.0f, 70.0f,
+            71.0f, 72.0f, 73.0f, 74.0f, 75.0f,
+            76.0f, 77.0f, 78.0f, 79.0f, 80.0f,
+            81.0f, 82.0f, 83.0f, 84.0f, 85.0f,
+            86.0f, 87.0f, 88.0f, 89.0f, 90.0f,
+        })
+    ));
+
+    // NOTE: As a corollary of the x/y splitting restriction, the x and y values of the view origins
+    //       have to be zero. The coordinates are as per the tensor info above: channels, height/y, width/x.
+    //       Note that under the hood the compute engine reverses these, i.e. its coordinate system
+    //       is x, y, channels.
+    std::vector<unsigned int> wOrigin1 = {0, 0, 0}; // Extent of the window is defined by the size of output[0].
+    armnn::SplitterQueueDescriptor::ViewOrigin window1(wOrigin1);
+
+    std::vector<unsigned int> wOrigin2 = {1, 0, 0}; // Extent of the window is defined by the size of output[1].
+    armnn::SplitterQueueDescriptor::ViewOrigin window2(wOrigin2);
+
+    std::vector<unsigned int> wOrigin3 = {0, 0, 0}; // Extent of the window is defined by the size of output[2].
+    armnn::SplitterQueueDescriptor::ViewOrigin window3(wOrigin3);
+
+    std::vector<unsigned int> wOrigin4 = {1, 0, 0}; // Extent of the window is defined by the size of output[3].
+    armnn::SplitterQueueDescriptor::ViewOrigin window4(wOrigin4);
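+
+    // For example, wOrigin2 = {1, 0, 0} starts the output[1] view at channel 1 of the
+    // input, with its extent ({2, 6, 5} here) taken from outputTensorInfo2.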
+
+    bool subTensorsSupported = workloadFactory.SupportsSubTensors();
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle  = workloadFactory.CreateTensorHandle(inputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle1 =
+        subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*inputHandle, outputTensorInfo1.GetShape(), wOrigin1.data()) :
+            workloadFactory.CreateTensorHandle(outputTensorInfo1);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle2 =
+        subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*inputHandle, outputTensorInfo2.GetShape(), wOrigin2.data()) :
+            workloadFactory.CreateTensorHandle(outputTensorInfo2);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle3 =
+        subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle2, outputTensorInfo3.GetShape(), wOrigin3.data()) :
+            workloadFactory.CreateTensorHandle(outputTensorInfo3);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle4 =
+        subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*outputHandle2, outputTensorInfo4.GetShape(), wOrigin4.data()) :
+            workloadFactory.CreateTensorHandle(outputTensorInfo4);
+
+    // Do the first split
+    armnn::SplitterQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo1, outputHandle1.get());
+    AddOutputToWorkload(data, info, outputTensorInfo2, outputHandle2.get());
+
+    data.m_ViewOrigins.push_back(window1);
+    data.m_ViewOrigins.push_back(window2);
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateSplitter(data, info);
+
+    inputHandle->Allocate();
+    outputHandle1->Allocate();
+    outputHandle2->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0]);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret1.output[0][0][0], outputHandle1.get());
+    CopyDataFromITensorHandle(&ret2.output[0][0][0], outputHandle2.get());
+
+    // Do the second split.
+    armnn::SplitterQueueDescriptor data2;
+    armnn::WorkloadInfo info2;
+    AddInputToWorkload(data2, info2, outputTensorInfo2, outputHandle2.get());
+    AddOutputToWorkload(data2, info2, outputTensorInfo3, outputHandle3.get());
+    AddOutputToWorkload(data2, info2, outputTensorInfo4, outputHandle4.get());
+
+    data2.m_ViewOrigins.push_back(window3);
+    data2.m_ViewOrigins.push_back(window4);
+
+    std::unique_ptr<armnn::IWorkload> workload2 = workloadFactory.CreateSplitter(data2, info2);
+
+    outputHandle3->Allocate();
+    outputHandle4->Allocate();
+
+    ExecuteWorkload(*workload2, memoryManager);
+
+    CopyDataFromITensorHandle(&ret3.output[0][0][0], outputHandle3.get());
+    CopyDataFromITensorHandle(&ret4.output[0][0][0], outputHandle4.get());
+
+    std::vector<LayerTestResult<T,3>> ret = {ret1, ret2, ret3, ret4};
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> CopyViaSplitterTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    float qScale, int32_t qOffset)
+{
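+    // A splitter with a single view spanning the whole tensor degenerates into a copy,
+    // so the expected output is simply the input.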
+    const armnn::TensorInfo tensorInfo({ 3, 6, 5 }, ArmnnType, qScale, qOffset);
+    auto input = MakeTensor<T, 3>(tensorInfo, QuantizedVector<T>(qScale, qOffset,
+                                                                 {
+                                                                     1.0f, 2.0f, 3.0f, 4.0f, 5.0f,
+                                                                     6.0f, 7.0f, 8.0f, 9.0f, 10.0f,
+                                                                     11.0f, 12.0f, 13.0f, 14.0f, 15.0f,
+                                                                     16.0f, 17.0f, 18.0f, 19.0f, 20.0f,
+                                                                     21.0f, 22.0f, 23.0f, 24.0f, 25.0f,
+                                                                     26.0f, 27.0f, 28.0f, 29.0f, 30.0f,
+
+                                                                     31.0f, 32.0f, 33.0f, 34.0f, 35.0f,
+                                                                     36.0f, 37.0f, 38.0f, 39.0f, 40.0f,
+                                                                     41.0f, 42.0f, 43.0f, 44.0f, 45.0f,
+                                                                     46.0f, 47.0f, 48.0f, 49.0f, 50.0f,
+                                                                     51.0f, 52.0f, 53.0f, 54.0f, 55.0f,
+                                                                     56.0f, 57.0f, 58.0f, 59.0f, 60.0f,
+
+                                                                     61.0f, 62.0f, 63.0f, 64.0f, 65.0f,
+                                                                     66.0f, 67.0f, 68.0f, 69.0f, 70.0f,
+                                                                     71.0f, 72.0f, 73.0f, 74.0f, 75.0f,
+                                                                     76.0f, 77.0f, 78.0f, 79.0f, 80.0f,
+                                                                     81.0f, 82.0f, 83.0f, 84.0f, 85.0f,
+                                                                     86.0f, 87.0f, 88.0f, 89.0f, 90.0f,
+                                                                 }));
+
+    std::vector<unsigned int> origin = { 0, 0, 0 };
+    armnn::SplitterQueueDescriptor::ViewOrigin window(origin);
+
+    const bool subTensorsSupported = workloadFactory.SupportsSubTensors();
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(tensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle =
+        subTensorsSupported ?
+            workloadFactory.CreateSubTensorHandle(*inputHandle, tensorInfo.GetShape(), origin.data()) :
+            workloadFactory.CreateTensorHandle(tensorInfo);
+
+    armnn::SplitterQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, tensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, tensorInfo, outputHandle.get());
+
+    data.m_ViewOrigins.push_back(window);
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateSplitter(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0]);
+
+    workload->Execute();
+
+    LayerTestResult<T, 3> ret(tensorInfo);
+    CopyDataFromITensorHandle(&ret.output[0][0][0], outputHandle.get());
+    ret.outputExpected = input;
+
+    return ret;
+}
+
+} // anonymous namespace
+
+std::vector<LayerTestResult<float,3>> SplitterFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SplitterTestCommon<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+std::vector<LayerTestResult<uint8_t,3>> SplitterUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SplitterTestCommon<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 1.0f, 0);
+}
+
+std::vector<LayerTestResult<int16_t,3>> SplitterInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return SplitterTestCommon<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 1.0f, 0);
+}
+
+LayerTestResult<float, 3> CopyViaSplitterFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return CopyViaSplitterTestImpl<armnn::DataType::Float32>(workloadFactory, memoryManager, 0.0f, 0);
+}
+
+LayerTestResult<uint8_t, 3> CopyViaSplitterUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return CopyViaSplitterTestImpl<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager, 1.0f, 0);
+}
+
+LayerTestResult<int16_t, 3> CopyViaSplitterInt16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return CopyViaSplitterTestImpl<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager, 1.0f, 0);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/SplitterTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/SplitterTestImpl.hpp
new file mode 100644
index 0000000..34c5fba
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/SplitterTestImpl.hpp
@@ -0,0 +1,35 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <vector>
+
+std::vector<LayerTestResult<float, 3>> SplitterFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 3> CopyViaSplitterFloatTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+std::vector<LayerTestResult<uint8_t, 3>> SplitterUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+std::vector<LayerTestResult<int16_t, 3>> SplitterInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> CopyViaSplitterUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 3> CopyViaSplitterInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/StackTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/StackTestImpl.hpp
new file mode 100644
index 0000000..f063fbb
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/StackTestImpl.hpp
@@ -0,0 +1,495 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
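+// Stacks numInputs tensors of identical shape along the given axis: the output rank
+// is the input rank plus one, with a new dimension of size numInputs inserted at the
+// axis position (e.g. two {3, 2, 3} inputs stacked along axis 0 yield {2, 3, 2, 3}).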
+template<armnn::DataType ArmnnType, typename T, std::size_t outputDimLength>
+LayerTestResult<T, outputDimLength> StackTestHelper(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::TensorInfo& inputTensorInfo,
+        const armnn::TensorInfo& outputTensorInfo,
+        unsigned int axis,
+        const std::vector<std::vector<T>>& inputData,
+        const std::vector<T>& outputExpectedData)
+{
+    unsigned int numInputs = static_cast<unsigned int>(inputData.size());
+    std::vector<boost::multi_array<T, outputDimLength-1>> inputs;
+    for (unsigned int i = 0; i < numInputs; ++i)
+    {
+        inputs.push_back(MakeTensor<T, outputDimLength-1>(inputTensorInfo, inputData[i]));
+    }
+
+    LayerTestResult<T, outputDimLength> result(outputTensorInfo);
+    result.outputExpected = MakeTensor<T, outputDimLength>(outputTensorInfo, outputExpectedData);
+
+    std::vector<std::unique_ptr<armnn::ITensorHandle>> inputHandles;
+    for (unsigned int i = 0; i < numInputs; ++i)
+    {
+        inputHandles.push_back(workloadFactory.CreateTensorHandle(inputTensorInfo));
+    }
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::StackQueueDescriptor descriptor;
+    descriptor.m_Parameters.m_Axis = axis;
+    descriptor.m_Parameters.m_InputShape = inputTensorInfo.GetShape();
+    descriptor.m_Parameters.m_NumInputs = numInputs;
+
+    armnn::WorkloadInfo info;
+    for (unsigned int i = 0; i < numInputs; ++i)
+    {
+        std::unique_ptr<armnn::ITensorHandle>& inputHandle = inputHandles[i];
+        AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+        inputHandle->Allocate();
+        CopyDataToITensorHandle(inputHandle.get(), inputs[i].origin());
+    }
+
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+    outputHandle->Allocate();
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateStack(descriptor, info);
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(result.output.origin(), outputHandle.get());
+
+    return result;
+}
+
+} // anonymous namespace
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Stack0AxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo ({ 3, 2, 3 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 2, 3, 2, 3 }, ArmnnType);
+
+    std::vector<std::vector<T>> inputData;
+
+    inputData.push_back(
+    {
+        1, 2, 3,
+        4, 5, 6,
+
+        7, 8, 9,
+        10, 11, 12,
+
+        13, 14, 15,
+        16, 17, 18
+    });
+
+    inputData.push_back(
+    {
+        19, 20, 21,
+        22, 23, 24,
+
+        25, 26, 27,
+        28, 29, 30,
+
+        31, 32, 33,
+        34, 35, 36
+    });
+
+    std::vector<T> outputExpectedData =
+    {
+        1, 2, 3,
+        4, 5, 6,
+
+        7, 8, 9,
+        10, 11, 12,
+
+        13, 14, 15,
+        16, 17, 18,
+
+
+        19, 20, 21,
+        22, 23, 24,
+
+        25, 26, 27,
+        28, 29, 30,
+
+        31, 32, 33,
+        34, 35, 36
+    };
+
+    return StackTestHelper<ArmnnType, T, 4>(
+        workloadFactory,
+        memoryManager,
+        inputTensorInfo,
+        outputTensorInfo,
+        0U,
+        inputData,
+        outputExpectedData
+    );
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Stack4dOutput1AxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo ({ 3, 2, 3 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 3, 2, 2, 3 }, ArmnnType);
+
+    std::vector<std::vector<T>> inputData;
+
+    inputData.push_back(
+    {
+        1, 2, 3,
+        4, 5, 6,
+
+        7, 8, 9,
+        10, 11, 12,
+
+        13, 14, 15,
+        16, 17, 18
+    });
+
+    inputData.push_back(
+    {
+        19, 20, 21,
+        22, 23, 24,
+
+        25, 26, 27,
+        28, 29, 30,
+
+        31, 32, 33,
+        34, 35, 36
+    });
+
+    std::vector<T> outputExpectedData =
+    {
+        1, 2, 3,
+        4, 5, 6,
+
+        19, 20, 21,
+        22, 23, 24,
+
+
+        7, 8, 9,
+        10, 11, 12,
+
+        25, 26, 27,
+        28, 29, 30,
+
+
+        13, 14, 15,
+        16, 17, 18,
+
+        31, 32, 33,
+        34, 35, 36
+    };
+
+    return StackTestHelper<ArmnnType, T, 4>(
+        workloadFactory,
+        memoryManager,
+        inputTensorInfo,
+        outputTensorInfo,
+        1U,
+        inputData,
+        outputExpectedData
+    );
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Stack4dOutput2AxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo ({ 3, 2, 3 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 3, 2, 2, 3 }, ArmnnType);
+
+    std::vector<std::vector<T>> inputData;
+
+    inputData.push_back(
+    {
+        1, 2, 3,
+        4, 5, 6,
+
+        7, 8, 9,
+        10, 11, 12,
+
+        13, 14, 15,
+        16, 17, 18
+    });
+
+    inputData.push_back(
+    {
+        19, 20, 21,
+        22, 23, 24,
+
+        25, 26, 27,
+        28, 29, 30,
+
+        31, 32, 33,
+        34, 35, 36
+    });
+
+    std::vector<T> outputExpectedData =
+    {
+        1, 2, 3,
+        19, 20, 21,
+
+        4, 5, 6,
+        22, 23, 24,
+
+        7, 8, 9,
+        25, 26, 27,
+
+        10, 11, 12,
+        28, 29, 30,
+
+        13, 14, 15,
+        31, 32, 33,
+
+        16, 17, 18,
+        34, 35, 36
+    };
+
+    return StackTestHelper<ArmnnType, T, 4>(
+        workloadFactory,
+        memoryManager,
+        inputTensorInfo,
+        outputTensorInfo,
+        2U,
+        inputData,
+        outputExpectedData
+    );
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> Stack4dOutput3AxisTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo ({ 3, 2, 3 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 3, 2, 3, 2 }, ArmnnType);
+
+    std::vector<std::vector<T>> inputData;
+
+    inputData.push_back(
+    {
+        1, 2, 3,
+        4, 5, 6,
+
+        7, 8, 9,
+        10, 11, 12,
+
+        13, 14, 15,
+        16, 17, 18
+    });
+
+    inputData.push_back(
+    {
+        19, 20, 21,
+        22, 23, 24,
+
+        25, 26, 27,
+        28, 29, 30,
+
+        31, 32, 33,
+        34, 35, 36
+    });
+
+    std::vector<T> outputExpectedData =
+    {
+        1, 19,
+        2, 20,
+        3, 21,
+
+        4, 22,
+        5, 23,
+        6, 24,
+
+
+        7, 25,
+        8, 26,
+        9, 27,
+
+        10, 28,
+        11, 29,
+        12, 30,
+
+
+        13, 31,
+        14, 32,
+        15, 33,
+
+        16, 34,
+        17, 35,
+        18, 36
+    };
+
+    return StackTestHelper<ArmnnType, T, 4>(
+        workloadFactory,
+        memoryManager,
+        inputTensorInfo,
+        outputTensorInfo,
+        3U,
+        inputData,
+        outputExpectedData
+    );
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> Stack3dOutput1Axis3InputTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo ({ 3, 3 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 3, 3, 3 }, ArmnnType);
+
+    std::vector<std::vector<T>> inputData;
+
+    inputData.push_back(
+    {
+        1, 2, 3,
+        4, 5, 6,
+        7, 8, 9
+    });
+
+    inputData.push_back(
+    {
+        10, 11, 12,
+        13, 14, 15,
+        16, 17, 18
+    });
+
+    inputData.push_back(
+    {
+        19, 20, 21,
+        22, 23, 24,
+        25, 26, 27
+    });
+
+    std::vector<T> outputExpectedData =
+    {
+        1, 2, 3,
+        10, 11, 12,
+        19, 20, 21,
+
+        4, 5, 6,
+        13, 14, 15,
+        22, 23, 24,
+
+        7, 8, 9,
+        16, 17, 18,
+        25, 26, 27
+    };
+
+    return StackTestHelper<ArmnnType, T, 3>(
+        workloadFactory,
+        memoryManager,
+        inputTensorInfo,
+        outputTensorInfo,
+        1U,
+        inputData,
+        outputExpectedData
+    );
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 5> Stack5dOutputTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo ({ 2, 2, 2, 3 }, ArmnnType);
+    armnn::TensorInfo outputTensorInfo({ 2, 2, 2, 2, 3 }, ArmnnType);
+
+    std::vector<std::vector<T>> inputData;
+
+    inputData.push_back(
+    {
+        1, 2, 3,
+        4, 5, 6,
+
+        7, 8, 9,
+        10, 11, 12,
+
+
+        13, 14, 15,
+        16, 17, 18,
+
+        19, 20, 21,
+        22, 23, 24
+    });
+
+    inputData.push_back(
+    {
+        25, 26, 27,
+        28, 29, 30,
+
+        31, 32, 33,
+        34, 35, 36,
+
+
+        37, 38, 39,
+        40, 41, 42,
+
+        43, 44, 45,
+        46, 47, 48
+    });
+
+    std::vector<T> outputExpectedData =
+    {
+        1, 2, 3,
+        4, 5, 6,
+
+        7, 8, 9,
+        10, 11, 12,
+
+
+        25, 26, 27,
+        28, 29, 30,
+
+        31, 32, 33,
+        34, 35, 36,
+
+
+
+        13, 14, 15,
+        16, 17, 18,
+
+        19, 20, 21,
+        22, 23, 24,
+
+
+        37, 38, 39,
+        40, 41, 42,
+
+        43, 44, 45,
+        46, 47, 48
+    };
+
+    return StackTestHelper<ArmnnType, T, 5>(
+        workloadFactory,
+        memoryManager,
+        inputTensorInfo,
+        outputTensorInfo,
+        1U,
+        inputData,
+        outputExpectedData
+    );
+}
diff --git a/src/backends/backendsCommon/test/layerTests/StridedSliceTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/StridedSliceTestImpl.cpp
new file mode 100644
index 0000000..b32e622
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/StridedSliceTestImpl.cpp
@@ -0,0 +1,616 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "StridedSliceTestImpl.hpp"
+
+#include <ResolveType.hpp>
+
+#include <armnn/ArmNN.hpp>
+
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+template<typename T, std::size_t InDim, std::size_t OutDim>
+LayerTestResult<T, OutDim> StridedSliceTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::TensorInfo& inputTensorInfo,
+    armnn::TensorInfo& outputTensorInfo,
+    std::vector<float>& inputData,
+    std::vector<float>& outputExpectedData,
+    armnn::StridedSliceQueueDescriptor descriptor,
+    const float qScale = 1.0f,
+    const int32_t qOffset = 0)
+{
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    boost::multi_array<T, InDim> input =
+        MakeTensor<T, InDim>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData));
+
+    LayerTestResult<T, OutDim> ret(outputTensorInfo);
+    ret.outputExpected =
+        MakeTensor<T, OutDim>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, outputExpectedData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle =
+        workloadFactory.CreateTensorHandle(inputTensorInfo);
+
+    std::unique_ptr<armnn::ITensorHandle> outputHandle =
+        workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateStridedSlice(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.data());
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    CopyDataFromITensorHandle(ret.output.data(), outputHandle.get());
+
+    return ret;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> StridedSlice4dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {3, 2, 3, 1};
+    unsigned int outputShape[] = {1, 2, 3, 1};
+
+    armnn::StridedSliceQueueDescriptor desc;
+    desc.m_Parameters.m_Begin  = {1, 0, 0, 0};
+    desc.m_Parameters.m_End    = {2, 2, 3, 1};
+    desc.m_Parameters.m_Stride = {1, 1, 1, 1};
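+    // Selects the sub-tensor at index 1 along dimension 0 (the middle 2x3x1 slab),
+    // keeping the remaining dimensions whole.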
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
+
+        3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
+
+        5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f
+    });
+
+    return StridedSliceTestImpl<T, 4, 4>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> StridedSlice4dReverseTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {3, 2, 3, 1};
+    unsigned int outputShape[] = {1, 2, 3, 1};
+
+    armnn::StridedSliceQueueDescriptor desc;
+    desc.m_Parameters.m_Begin  = {1, -1, 0, 0};
+    desc.m_Parameters.m_End    = {2, -3, 3, 1};
+    desc.m_Parameters.m_Stride = {1, -1, 1, 1};
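+    // Negative begin/end values index from the end of the dimension and a negative
+    // stride walks it in reverse: dimension 1 is traversed from its last index down
+    // to (but not including) index -3, reversing the order of the two rows.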
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
+
+        3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
+
+        5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        4.0f, 4.0f, 4.0f, 3.0f, 3.0f, 3.0f
+    });
+
+    return StridedSliceTestImpl<T, 4, 4>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> StridedSliceSimpleStrideTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {3, 2, 3, 1};
+    unsigned int outputShape[] = {2, 1, 2, 1};
+
+    armnn::StridedSliceQueueDescriptor desc;
+    desc.m_Parameters.m_Begin  = {0, 0, 0, 0};
+    desc.m_Parameters.m_End    = {3, 2, 3, 1};
+    desc.m_Parameters.m_Stride = {2, 2, 2, 1};
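+    // A stride of 2 in the first three dimensions keeps every other index, so only
+    // the elements at even (dim0, dim1, dim2) positions survive: the 1.0f and 5.0f
+    // entries at width indices 0 and 2.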
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
+
+        3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
+
+        5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 1.0f,
+
+        5.0f, 5.0f
+    });
+
+    return StridedSliceTestImpl<T, 4, 4>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> StridedSliceSimpleRangeMaskTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {3, 2, 3, 1};
+    unsigned int outputShape[] = {3, 2, 3, 1};
+
+    armnn::StridedSliceQueueDescriptor desc;
+    desc.m_Parameters.m_Begin     = {1, 1, 1, 1};
+    desc.m_Parameters.m_End       = {1, 1, 1, 1};
+    desc.m_Parameters.m_Stride    = {1, 1, 1, 1};
+    desc.m_Parameters.m_BeginMask = (1 << 4) - 1;
+    desc.m_Parameters.m_EndMask   = (1 << 4) - 1;
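+    // A set bit in m_BeginMask/m_EndMask means the corresponding begin/end value is
+    // ignored and the full range is used for that dimension; (1 << 4) - 1 sets the
+    // bits for all four dimensions, so the slice spans the whole tensor.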
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(4, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
+
+        3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
+
+        5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 1.0f, 1.0f, 2.0f, 2.0f, 2.0f,
+
+        3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f,
+
+        5.0f, 5.0f, 5.0f, 6.0f, 6.0f, 6.0f
+    });
+
+    return StridedSliceTestImpl<T, 4, 4>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> StridedSliceShrinkAxisMaskTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {3, 2, 3, 1};
+    unsigned int outputShape[] = {3, 1};
+
+    armnn::StridedSliceQueueDescriptor desc;
+    desc.m_Parameters.m_Begin          = {0, 0, 1, 0};
+    desc.m_Parameters.m_End            = {1, 1, 1, 1};
+    desc.m_Parameters.m_Stride         = {1, 1, 1, 1};
+    desc.m_Parameters.m_EndMask        = (1 << 4) - 1;
+    desc.m_Parameters.m_ShrinkAxisMask = (1 << 1) | (1 << 2);
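+    // m_ShrinkAxisMask removes dimensions 1 and 2 from the output: each is sliced at
+    // its begin index (0 and 1 respectively) and then collapsed, taking the 4D input
+    // {3, 2, 3, 1} down to the 2D output {3, 1}.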
+
+    inputTensorInfo = armnn::TensorInfo(4, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(2, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f,
+
+        7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f,
+
+        13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        2.0f, 8.0f, 14.0f
+    });
+
+    return StridedSliceTestImpl<T, 4, 2>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> StridedSlice3dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {3, 3, 3};
+    unsigned int outputShape[] = {2, 2, 2};
+
+    armnn::StridedSliceQueueDescriptor desc;
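+    // EndMask covers all three axes, so End is ignored; starting from
+    // {0, 0, 0} with stride 2 selects indices {0, 2} along each dimension,
+    // yielding the 2x2x2 corners of the 3x3x3 input.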
+    desc.m_Parameters.m_Begin   = {0, 0, 0};
+    desc.m_Parameters.m_End     = {1, 1, 1};
+    desc.m_Parameters.m_Stride  = {2, 2, 2};
+    desc.m_Parameters.m_EndMask = (1 << 3) - 1;
+
+    inputTensorInfo = armnn::TensorInfo(3, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(3, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+
+        10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f,
+
+        19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 3.0f, 7.0f, 9.0f,
+
+        19.0f, 21.0f, 25.0f, 27.0f
+    });
+
+    return StridedSliceTestImpl<T, 3, 3>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> StridedSlice3dReverseTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {3, 3, 3};
+    unsigned int outputShape[] = {2, 2, 2};
+
+    armnn::StridedSliceQueueDescriptor desc;
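+    // Negative strides walk each axis backwards: Begin = -1 is the last
+    // element and End = -4 stops just past index 0, so indices {2, 0} are
+    // selected along every dimension, reversing the corner slice.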
+    desc.m_Parameters.m_Begin  = {-1, -1, -1};
+    desc.m_Parameters.m_End    = {-4, -4, -4};
+    desc.m_Parameters.m_Stride = {-2, -2, -2};
+
+    inputTensorInfo = armnn::TensorInfo(3, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(3, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f,
+
+        10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f,
+
+        19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        27.0f, 25.0f, 21.0f, 19.0f,
+
+        9.0f, 7.0f, 3.0f, 1.0f
+    });
+
+    return StridedSliceTestImpl<T, 3, 3>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> StridedSlice2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {3, 3};
+    unsigned int outputShape[] = {2, 2};
+
+    armnn::StridedSliceQueueDescriptor desc;
+    desc.m_Parameters.m_Begin   = {0, 0};
+    desc.m_Parameters.m_End     = {1, 1};
+    desc.m_Parameters.m_Stride  = {2, 2};
+    desc.m_Parameters.m_EndMask = (1 << 2) - 1;
+
+    inputTensorInfo = armnn::TensorInfo(2, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(2, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f,
+
+        4.0f, 5.0f, 6.0f,
+
+        7.0f, 8.0f, 9.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        1.0f, 3.0f,
+
+        7.0f, 9.0f
+    });
+
+    return StridedSliceTestImpl<T, 2, 2>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> StridedSlice2dReverseTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[]  = {3, 3};
+    unsigned int outputShape[] = {2, 2};
+
+    armnn::StridedSliceQueueDescriptor desc;
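+    // BeginMask/EndMask cover both axes, so Begin/End are ignored; the
+    // negative strides traverse each dimension from its last index backwards
+    // in steps of 2, producing the reversed corner elements.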
+    desc.m_Parameters.m_Begin     = {0, 0};
+    desc.m_Parameters.m_End       = {1, 1};
+    desc.m_Parameters.m_Stride    = {-2, -2};
+    desc.m_Parameters.m_BeginMask = (1 << 2) - 1;
+    desc.m_Parameters.m_EndMask   = (1 << 2) - 1;
+
+    inputTensorInfo = armnn::TensorInfo(2, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(2, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+    {
+        1.0f, 2.0f, 3.0f,
+
+        4.0f, 5.0f, 6.0f,
+
+        7.0f, 8.0f, 9.0f
+    });
+
+    std::vector<float> outputExpected = std::vector<float>(
+    {
+        9.0f, 7.0f,
+
+        3.0f, 1.0f
+    });
+
+    return StridedSliceTestImpl<T, 2, 2>(
+        workloadFactory, memoryManager, inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+} // anonymous namespace
+
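+// Non-template entry points declared in StridedSliceTestImpl.hpp: one thin
+// wrapper per data type, keeping the template implementations above internal
+// to this translation unit.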
+LayerTestResult<float, 4> StridedSlice4dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice4dTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> StridedSlice4dReverseFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice4dReverseTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> StridedSliceSimpleStrideFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSliceSimpleStrideTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 4> StridedSliceSimpleRangeMaskFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSliceSimpleRangeMaskTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 2> StridedSliceShrinkAxisMaskFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSliceShrinkAxisMaskTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 3> StridedSlice3dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice3dTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 3> StridedSlice3dReverseFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice3dReverseTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 2> StridedSlice2dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice2dTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<float, 2> StridedSlice2dReverseFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice2dReverseTest<armnn::DataType::Float32>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> StridedSlice4dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice4dTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> StridedSlice4dReverseUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice4dReverseTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> StridedSliceSimpleStrideUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSliceSimpleStrideTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 4> StridedSliceSimpleRangeMaskUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSliceSimpleRangeMaskTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 2> StridedSliceShrinkAxisMaskUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSliceShrinkAxisMaskTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 3> StridedSlice3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice3dTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 3> StridedSlice3dReverseUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice3dReverseTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 2> StridedSlice2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice2dTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<uint8_t, 2> StridedSlice2dReverseUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice2dReverseTest<armnn::DataType::QuantisedAsymm8>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> StridedSlice4dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice4dTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> StridedSlice4dReverseInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice4dReverseTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> StridedSliceSimpleStrideInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSliceSimpleStrideTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 4> StridedSliceSimpleRangeMaskInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSliceSimpleRangeMaskTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 2> StridedSliceShrinkAxisMaskInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSliceShrinkAxisMaskTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 3> StridedSlice3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice3dTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 3> StridedSlice3dReverseInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice3dReverseTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 2> StridedSlice2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice2dTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
+
+LayerTestResult<int16_t, 2> StridedSlice2dReverseInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return StridedSlice2dReverseTest<armnn::DataType::QuantisedSymm16>(workloadFactory, memoryManager);
+}
diff --git a/src/backends/backendsCommon/test/layerTests/StridedSliceTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/StridedSliceTestImpl.hpp
new file mode 100644
index 0000000..1c83e3e
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/StridedSliceTestImpl.hpp
@@ -0,0 +1,119 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "LayerTestResult.hpp"
+
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+LayerTestResult<float, 4> StridedSlice4dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> StridedSlice4dReverseFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> StridedSliceSimpleStrideFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> StridedSliceSimpleRangeMaskFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> StridedSliceShrinkAxisMaskFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 3> StridedSlice3dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 3> StridedSlice3dReverseFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> StridedSlice2dFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 2> StridedSlice2dReverseFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> StridedSlice4dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> StridedSlice4dReverseUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> StridedSliceSimpleStrideUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 4> StridedSliceSimpleRangeMaskUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 2> StridedSliceShrinkAxisMaskUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> StridedSlice3dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 3> StridedSlice3dReverseUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 2> StridedSlice2dUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<uint8_t, 2> StridedSlice2dReverseUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> StridedSlice4dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> StridedSlice4dReverseInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> StridedSliceSimpleStrideInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 4> StridedSliceSimpleRangeMaskInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 2> StridedSliceShrinkAxisMaskInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 3> StridedSlice3dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 3> StridedSlice3dReverseInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 2> StridedSlice2dInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<int16_t, 2> StridedSlice2dReverseInt16Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
diff --git a/src/backends/backendsCommon/test/layerTests/TransposeConvolution2dTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/TransposeConvolution2dTestImpl.hpp
new file mode 100644
index 0000000..6191adf
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/TransposeConvolution2dTestImpl.hpp
@@ -0,0 +1,564 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/ArmNN.hpp>
+
+#include <Permute.hpp>
+#include <ResolveType.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+
+#include <backendsCommon/test/CommonTestUtils.hpp>
+#include <backendsCommon/test/QuantizeHelper.hpp>
+#include <backendsCommon/test/TensorCopyUtils.hpp>
+#include <backendsCommon/test/WorkloadTestUtils.hpp>
+
+#include <reference/RefWorkloadFactory.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace
+{
+
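+// Pairs a tensor's metadata with its flattened payload so the helpers below
+// can pass (info, data) around together.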
+template<typename T>
+using TensorData = std::pair<armnn::TensorInfo, std::vector<T>>;
+
+template<typename T>
+void VerifyInputTensorData(const TensorData<T>& data, const std::string& tensorName)
+{
+    if (data.first.GetNumElements() > data.second.size())
+    {
+        throw armnn::InvalidArgumentException("Size of data too small for " + tensorName + ": expected " +
+            std::to_string(data.first.GetNumElements()) + " but got " + std::to_string(data.second.size()));
+    }
+}
+
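+// Runs a single TransposeConvolution2d workload end to end: uploads the
+// input, weights and (optional) biases, executes, and copies the result back
+// into 'output'.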
+template<typename T, typename BT>
+void TransposeConvolution2dTestImpl(armnn::IWorkloadFactory& workloadFactory,
+                                    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+                                    const armnn::TransposeConvolution2dDescriptor& descriptor,
+                                    const TensorData<T>& input,
+                                    TensorData<T>& output,
+                                    const TensorData<T>& weights,
+                                    const armnn::Optional<TensorData<BT>>& biases)
+{
+    using namespace armnn;
+
+    VerifyInputTensorData(input, "input");
+    VerifyInputTensorData(weights, "weights");
+
+    if (descriptor.m_BiasEnabled)
+    {
+        if (!biases.has_value())
+        {
+            throw InvalidArgumentException("Bias enabled but no bias data provided");
+        }
+        VerifyInputTensorData(biases.value(), "biases");
+    }
+
+    // set up weights
+    ScopedCpuTensorHandle weightsTensor(weights.first);
+
+    TransposeConvolution2dQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters = descriptor;
+    queueDescriptor.m_Weight     = &weightsTensor;
+
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, weights.second.data());
+
+    std::unique_ptr<ScopedCpuTensorHandle> biasesTensor;
+    if (descriptor.m_BiasEnabled)
+    {
+        // set up biases
+        biasesTensor = std::make_unique<ScopedCpuTensorHandle>(biases.value().first);
+        queueDescriptor.m_Bias = biasesTensor.get();
+
+        AllocateAndCopyDataToITensorHandle(biasesTensor.get(), biases.value().second.data());
+    }
+
+    // set up input and output handles
+    std::unique_ptr<ITensorHandle> inputHandle  = workloadFactory.CreateTensorHandle(input.first);
+    std::unique_ptr<ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(output.first);
+
+    // set up workload
+    armnn::WorkloadInfo workloadInfo;
+    AddInputToWorkload(queueDescriptor, workloadInfo, input.first, inputHandle.get());
+    AddOutputToWorkload(queueDescriptor, workloadInfo, output.first, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload =
+            workloadFactory.CreateTransposeConvolution2d(queueDescriptor, workloadInfo);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.second.data());
+
+    ExecuteWorkload(*workload, memoryManager);
+
+    // copy output
+    output.second = std::vector<T>(output.first.GetNumElements(), T());
+    CopyDataFromITensorHandle(output.second.data(), outputHandle.get());
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> TransposeConvolution2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::TransposeConvolution2dDescriptor& descriptor,
+    armnn::TensorInfo& inputInfo,
+    const std::vector<float>& inputData,
+    armnn::TensorInfo& outputInfo,
+    const std::vector<float>& expectedOutputData,
+    armnn::TensorInfo& weightsInfo,
+    const std::vector<float>& weightsData,
+    armnn::TensorInfo& biasesInfo,
+    const std::vector<float>& biasesData)
+{
+    using namespace armnn;
+
+    // set up quantization parameters
+    if (armnn::IsQuantizedType<T>())
+    {
+        constexpr float   qScale  = 0.50f;
+        constexpr int32_t qOffset = 10;
+
+        inputInfo.SetQuantizationScale(qScale);
+        inputInfo.SetQuantizationOffset(qOffset);
+
+        outputInfo.SetQuantizationScale(qScale);
+        outputInfo.SetQuantizationOffset(qOffset);
+
+        weightsInfo.SetQuantizationScale(qScale);
+        weightsInfo.SetQuantizationOffset(qOffset);
+
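+        // Following the usual quantization convention, the bias scale is the
+        // product of the input and weight scales (both qScale here) with a
+        // zero offset.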
+        biasesInfo.SetQuantizationScale(qScale * qScale);
+        biasesInfo.SetQuantizationOffset(0);
+    }
+
+    // set up input
+    TensorData<T> input =
+    {
+        inputInfo,
+        QuantizedVector<T>(inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(), inputData)
+    };
+
+    // set up weights
+    TensorData<T> weights =
+    {
+        weightsInfo,
+        QuantizedVector<T>(weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(), weightsData)
+    };
+
+    // set up biases
+    using BT = armnn::ResolveType<ArmnnBType>;
+    Optional<TensorData<BT>> optionalBiases;
+    if (descriptor.m_BiasEnabled)
+    {
+        TensorData<BT> biases =
+        {
+            biasesInfo,
+            QuantizedVector<BT>(biasesInfo.GetQuantizationScale(), biasesInfo.GetQuantizationOffset(), biasesData)
+        };
+
+        optionalBiases = Optional<TensorData<BT>>(biases);
+    }
+
+    // set up output
+    TensorData<T> output = { outputInfo, {} };
+
+    // execute test
+    TransposeConvolution2dTestImpl(workloadFactory,
+                                   memoryManager,
+                                   descriptor,
+                                   input,
+                                   output,
+                                   weights,
+                                   optionalBiases);
+
+    // construct result object
+    LayerTestResult<T, 4> testResult(outputInfo);
+    testResult.output         = MakeTensor<T, 4>(outputInfo, output.second);
+    testResult.outputExpected = MakeTensor<T, 4>(outputInfo,
+                                                 QuantizedVector<T>(outputInfo.GetQuantizationScale(),
+                                                                    outputInfo.GetQuantizationOffset(),
+                                                                    expectedOutputData));
+
+    return testResult;
+}
+
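+// Permutes the input, weights and expected output data in place from NCHW to
+// NHWC, so the same reference values can drive both data layouts.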
+template<typename T>
+void SwizzleData(const armnn::TensorInfo& inputInfo,
+                 std::vector<T>& inputData,
+                 const armnn::TensorInfo& outputInfo,
+                 std::vector<T>& outputData,
+                 const armnn::TensorInfo& weightsInfo,
+                 std::vector<T>& weightsData)
+{
+    constexpr size_t dataTypeSize = sizeof(T);
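+    // ArmNN's PermutationVector maps source axis i to destination axis
+    // mappings[i]; { 0, 3, 1, 2 } sends N->0, C->3, H->1, W->2, i.e. it
+    // rearranges NCHW-ordered data into NHWC order.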
+    const armnn::PermutationVector nchwToNhwc = { 0, 3, 1, 2 };
+
+    std::vector<T> tmp(inputData.size());
+    armnnUtils::Permute(inputInfo.GetShape(), nchwToNhwc, inputData.data(), tmp.data(), dataTypeSize);
+    inputData = tmp;
+
+    tmp.resize(weightsData.size());
+    armnnUtils::Permute(weightsInfo.GetShape(), nchwToNhwc, weightsData.data(), tmp.data(), dataTypeSize);
+    weightsData = tmp;
+
+    tmp.resize(outputData.size());
+    armnnUtils::Permute(outputInfo.GetShape(), nchwToNhwc, outputData.data(), tmp.data(), dataTypeSize);
+    outputData = tmp;
+}
+
+} // anonymous namespace
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleTransposeConvolution2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    using namespace armnn;
+
+    constexpr unsigned int batches  = 1u;
+    constexpr unsigned int channels = 1u;
+
+    constexpr unsigned int wInput = 3u;
+    constexpr unsigned int hInput = wInput;
+
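+    // For a stride-1, unpadded transpose convolution the output size is
+    // (inputSize - 1) * stride + kernelSize = (3 - 1) * 1 + 3 = 5.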
+    constexpr unsigned int wOutput = 5u;
+    constexpr unsigned int hOutput = wOutput;
+
+    constexpr unsigned int wWeights = 3u;
+    constexpr unsigned int hWeights = wWeights;
+
+    TensorShape inputShape   = MakeTensorShape(batches, channels, hInput, wInput, layout);
+    TensorShape outputShape  = MakeTensorShape(batches, channels, hOutput, wOutput, layout);
+    TensorShape weightsShape = MakeTensorShape(batches, channels, hWeights, wWeights, layout);
+
+    TensorInfo inputInfo(inputShape, ArmnnType);
+    TensorInfo outputInfo(outputShape, ArmnnType);
+    TensorInfo weightsInfo(weightsShape, ArmnnType);
+    TensorInfo biasesInfo({ channels }, ArmnnBType);
+
+    std::vector<float> inputData =
+    {
+        1.f, 1.f, 1.f,
+        1.f, 1.f, 1.f,
+        1.f, 1.f, 1.f
+    };
+
+    std::vector<float> weightsData =
+    {
+        1.f, 2.f, 3.f,
+        4.f, 5.f, 6.f,
+        7.f, 8.f, 9.f
+    };
+
+    std::vector<float> biasesData = { 1.f };
+
+    std::vector<float> expectedOutputData =
+    {
+         1.f,  3.f,  6.f,  5.f,  3.f,
+         5.f, 12.f, 21.f, 16.f,  9.f,
+        12.f, 27.f, 45.f, 33.f, 18.f,
+        11.f, 24.f, 39.f, 28.f, 15.f,
+         7.f, 15.f, 24.f, 17.f,  9.f
+    };
+
+    if (biasEnabled)
+    {
+        // apply bias to expected output data
+        std::transform(expectedOutputData.begin(), expectedOutputData.end(), expectedOutputData.begin(),
+                       [&](float f) -> float { return f + biasesData[0]; });
+    }
+
+    TransposeConvolution2dDescriptor descriptor;
+    descriptor.m_StrideX     = 1;
+    descriptor.m_StrideY     = 1;
+    descriptor.m_BiasEnabled = biasEnabled;
+    descriptor.m_DataLayout  = layout;
+
+    // swizzle data if needed
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        SwizzleData(inputInfo, inputData, outputInfo, expectedOutputData, weightsInfo, weightsData);
+    }
+
+    return TransposeConvolution2dTest<ArmnnType, ArmnnBType>(workloadFactory,
+                                                             memoryManager,
+                                                             descriptor,
+                                                             inputInfo,
+                                                             inputData,
+                                                             outputInfo,
+                                                             expectedOutputData,
+                                                             weightsInfo,
+                                                             weightsData,
+                                                             biasesInfo,
+                                                             biasesData);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> PaddedTransposeConvolution2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    using namespace armnn;
+
+    constexpr unsigned int batches  = 1u;
+    constexpr unsigned int channels = 1u;
+
+    constexpr unsigned int wInput = 4u;
+    constexpr unsigned int hInput = wInput;
+
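+    // Padding shrinks the transposed output:
+    // (inputSize - 1) * stride + kernelSize - padBefore - padAfter
+    //     = (4 - 1) * 1 + 3 - 2 - 2 = 2.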
+    constexpr unsigned int wOutput = 2u;
+    constexpr unsigned int hOutput = wOutput;
+
+    constexpr unsigned int wWeights = 3u;
+    constexpr unsigned int hWeights = wWeights;
+
+    TensorShape inputShape   = MakeTensorShape(batches, channels, hInput, wInput, layout);
+    TensorShape outputShape  = MakeTensorShape(batches, channels, hOutput, wOutput, layout);
+    TensorShape weightsShape = MakeTensorShape(batches, channels, hWeights, wWeights, layout);
+
+    TensorInfo inputInfo(inputShape, ArmnnType);
+    TensorInfo outputInfo(outputShape, ArmnnType);
+    TensorInfo weightsInfo(weightsShape, ArmnnType);
+    TensorInfo biasesInfo({ channels }, ArmnnBType);
+
+    std::vector<float> inputData =
+    {
+        1.f, 3.f, 2.f, 1.f,
+        1.f, 3.f, 3.f, 1.f,
+        2.f, 1.f, 1.f, 3.f,
+        3.f, 2.f, 3.f, 3.f
+    };
+
+    std::vector<float> weightsData =
+    {
+        1.f, 2.f, 3.f,
+        0.f, 1.f, 0.f,
+        2.f, 1.f, 2.f
+    };
+
+    std::vector<float> biasesData = { 1.f };
+
+    std::vector<float> expectedOutputData =
+    {
+         21.f, 21.f,
+         28.f, 27.f
+    };
+
+    if (biasEnabled)
+    {
+        // apply bias to expected output data
+        std::transform(expectedOutputData.begin(), expectedOutputData.end(), expectedOutputData.begin(),
+                       [&](float f) -> float { return f + biasesData[0]; });
+    }
+
+    TransposeConvolution2dDescriptor descriptor;
+    descriptor.m_PadLeft     = 2;
+    descriptor.m_PadRight    = 2;
+    descriptor.m_PadTop      = 2;
+    descriptor.m_PadBottom   = 2;
+    descriptor.m_StrideX     = 1;
+    descriptor.m_StrideY     = 1;
+    descriptor.m_BiasEnabled = biasEnabled;
+    descriptor.m_DataLayout  = layout;
+
+    // swizzle data if needed
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        SwizzleData(inputInfo, inputData, outputInfo, expectedOutputData, weightsInfo, weightsData);
+    }
+
+    return TransposeConvolution2dTest<ArmnnType, ArmnnBType>(workloadFactory,
+                                                             memoryManager,
+                                                             descriptor,
+                                                             inputInfo,
+                                                             inputData,
+                                                             outputInfo,
+                                                             expectedOutputData,
+                                                             weightsInfo,
+                                                             weightsData,
+                                                             biasesInfo,
+                                                             biasesData);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> StridedTransposeConvolution2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout)
+{
+    using namespace armnn;
+
+    constexpr unsigned int batches  = 1u;
+    constexpr unsigned int channels = 1u;
+
+    constexpr unsigned int wInput = 3u;
+    constexpr unsigned int hInput = wInput;
+
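+    // With stride 2 and no padding the output grows to
+    // (inputSize - 1) * stride + kernelSize = (3 - 1) * 2 + 3 = 7.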
+    constexpr unsigned int wOutput = 7u;
+    constexpr unsigned int hOutput = wOutput;
+
+    constexpr unsigned int wWeights = 3u;
+    constexpr unsigned int hWeights = wWeights;
+
+    TensorShape inputShape   = MakeTensorShape(batches, channels, hInput, wInput, layout);
+    TensorShape outputShape  = MakeTensorShape(batches, channels, hOutput, wOutput, layout);
+    TensorShape weightsShape = MakeTensorShape(batches, channels, hWeights, wWeights, layout);
+
+    TensorInfo inputInfo(inputShape, ArmnnType);
+    TensorInfo outputInfo(outputShape, ArmnnType);
+    TensorInfo weightsInfo(weightsShape, ArmnnType);
+    TensorInfo biasesInfo({ channels }, ArmnnBType);
+
+    std::vector<float> inputData =
+    {
+        1.f, 1.f, 1.f,
+        1.f, 1.f, 1.f,
+        1.f, 1.f, 1.f
+    };
+
+    std::vector<float> weightsData =
+    {
+        1.f, 2.f, 3.f,
+        4.f, 5.f, 6.f,
+        7.f, 8.f, 9.f
+    };
+
+    std::vector<float> biasesData = { 1.f };
+
+    std::vector<float> expectedOutputData =
+    {
+        1.f,  2.f,  4.f,  2.f,  4.f,  2.f,  3.f,
+        4.f,  5.f, 10.f,  5.f, 10.f,  5.f,  6.f,
+        8.f, 10.f, 20.f, 10.f, 20.f, 10.f, 12.f,
+        4.f,  5.f, 10.f,  5.f, 10.f,  5.f,  6.f,
+        8.f, 10.f, 20.f, 10.f, 20.f, 10.f, 12.f,
+        4.f,  5.f, 10.f,  5.f, 10.f,  5.f,  6.f,
+        7.f,  8.f, 16.f,  8.f, 16.f,  8.f,  9.f
+    };
+
+    if (biasEnabled)
+    {
+        // apply bias to expected output data
+        std::transform(expectedOutputData.begin(), expectedOutputData.end(), expectedOutputData.begin(),
+                    [&](float f) -> float { return f + biasesData[0]; });
+    }
+
+    TransposeConvolution2dDescriptor descriptor;
+    descriptor.m_StrideX     = 2;
+    descriptor.m_StrideY     = 2;
+    descriptor.m_BiasEnabled = biasEnabled;
+    descriptor.m_DataLayout  = layout;
+
+    // swizzle data if needed
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        SwizzleData(inputInfo, inputData, outputInfo, expectedOutputData, weightsInfo, weightsData);
+    }
+
+    return TransposeConvolution2dTest<ArmnnType, ArmnnBType>(workloadFactory,
+                                                             memoryManager,
+                                                             descriptor,
+                                                             inputInfo,
+                                                             inputData,
+                                                             outputInfo,
+                                                             expectedOutputData,
+                                                             weightsInfo,
+                                                             weightsData,
+                                                             biasesInfo,
+                                                             biasesData);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> MultiChannelTransposeConvolution2dTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    const armnn::DataLayout layout)
+{
+    using namespace armnn;
+
+    TensorShape inputShape   = MakeTensorShape(1, 1, 2, 2, layout);
+    TensorShape outputShape  = MakeTensorShape(1, 2, 5, 5, layout);
+
+    // OIHW for NCHW; OHWI for NHWC
+    TensorShape weightsShape = MakeTensorShape(2, 1, 3, 3, layout);
+    TensorShape biasesShape  = { 2 };
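+    // Two output channels (O = 2) from one input channel (I = 1); each 2x2
+    // input expands to (2 - 1) * 2 + 3 = 5 along both spatial dimensions.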
+
+    TensorInfo inputInfo(inputShape, ArmnnType);
+    TensorInfo outputInfo(outputShape, ArmnnType);
+    TensorInfo weightsInfo(weightsShape, ArmnnType);
+    TensorInfo biasesInfo(biasesShape, ArmnnBType);
+
+    std::vector<float> inputData =
+    {
+        1.f, 2.f,
+        3.f, 4.f,
+    };
+
+    std::vector<float> weightsData =
+    {
+         1.f,  3.f,  5.f,
+         7.f,  9.f, 11.f,
+        13.f, 15.f, 17.f,
+
+         2.f,  4.f,  6.f,
+         8.f, 10.f, 12.f,
+        14.f, 16.f, 18.f
+    };
+
+    std::vector<float> biasesData = { -1.5f, -2.0f };
+
+    std::vector<float> expectedOutputData =
+    {
+        -0.5f,  1.5f,   5.5f,  4.5f,  8.5f,
+         5.5f,  7.5f,  23.5f, 16.5f, 20.5f,
+        14.5f, 22.5f,  60.5f, 40.5f, 52.5f,
+        19.5f, 25.5f,  59.5f, 34.5f, 42.5f,
+        37.5f, 43.5f, 101.5f, 58.5f, 66.5f,
+
+         0.0f,  2.0f,   8.0f,  6.0f, 10.0f,
+         6.0f,  8.0f,  26.0f, 18.0f, 22.0f,
+        18.0f, 26.0f,  70.0f, 46.0f, 58.0f,
+        22.0f, 28.0f,  66.0f, 38.0f, 46.0f,
+        40.0f, 46.0f, 108.0f, 62.0f, 70.0f,
+    };
+
+    TransposeConvolution2dDescriptor descriptor;
+    descriptor.m_StrideX     = 2;
+    descriptor.m_StrideY     = 2;
+    descriptor.m_BiasEnabled = true;
+    descriptor.m_DataLayout  = layout;
+
+    // swizzle data if needed
+    if (layout == armnn::DataLayout::NHWC)
+    {
+        SwizzleData(inputInfo, inputData, outputInfo, expectedOutputData, weightsInfo, weightsData);
+    }
+
+    return TransposeConvolution2dTest<ArmnnType, ArmnnBType>(workloadFactory,
+                                                             memoryManager,
+                                                             descriptor,
+                                                             inputInfo,
+                                                             inputData,
+                                                             outputInfo,
+                                                             expectedOutputData,
+                                                             weightsInfo,
+                                                             weightsData,
+                                                             biasesInfo,
+                                                             biasesData);
+}