IVGCVSW-1922 Unit test for DepthwiseConvolution with NHWC

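The data layout for DepthwiseConvolution2d is now read from the layer
parameters (m_Parameters.m_DataLayout) instead of from a dedicated
m_DataLayout member on the queue descriptor, which is removed. Queue
descriptor validation picks the channel dimension according to the
layout, along these lines (illustrative sketch only; see
src/backends/WorkloadData.cpp for the actual code):

    // NCHW tensors are [N, C, H, W] and NHWC tensors are [N, H, W, C],
    // so the channel dimension sits at index 1 or 3 respectively.
    const unsigned int channelIndex =
        (m_Parameters.m_DataLayout == armnn::DataLayout::NCHW) ? 1 : 3;

CreateDepthwiseConvolution2dWorkloadTest is templated on the data type
and takes the data layout as a parameter. New NHWC workload-creation
tests are added for the CL and Neon backends, and
DepthwiseConvolution2dDepthNhwcTest is registered in the CL and Neon
layer test suites; the NCHW-only reference workload-creation test is
removed.
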
Change-Id: I3e6e5b9a62f30d03c05bd7178adea8f4c8275da8
diff --git a/src/armnn/layers/DepthwiseConvolution2dLayer.cpp b/src/armnn/layers/DepthwiseConvolution2dLayer.cpp
index e1d433c..393c4bf 100644
--- a/src/armnn/layers/DepthwiseConvolution2dLayer.cpp
+++ b/src/armnn/layers/DepthwiseConvolution2dLayer.cpp
@@ -29,8 +29,6 @@
 
     descriptor.m_Weight = m_Weight.get();
 
-    descriptor.m_DataLayout = GetParameters().m_DataLayout;
-
     if (m_Param.m_BiasEnabled)
     {
         BOOST_ASSERT_MSG(m_Bias != nullptr, "DepthwiseConvolution2dLayer: Bias data should not be null.");
diff --git a/src/armnn/test/CreateWorkload.hpp b/src/armnn/test/CreateWorkload.hpp
index a33189e..f3cf544 100644
--- a/src/armnn/test/CreateWorkload.hpp
+++ b/src/armnn/test/CreateWorkload.hpp
@@ -397,52 +397,57 @@
     return workload;
 }
 
-template <typename DepthwiseConvolution2dFloat32Workload>
+template <typename DepthwiseConvolution2dFloat32Workload, armnn::DataType DataType>
 std::unique_ptr<DepthwiseConvolution2dFloat32Workload> CreateDepthwiseConvolution2dWorkloadTest(
-    armnn::IWorkloadFactory& factory, armnn::Graph& graph)
+    armnn::IWorkloadFactory& factory, armnn::Graph& graph, DataLayout dataLayout = DataLayout::NCHW)
 {
     // Creates the layer we're testing.
     DepthwiseConvolution2dDescriptor layerDesc;
-    layerDesc.m_PadLeft         = 3;
-    layerDesc.m_PadRight        = 3;
+    layerDesc.m_PadLeft         = 1;
+    layerDesc.m_PadRight        = 2;
     layerDesc.m_PadTop          = 1;
-    layerDesc.m_PadBottom       = 1;
-    layerDesc.m_StrideX         = 2;
-    layerDesc.m_StrideY         = 4;
-    layerDesc.m_BiasEnabled     = true;
+    layerDesc.m_PadBottom       = 2;
+    layerDesc.m_StrideX         = 1;
+    layerDesc.m_StrideY         = 1;
+    layerDesc.m_BiasEnabled     = false;
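+    // New for NHWC support: the layout is stored on the descriptor and reaches the workload via m_Parameters.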
+    layerDesc.m_DataLayout      = dataLayout;
 
     DepthwiseConvolution2dLayer* const layer = graph.AddLayer<DepthwiseConvolution2dLayer>(layerDesc, "layer");
 
-    layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({3, 3, 5, 3}, DataType::Float32));
-    layer->m_Bias   = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({9}, DataType::Float32));
+    layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({1, 4, 4, 2}, DataType));
     layer->m_Weight->Allocate();
-    layer->m_Bias->Allocate();
 
     // Creates extra layers.
     Layer* const input = graph.AddLayer<InputLayer>(0, "input");
     Layer* const output = graph.AddLayer<OutputLayer>(0, "output");
 
+    TensorShape inputShape = (dataLayout == DataLayout::NCHW) ?
+                TensorShape{ 2, 2, 5, 5 } : TensorShape{ 2, 5, 5, 2 };
+    TensorShape outputShape = (dataLayout == DataLayout::NCHW) ?
+                TensorShape{ 2, 2, 5, 5 } : TensorShape{ 2, 5, 5, 2 };
+
     // Connects up.
-    Connect(input, layer, TensorInfo({2, 3, 8, 16}, armnn::DataType::Float32));
-    Connect(layer, output, TensorInfo({2, 9, 2, 10}, armnn::DataType::Float32));
+    Connect(input, layer, TensorInfo(inputShape, DataType));
+    Connect(layer, output, TensorInfo(outputShape, DataType));
     CreateTensorHandles(graph, factory);
 
     // Makes the workload and checks it.
     auto workload = MakeAndCheckWorkload<DepthwiseConvolution2dFloat32Workload>(*layer, graph, factory);
 
     DepthwiseConvolution2dQueueDescriptor queueDescriptor = workload->GetData();
-    BOOST_TEST(queueDescriptor.m_Parameters.m_StrideX == 2);
-    BOOST_TEST(queueDescriptor.m_Parameters.m_StrideY == 4);
-    BOOST_TEST(queueDescriptor.m_Parameters.m_PadLeft == 3);
-    BOOST_TEST(queueDescriptor.m_Parameters.m_PadRight == 3);
+    BOOST_TEST(queueDescriptor.m_Parameters.m_StrideX == 1);
+    BOOST_TEST(queueDescriptor.m_Parameters.m_StrideY == 1);
+    BOOST_TEST(queueDescriptor.m_Parameters.m_PadLeft == 1);
+    BOOST_TEST(queueDescriptor.m_Parameters.m_PadRight == 2);
     BOOST_TEST(queueDescriptor.m_Parameters.m_PadTop == 1);
-    BOOST_TEST(queueDescriptor.m_Parameters.m_PadBottom == 1);
-    BOOST_TEST(queueDescriptor.m_Parameters.m_BiasEnabled == true);
+    BOOST_TEST(queueDescriptor.m_Parameters.m_PadBottom == 2);
+    BOOST_TEST(queueDescriptor.m_Parameters.m_BiasEnabled == false);
+    BOOST_TEST((queueDescriptor.m_Parameters.m_DataLayout == dataLayout));
 
     BOOST_TEST(queueDescriptor.m_Inputs.size() == 1);
     BOOST_TEST(queueDescriptor.m_Outputs.size() == 1);
-    BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({3, 3, 5, 3}, DataType::Float32)));
-    BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == TensorInfo({9}, DataType::Float32)));
+    BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({1, 4, 4, 2}, DataType)));
 
     // Returns so we can do extra, backend-specific tests.
     return workload;
diff --git a/src/backends/WorkloadData.cpp b/src/backends/WorkloadData.cpp
index d562b73..ef31fbd 100644
--- a/src/backends/WorkloadData.cpp
+++ b/src/backends/WorkloadData.cpp
@@ -579,10 +579,13 @@
     ValidatePointer(m_Weight, "DepthwiseConvolution2dQueueDescriptor", "weight");
     ValidateTensorNumDimensions(m_Weight->GetTensorInfo(), "DepthwiseConvolution2dQueueDescriptor", 4, "weight");
 
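+    // NCHW tensors are [N, C, H, W] and NHWC tensors are [N, H, W, C], so the channel dimension differs.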
+    const unsigned int channelIndex = (m_Parameters.m_DataLayout == DataLayout::NCHW) ? 1 : 3;
+
     //inputChannels * channelMultiplier should be equal to outputChannels.
     const unsigned int numWeightChannelMultiplier = m_Weight->GetTensorInfo().GetShape()[0];
-    const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[1];
-    const unsigned int numWeightOutputChannels = workloadInfo.m_OutputTensorInfos[0].GetShape()[1];
+    const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[channelIndex];
+    const unsigned int numWeightOutputChannels = workloadInfo.m_OutputTensorInfos[0].GetShape()[channelIndex];
     if (numWeightChannelMultiplier * numWeightInputChannels != numWeightOutputChannels)
     {
         throw InvalidArgumentException(
diff --git a/src/backends/WorkloadData.hpp b/src/backends/WorkloadData.hpp
index c7777b0..40e89f7 100644
--- a/src/backends/WorkloadData.hpp
+++ b/src/backends/WorkloadData.hpp
@@ -160,13 +160,11 @@
     DepthwiseConvolution2dQueueDescriptor()
         : m_Weight(nullptr)
         , m_Bias(nullptr)
-        , m_DataLayout(DataLayout::NCHW)
     {
     }
 
     const ConstCpuTensorHandle* m_Weight;
     const ConstCpuTensorHandle* m_Bias;
-    DataLayout m_DataLayout;
 
     void Validate(const WorkloadInfo& workloadInfo) const;
 };
diff --git a/src/backends/cl/test/ClCreateWorkloadTests.cpp b/src/backends/cl/test/ClCreateWorkloadTests.cpp
index 67f3e3c..66c2c2a 100644
--- a/src/backends/cl/test/ClCreateWorkloadTests.cpp
+++ b/src/backends/cl/test/ClCreateWorkloadTests.cpp
@@ -243,6 +243,36 @@
     ClConvolution2dWorkloadTest<ClConvolution2dWorkload, armnn::DataType::Float16>(DataLayout::NHWC);
 }
 
+template <typename DepthwiseConvolutionWorkloadType, typename armnn::DataType DataType>
+static void ClDepthwiseConvolutionWorkloadTest(DataLayout dataLayout)
+{
+    Graph graph;
+    ClWorkloadFactory factory;
+
+    auto workload = CreateDepthwiseConvolution2dWorkloadTest<DepthwiseConvolutionWorkloadType, DataType>
+                    (factory, graph, dataLayout);
+
+    // Checks that inputs/outputs are as we expect them (see definition of CreateDepthwiseConvolution2dWorkloadTest).
+    DepthwiseConvolution2dQueueDescriptor queueDescriptor = workload->GetData();
+    auto inputHandle  = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]);
+
+    std::initializer_list<unsigned int> inputShape  = (dataLayout == DataLayout::NCHW)
+            ? std::initializer_list<unsigned int>({ 2, 2, 5, 5 })
+            : std::initializer_list<unsigned int>({ 2, 5, 5, 2 });
+    std::initializer_list<unsigned int> outputShape = (dataLayout == DataLayout::NCHW)
+            ? std::initializer_list<unsigned int>({ 2, 2, 5, 5 })
+            : std::initializer_list<unsigned int>({ 2, 5, 5, 2 });
+
+    BOOST_TEST(CompareIClTensorHandleShape(inputHandle, inputShape));
+    BOOST_TEST(CompareIClTensorHandleShape(outputHandle, outputShape));
+}
+
+BOOST_AUTO_TEST_CASE(CreateDepthwiseConvolutionFloat32NhwcWorkload)
+{
+    ClDepthwiseConvolutionWorkloadTest<ClDepthwiseConvolutionWorkload, DataType::Float32>(DataLayout::NHWC);
+}
+
 template <typename Convolution2dWorkloadType, typename armnn::DataType DataType>
 static void ClDirectConvolution2dWorkloadTest()
 {
diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp
index cea7470..9934c92 100644
--- a/src/backends/cl/test/ClLayerTests.cpp
+++ b/src/backends/cl/test/ClLayerTests.cpp
@@ -65,6 +65,7 @@
 
 // Depthwise Convolution
 ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dDepthMul1, DepthwiseConvolution2dDepthMul1Test, true)
+ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dDepthNhwc, DepthwiseConvolution2dDepthNhwcTest, false)
 ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dDepthMul1, DepthwiseConvolution2dDepthMul1Test, false)
 ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dDepthMul1Uint8, DepthwiseConvolution2dDepthMul1Uint8Test, true)
 ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dDepthMul1Uint8, DepthwiseConvolution2dDepthMul1Uint8Test, false)
diff --git a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
index 142cbc2..6fa9ddc 100644
--- a/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
+++ b/src/backends/cl/workloads/ClDepthwiseConvolutionWorkload.cpp
@@ -60,12 +60,12 @@
     auto& weightInfo = m_Data.m_Weight->GetTensorInfo();
 
     m_KernelTensor = std::make_unique<arm_compute::CLTensor>();
-    BuildArmComputeTensor(*m_KernelTensor, weightInfo);
+    BuildArmComputeTensor(*m_KernelTensor, weightInfo, m_Data.m_Parameters.m_DataLayout);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
         m_BiasTensor = std::make_unique<arm_compute::CLTensor>();
-        BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo());
+        BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo(), m_Data.m_Parameters.m_DataLayout);
     }
 
     arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
@@ -82,6 +82,11 @@
     arm_compute::ICLTensor& input  = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
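+    // Propagate the ArmNN data layout to ACL so the function is configured for the right tensor layout.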
+    arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout);
+    input.info()->set_data_layout(aclDataLayout);
+    output.info()->set_data_layout(aclDataLayout);
+
     const unsigned int depthMultiplier = weightInfo.GetShape()[0];
 
     //Check for optimisation opportunities.
diff --git a/src/backends/neon/test/NeonCreateWorkloadTests.cpp b/src/backends/neon/test/NeonCreateWorkloadTests.cpp
index 244002f..ac0451f 100644
--- a/src/backends/neon/test/NeonCreateWorkloadTests.cpp
+++ b/src/backends/neon/test/NeonCreateWorkloadTests.cpp
@@ -219,6 +219,45 @@
     NeonCreateConvolution2dWorkloadTest<NeonConvolution2dFloatWorkload, DataType::Float32>(DataLayout::NHWC);
 }
 
+template <typename DepthwiseConvolution2dFloat32WorkloadType, typename armnn::DataType DataType>
+static void NeonCreateDepthwiseConvolutionWorkloadTest(DataLayout dataLayout)
+{
+    Graph graph;
+    NeonWorkloadFactory factory;
+
+    auto workload = CreateDepthwiseConvolution2dWorkloadTest<DepthwiseConvolution2dFloat32WorkloadType,
+                                                             DataType>(factory, graph, dataLayout);
+
+    // Checks that inputs/outputs are as we expect them (see definition of CreateDepthwiseConvolution2dWorkloadTest).
+    DepthwiseConvolution2dQueueDescriptor queueDescriptor = workload->GetData();
+    auto inputHandle  = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]);
+
+    std::initializer_list<unsigned int> inputShape  = (dataLayout == DataLayout::NCHW)
+            ? std::initializer_list<unsigned int>({ 2, 2, 5, 5 })
+            : std::initializer_list<unsigned int>({ 2, 5, 5, 2 });
+    std::initializer_list<unsigned int> outputShape = (dataLayout == DataLayout::NCHW)
+            ? std::initializer_list<unsigned int>({ 2, 2, 5, 5 })
+            : std::initializer_list<unsigned int>({ 2, 5, 5, 2 });
+
+    BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo(inputShape, DataType)));
+    BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo(outputShape, DataType)));
+}
+
+BOOST_AUTO_TEST_CASE(CreateDepthwiseConvolution2dFloat32NhwcWorkload)
+{
+    NeonCreateDepthwiseConvolutionWorkloadTest<NeonDepthwiseConvolutionFloatWorkload,
+                                               DataType::Float32>(DataLayout::NHWC);
+}
+
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+BOOST_AUTO_TEST_CASE(CreateDepthwiseConvolution2dFloat16NhwcWorkload)
+{
+    NeonCreateDepthwiseConvolutionWorkloadTest<NeonDepthwiseConvolutionFloatWorkload,
+                                               DataType::Float16>(DataLayout::NHWC);
+}
+#endif
+
 template <typename FullyConnectedWorkloadType, typename armnn::DataType DataType>
 static void NeonCreateFullyConnectedWorkloadTest()
 {
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index 2d4ee99..36138b3 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -84,6 +84,7 @@
 
 // Depthwise Convolution
 ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dDepthMul1, DepthwiseConvolution2dDepthMul1Test, true)
+ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dDepthNhwc, DepthwiseConvolution2dDepthNhwcTest, false)
 ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dDepthMul1, DepthwiseConvolution2dDepthMul1Test, false)
 ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dDepthMul1Uint8, DepthwiseConvolution2dDepthMul1Uint8Test, true)
 ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dDepthMul1Uint8, DepthwiseConvolution2dDepthMul1Uint8Test, false)
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.cpp
index 742a768..4b266f3 100644
--- a/src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.cpp
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionFloatWorkload.cpp
@@ -20,12 +20,12 @@
     const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
 
     m_KernelTensor = std::make_unique<arm_compute::Tensor>();
-    BuildArmComputeTensor(*m_KernelTensor, weightInfo, descriptor.m_DataLayout);
+    BuildArmComputeTensor(*m_KernelTensor, weightInfo, m_Data.m_Parameters.m_DataLayout);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
         m_BiasTensor = std::make_unique<arm_compute::Tensor>();
-        BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo(), descriptor.m_DataLayout);
+        BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo(), m_Data.m_Parameters.m_DataLayout);
     }
 
     arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
@@ -41,6 +41,11 @@
     arm_compute::ITensor& input  = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ITensor& output = static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
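+    // As in the CL workload, pass the data layout through to ACL before configuring the function.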
+    arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout);
+    input.info()->set_data_layout(aclDataLayout);
+    output.info()->set_data_layout(aclDataLayout);
+
     bool use3x3Optimisation = weightInfo.GetShape()[3] == 3 && weightInfo.GetShape()[2] == 3;
     if (use3x3Optimisation)
     {
diff --git a/src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.cpp b/src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.cpp
index 722b778..6c6c2df 100644
--- a/src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.cpp
+++ b/src/backends/neon/workloads/NeonDepthwiseConvolutionUint8Workload.cpp
@@ -20,12 +20,12 @@
     const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo();
 
     m_KernelTensor = std::make_unique<arm_compute::Tensor>();
-    BuildArmComputeTensor(*m_KernelTensor, weightInfo, descriptor.m_DataLayout);
+    BuildArmComputeTensor(*m_KernelTensor, weightInfo, m_Data.m_Parameters.m_DataLayout);
 
     if (m_Data.m_Parameters.m_BiasEnabled)
     {
         m_BiasTensor = std::make_unique<arm_compute::Tensor>();
-        BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo(), descriptor.m_DataLayout);
+        BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo(), m_Data.m_Parameters.m_DataLayout);
     }
 
     arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX,
@@ -41,6 +41,10 @@
     arm_compute::ITensor& input  = static_cast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
     arm_compute::ITensor& output = static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
 
+    arm_compute::DataLayout aclDataLayout = ConvertDataLayout(m_Data.m_Parameters.m_DataLayout);
+    input.info()->set_data_layout(aclDataLayout);
+    output.info()->set_data_layout(aclDataLayout);
+
     bool use3x3Optimisation = weightInfo.GetShape()[3] == 3 && weightInfo.GetShape()[2] == 3;
     if (use3x3Optimisation)
     {
diff --git a/src/backends/reference/test/RefCreateWorkloadTests.cpp b/src/backends/reference/test/RefCreateWorkloadTests.cpp
index e8d536f..a8901d2 100644
--- a/src/backends/reference/test/RefCreateWorkloadTests.cpp
+++ b/src/backends/reference/test/RefCreateWorkloadTests.cpp
@@ -190,19 +190,6 @@
                      TensorInfo({2, 2, 2, 10}, DataType::Float32));
 }
 
-BOOST_AUTO_TEST_CASE(CreateDepthwiseConvolution2dWorkload)
-{
-    Graph                graph;
-    RefWorkloadFactory factory;
-    auto                 workload =
-        CreateDepthwiseConvolution2dWorkloadTest<RefDepthwiseConvolution2dFloat32Workload>(factory, graph);
-
-    // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest).
-    CheckInputOutput(std::move(workload),
-                     TensorInfo({2, 3, 8, 16}, DataType::Float32),
-                     TensorInfo({2, 9, 2, 10}, DataType::Float32));
-}
-
 template <typename FullyConnectedWorkloadType, armnn::DataType DataType>
 static void RefCreateFullyConnectedWorkloadTest()
 {
diff --git a/src/backends/test/Conv2dTestImpl.hpp b/src/backends/test/Conv2dTestImpl.hpp
index 8e29615..d8c1040 100644
--- a/src/backends/test/Conv2dTestImpl.hpp
+++ b/src/backends/test/Conv2dTestImpl.hpp
@@ -691,6 +691,107 @@
     return ret;
 }
 
+template<typename T, typename B>
+LayerTestResult<T, 4> DepthwiseConvolution2dNhwcTestImpl(armnn::IWorkloadFactory& workloadFactory,
+                                                         const boost::multi_array<T, 4>& input,
+                                                         const boost::multi_array<T, 4>& kernel,
+                                                         const boost::multi_array<B, 1>& bias,
+                                                         const boost::multi_array<T, 4>& outputExpected,
+                                                         float qScale,
+                                                         int32_t qOffset,
+                                                         uint32_t padLeft = 0,
+                                                         uint32_t padTop = 0,
+                                                         uint32_t padRight = 0,
+                                                         uint32_t padBottom = 0,
+                                                         uint32_t strideX = 1,
+                                                         uint32_t strideY = 1)
+{
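+    // All tensors here are NHWC: shape()[0] = N, shape()[1] = H, shape()[2] = W, shape()[3] = C.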
+    unsigned int inputNum       = boost::numeric_cast<unsigned int>(input.shape()[0]);
+    unsigned int inputChannels  = boost::numeric_cast<unsigned int>(input.shape()[3]);
+    unsigned int inputHeight    = boost::numeric_cast<unsigned int>(input.shape()[1]);
+    unsigned int inputWidth     = boost::numeric_cast<unsigned int>(input.shape()[2]);
+
+    unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(kernel.shape()[0]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
+    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
+    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+
+    unsigned int outputNum      = boost::numeric_cast<unsigned int>(outputExpected.shape()[0]);
+    unsigned int outputChannels = boost::numeric_cast<unsigned int>(outputExpected.shape()[3]);
+    unsigned int outputHeight   = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
+    unsigned int outputWidth    = boost::numeric_cast<unsigned int>(outputExpected.shape()[2]);
+
+    // Creates the tensors.
+    armnn::TensorInfo inputTensorInfo({inputNum, inputHeight, inputWidth, inputChannels}, armnn::GetDataType<T>());
+    armnn::TensorInfo outputTensorInfo({outputNum, outputHeight, outputWidth, outputChannels},
+                                       armnn::GetDataType<T>());
+    armnn::TensorInfo kernelDesc({kernelChanMul, kernelHeight, kernelWidth, kernelChannels}, armnn::GetDataType<T>());
+    armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, armnn::GetDataType<B>());
+
+    // Set quantization parameters if the requested type is a quantized type.
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+        kernelDesc.SetQuantizationScale(qScale);
+        kernelDesc.SetQuantizationOffset(qOffset);
+        biasDesc.SetQuantizationScale(qScale*qScale);
+        biasDesc.SetQuantizationOffset(0);
+    }
+
+    // Construct the input data.
+    std::vector<T> inputData;
+    inputData.assign(input.data(), input.data() + inputNum*inputHeight*inputWidth*inputChannels);
+    auto batchedInput = MakeTensor<T, 4>(inputTensorInfo, inputData);
+
+    // Construct the expected output data (this NHWC implementation does not apply a bias).
+    std::vector<T> outputData;
+    outputData.assign(outputExpected.data(), outputExpected.data() + outputNum*outputHeight*outputWidth*outputChannels);
+
+    LayerTestResult<T, 4> ret(outputTensorInfo);
+    ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputData);
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
+    AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
+
+    armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
+
+    armnn::DepthwiseConvolution2dQueueDescriptor data;
+    data.m_Weight = &weightsTensor;
+    data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - it can be a source of bugs.
+    data.m_Parameters.m_StrideX = strideX;
+    data.m_Parameters.m_StrideY = strideY;
+    data.m_Parameters.m_PadLeft = padLeft;
+    data.m_Parameters.m_PadRight = padRight;
+    data.m_Parameters.m_PadTop = padTop;
+    data.m_Parameters.m_PadBottom = padBottom;
+    data.m_Parameters.m_DataLayout = armnn::DataLayout::NHWC;
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDepthwiseConvolution2d(data, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), &batchedInput[0][0][0][0]);
+
+    workloadFactory.Finalize();
+    workload->Execute();
+
+    CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get());
+
+    return ret;
+}
+
 template<typename T>
 LayerTestResult<T,4> Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFactory,
                                            float qScale,
diff --git a/src/backends/test/LayerTests.cpp b/src/backends/test/LayerTests.cpp
index d5f84f0..f2bc019 100644
--- a/src/backends/test/LayerTests.cpp
+++ b/src/backends/test/LayerTests.cpp
@@ -493,6 +493,120 @@
         1); // strideY
 }
 
+template<typename T>
+LayerTestResult<T, 4> DepthwiseConvolution2dNhwcTestCommon(armnn::IWorkloadFactory& workloadFactory,
+                                                           float qScale,
+                                                           int32_t qOffset,
+                                                           bool biasEnabled)
+{
+    armnn::TensorInfo inputTensorInfo({ 1, 5, 5, 2}, armnn::GetDataType<T>());
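+    // NHWC input: the two channels are interleaved per pixel; channel 0 holds 0..24, channel 1 holds 25..49.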
+    auto input = MakeTensor<T, 4>(inputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset(), {
+            0, 25,
+            1, 26,
+            2, 27,
+            3, 28,
+            4, 29,
+
+            5, 30,
+            6, 31,
+            7, 32,
+            8, 33,
+            9, 34,
+
+            10, 35,
+            11, 36,
+            12, 37,
+            13, 38,
+            14, 39,
+
+            15, 40,
+            16, 41,
+            17, 42,
+            18, 43,
+            19, 44,
+
+            20, 45,
+            21, 46,
+            22, 47,
+            23, 48,
+            24, 49
+        })));
+
+    armnn::TensorInfo kernelTensorInfo({ 1, 4, 4, 2}, armnn::GetDataType<T>());
+    auto kernel = MakeTensor<T, 4>(kernelTensorInfo, std::vector<T>(
+        QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(), {
+             32, 16,
+             31, 15,
+             30, 14,
+             29, 13,
+
+             28, 12,
+             27, 11,
+             26, 10,
+             25,  9,
+
+             24,  8,
+             23,  7,
+             22,  6,
+             21,  5,
+
+             20,  4,
+             19,  3,
+             18,  2,
+             17,  1
+        })));
+
+    armnn::TensorInfo outputTensorInfo({ 1, 5, 5, 2}, armnn::GetDataType<T>());
+    boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputTensorInfo, std::vector<T>(
+        QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), {
+        1062, 1550,
+        1580, 2284,
+        1850, 2362,
+        1530, 1955,
+        1117, 1428,
+
+        2140, 2910,
+        3108, 4206,
+        3500, 4342,
+        2842, 3528,
+        2042, 2536,
+
+        3580, 3390,
+        5068, 4886,
+        5460, 5022,
+        4342, 4068,
+        3062, 2916,
+
+        3618, 3566,
+        5072, 5056,
+        5390, 5182,
+        4248, 4133,
+        2971, 2922,
+
+        3074, 3100,
+        4282, 4352,
+        4510, 4452,
+        3533, 3517,
+        2457, 2465
+        })));
+
+    return DepthwiseConvolution2dNhwcTestImpl<T>(workloadFactory,
+        input,
+        kernel,
+        GetBias2<typename FullyConnectedBiasTypeForInputType<T>::Type>(biasEnabled, qScale, qOffset),
+        expectedOutput,
+        qScale,
+        qOffset,
+        1,  // Padding left.
+        1,  // Padding top.
+        2,  // Padding right.
+        2,  // Padding bottom.
+        1,  // strideX
+        1);  // strideY
+}
+
 LayerTestResult<float, 4>
 Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest(armnn::IWorkloadFactory& workloadFactory)
 {
@@ -510,6 +623,12 @@
     return DepthwiseConvolution2dTestImpl<float, float>(workloadFactory, 0.0f, 0, biasEnabled);
 }
 
+LayerTestResult<float, 4> DepthwiseConvolution2dDepthNhwcTest(armnn::IWorkloadFactory& workloadFactory,
+                                                              bool biasEnabled)
+{
+    return DepthwiseConvolution2dNhwcTestCommon<float>(workloadFactory, 0.0f, 0, biasEnabled);
+}
+
 LayerTestResult<float, 4> DepthwiseConvolution2dDepthMul1Test(armnn::IWorkloadFactory& workloadFactory,
                                                               bool biasEnabled)
 {
diff --git a/src/backends/test/LayerTests.hpp b/src/backends/test/LayerTests.hpp
index f5abd98..9f8cd3f 100644
--- a/src/backends/test/LayerTests.hpp
+++ b/src/backends/test/LayerTests.hpp
@@ -68,6 +68,9 @@
 
 LayerTestResult<float, 4> DepthwiseConvolution2dTest(armnn::IWorkloadFactory& workloadFactory, bool biasEnabled);
 
+LayerTestResult<float, 4> DepthwiseConvolution2dDepthNhwcTest(armnn::IWorkloadFactory& workloadFactory,
+                                                              bool biasEnabled);
+
 LayerTestResult<float, 4> DepthwiseConvolution2dDepthMul1Test(armnn::IWorkloadFactory& workloadFactory,
                                                               bool biasEnabled);