IVGCVSW-3575 Fix DepthwiseConvolution VTS Test Failures

The failing VTS tests were "NeuralnetworksHidlTest.depthwise_conv2d_*".

In depthwise convolution there was a difference in the weight tensor channel
order between the reference implementation and ACL. This specifically affects
the NCHW data layout (see the example below). This commit:

* Adds ReorderWeightChannelsForAcl to WorkloadUtils to correct the weight tensor channel order.
* Adds unit tests to detect this problem.
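
For example, with a depth multiplier of M = 2 over I = 2 input channels the
NCHW weights [ M, I, H, W ] flatten into the channel order
M0I0, M0I1, M1I0, M1I1 when reshaped to [ 1, I * M, H, W ] (MxIy being the
weight plane for multiplier x and input channel y), whereas ACL expects the
planes of each input channel to be kept together: M0I0, M1I0, M0I1, M1I1.
ReorderWeightChannelsForAcl performs this shuffle on the permute buffer
before the reshape.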

Signed-off-by: Colm Donelan <Colm.Donelan@arm.com>
Change-Id: Icaeac08e14b3d5da9e222ad2f118db55ebb15d09
diff --git a/src/backends/backendsCommon/WorkloadUtils.cpp b/src/backends/backendsCommon/WorkloadUtils.cpp
index fa387a7..3185ba0 100644
--- a/src/backends/backendsCommon/WorkloadUtils.cpp
+++ b/src/backends/backendsCommon/WorkloadUtils.cpp
@@ -9,8 +9,7 @@
 {
 
 armnn::ConstTensor PermuteTensor(const ConstCpuTensorHandle* tensor,
-                                 const PermutationVector& permutationVector,
-                                 void* permuteBuffer)
+                                 const PermutationVector& permutationVector, void* permuteBuffer)
 {
     BOOST_ASSERT_MSG(tensor, "Invalid input tensor");
     BOOST_ASSERT_MSG(permuteBuffer, "Invalid permute buffer");
@@ -44,18 +43,72 @@
                                   weightShape[0],
                                   weightShape[1],
                                   weightShape[2] * weightShape[3] });
-            break;
-        case DataLayout::NCHW:
-        default:
-            // The data layout is NCHW, reshape from [ M, I, H, W ] to [ 1, I * M, H, W, ]
             weightInfo.SetShape({ 1,
                                   weightShape[0] * weightShape[1],
                                   weightShape[2],
                                   weightShape[3] });
             break;
+        case DataLayout::NCHW:
+        default:
+            // The data layout is NCHW, reshape from [ M, I, H, W ] to [ 1, I * M, H, W ]
+            weightInfo.SetShape({ 1, weightShape[0] * weightShape[1], weightShape[2], weightShape[3] });
+            break;
     }
 }
 
+template <typename DataType>
+ConstTensor ReorderWeightChannelsForAcl(const ConstTensor& weightHandle, DataLayout dataLayout, void* permuteBuffer)
+{
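+    // ArmNN orders the depthwise weight channels multiplier-major (all the input channels of multiplier 0
+    // first, then those of multiplier 1, and so on), i.e. origin channel m * inputChannels + i, whereas ACL
+    // needs the weights that belong to the same input channel kept together. Shuffle each origin channel
+    // into the position ACL expects, e.g. for M = 2, I = 2:
+    // [ M0I0, M0I1, M1I0, M1I1 ] -> [ M0I0, M1I0, M0I1, M1I1 ].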
+    DataType* weight = static_cast<DataType*>(permuteBuffer);
+    const TensorShape& weightShape = weightHandle.GetShape();
+    unsigned int multiplier;
+    unsigned int height;
+    unsigned int width;
+    unsigned int inputChannels;
+    switch (dataLayout)
+    {
+        case DataLayout::NHWC:    // It actually is [ H, W, I, M ]
+            height        = weightShape[0];
+            width         = weightShape[1];
+            inputChannels = weightShape[2];
+            multiplier    = weightShape[3];
+            break;
+        case DataLayout::NCHW:    // It actually is [ M, I, H, W ]
+        default:
+            height        = weightShape[2];
+            width         = weightShape[3];
+            inputChannels = weightShape[1];
+            multiplier    = weightShape[0];
+            break;
+    }
+
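+    // Scratch buffer for the reordered weights; it is copied back over permuteBuffer once the shuffle is done.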
+    DataType weightAclOrder[height*width*inputChannels*multiplier];
+    unsigned int destinationWeightsChannel;
+    unsigned int totalChannels = inputChannels * multiplier;
+    unsigned int channelSize   = height * width;
+
+    for (unsigned int originWeightsChannel = 0; originWeightsChannel < totalChannels; originWeightsChannel++)
+    {
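+        // Channels whose origin index is a multiple of inputChannels are the first weight of each multiplier
+        // group and move to the front; the remaining channels are placed after all of those.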
+        if (originWeightsChannel % inputChannels == 0)
+        {
+            destinationWeightsChannel = originWeightsChannel / inputChannels;
+        }
+        else
+        {
+            destinationWeightsChannel = (originWeightsChannel - 1) / inputChannels + multiplier;
+        }
+
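+        // Move this channel's whole H * W plane to its new position.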
+        for (unsigned int i = 0; i < channelSize; i++)
+        {
+            weightAclOrder[i + destinationWeightsChannel * channelSize] =
+                    weight[i + originWeightsChannel * channelSize];
+        }
+    }
+
+    ::memcpy(permuteBuffer, weightAclOrder, weightHandle.GetInfo().GetNumBytes());
+    return ConstTensor(weightHandle.GetInfo(), permuteBuffer);
+}
+
 TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout)
 {
     // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
@@ -86,6 +139,9 @@
     BOOST_ASSERT_MSG(weightTensor, "Invalid input tensor");
     BOOST_ASSERT_MSG(permuteBuffer, "Invalid permute buffer");
 
+    auto multiplier    = weightTensor->GetTensorInfo().GetShape()[0];
+    auto inputChannels = weightTensor->GetTensorInfo().GetShape()[1];
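+    // multiplier (M) and inputChannels (I) come from the original [ M, I, H, W ] weight shape; they are used
+    // below to decide whether the permuted weights also need their channels shuffled into ACL's order.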
+
     // Convert the weight format from ArmNN's [ M, I, H, W ] (does NOT depend on the data layout) to either
     // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the compute library
 
@@ -101,6 +157,26 @@
     }
     ConstTensor weightPermuted = PermuteTensor(weightTensor, permutationVector, permuteBuffer);
 
+    // Shuffle the weights data to obtain the channel order needed by ACL
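+    // (only needed for NCHW; when either the multiplier or the number of input channels is 1 the flattened
+    // channel order is already the same)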
+    if (multiplier > 1 and inputChannels > 1 and dataLayout == DataLayout::NCHW)
+    {
+        switch (weightPermuted.GetDataType())
+        {
+            case DataType::Float32:
+                weightPermuted = ReorderWeightChannelsForAcl<float>(weightPermuted, dataLayout, permuteBuffer);
+                break;
+            case DataType::Float16:
+                weightPermuted =
+                    ReorderWeightChannelsForAcl<half_float::half>(weightPermuted, dataLayout, permuteBuffer);
+                break;
+            case DataType::QuantisedAsymm8:
+                weightPermuted = ReorderWeightChannelsForAcl<uint8_t>(weightPermuted, dataLayout, permuteBuffer);
+                break;
+            default:
+                break;
+        }
+    }
+
     // 2. Reshape the weights
     ReshapeWeightsForAcl(weightPermuted.GetInfo(), dataLayout);
 
diff --git a/src/backends/backendsCommon/WorkloadUtils.hpp b/src/backends/backendsCommon/WorkloadUtils.hpp
index 7e3ac39..ba69255 100644
--- a/src/backends/backendsCommon/WorkloadUtils.hpp
+++ b/src/backends/backendsCommon/WorkloadUtils.hpp
@@ -5,14 +5,14 @@
 
 #pragma once
 
-#include "ITensorHandle.hpp"
 #include "CpuTensorHandle.hpp"
+#include "ITensorHandle.hpp"
 
 #include <armnn/Tensor.hpp>
 
+#include <Half.hpp>
 #include <Permute.hpp>
 #include <Profiling.hpp>
-#include <Half.hpp>
 
 #include <boost/cast.hpp>
 
@@ -21,7 +21,7 @@
 namespace
 {
 
-template<typename ArrayType, typename Arg>
+template <typename ArrayType, typename Arg>
 void AssignValues(unsigned int num, unsigned int& idx, const ArrayType& array, Arg& arg)
 {
     if (idx >= num)
@@ -33,68 +33,76 @@
     idx++;
 }
 
-template<typename T, typename ArrayType, typename ...Args>
-void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args& ... args)
+template <typename T, typename ArrayType, typename... Args>
+void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args&... args)
 {
     AssignValues(num, idx, array, assignee);
 
     AssignValues(num, idx, array, args...);
 }
 
-} // anonymous namespace
+}    // anonymous namespace
 
-template<typename CopyFunc>
+template <typename CopyFunc>
 void CopyTensorContentsGeneric(const ITensorHandle* srcTensor, ITensorHandle* dstTensor, CopyFunc copy)
 {
     static_assert(MaxNumOfTensorDimensions == 5, "Please update CopyTensorContents");
 
-    TensorShape srcStrides = srcTensor->GetStrides();
+    TensorShape srcStrides      = srcTensor->GetStrides();
     const TensorShape& srcShape = srcTensor->GetShape();
-    TensorShape dstStrides = dstTensor->GetStrides();
+    TensorShape dstStrides      = dstTensor->GetStrides();
     const TensorShape& dstShape = dstTensor->GetShape();
 
-    size_t srcDepth = 1;
-    size_t srcBatches = 1;
+    size_t srcDepth    = 1;
+    size_t srcBatches  = 1;
     size_t srcChannels = 1;
-    size_t srcHeight = 1;
-    size_t srcWidth = 1;
-    AssignValues(srcShape.GetNumDimensions(),0, srcShape,
+    size_t srcHeight   = 1;
+    size_t srcWidth    = 1;
+    AssignValues(srcShape.GetNumDimensions(),
+                 0,
+                 srcShape,
                  srcWidth,
                  srcHeight,
                  srcChannels,
                  srcBatches,
                  srcDepth);
 
-    size_t srcDepthStride = 0;
-    size_t srcBatchStride = 0;
+    size_t srcDepthStride   = 0;
+    size_t srcBatchStride   = 0;
     size_t srcChannelStride = 0;
-    size_t srcHeightStride = 0;
-    size_t srcWidthStride = 0;
-    AssignValues(srcStrides.GetNumDimensions(),0, srcStrides,
+    size_t srcHeightStride  = 0;
+    size_t srcWidthStride   = 0;
+    AssignValues(srcStrides.GetNumDimensions(),
+                 0,
+                 srcStrides,
                  srcWidthStride,
                  srcHeightStride,
                  srcChannelStride,
                  srcBatchStride,
                  srcDepthStride);
 
-    size_t dstDepth = 1;
-    size_t dstBatches = 1;
+    size_t dstDepth    = 1;
+    size_t dstBatches  = 1;
     size_t dstChannels = 1;
-    size_t dstHeight = 1;
-    size_t dstWidth = 1;
-    AssignValues(dstShape.GetNumDimensions(),0, dstShape,
+    size_t dstHeight   = 1;
+    size_t dstWidth    = 1;
+    AssignValues(dstShape.GetNumDimensions(),
+                 0,
+                 dstShape,
                  dstWidth,
                  dstHeight,
                  dstChannels,
                  dstBatches,
                  dstDepth);
 
-    size_t dstDepthStride = 0;
-    size_t dstBatchStride = 0;
+    size_t dstDepthStride   = 0;
+    size_t dstBatchStride   = 0;
     size_t dstChannelStride = 0;
-    size_t dstHeightStride = 0;
-    size_t dstWidthStride = 0;
-    AssignValues(dstStrides.GetNumDimensions(),0, dstStrides,
+    size_t dstHeightStride  = 0;
+    size_t dstWidthStride   = 0;
+    AssignValues(dstStrides.GetNumDimensions(),
+                 0,
+                 dstStrides,
                  dstWidthStride,
                  dstHeightStride,
                  dstChannelStride,
@@ -105,29 +113,29 @@
     unsigned char* dstData;
     {
         ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Synchronize buffers");
-        srcData = static_cast<const uint8_t *>(srcTensor->Map());
-        dstData = static_cast<uint8_t *>(dstTensor->Map());
+        srcData = static_cast<const uint8_t*>(srcTensor->Map());
+        dstData = static_cast<uint8_t*>(dstTensor->Map());
     }
 
-    size_t copyLength = std::min(srcWidth*srcWidthStride, dstWidth*dstWidthStride);
-    size_t copyHeight = std::min(srcHeight, dstHeight);
+    size_t copyLength   = std::min(srcWidth * srcWidthStride, dstWidth * dstWidthStride);
+    size_t copyHeight   = std::min(srcHeight, dstHeight);
     size_t copyChannels = std::min(srcChannels, dstChannels);
-    size_t copyBatches = std::min(srcBatches, dstBatches);
-    size_t copyDepth = std::min(srcDepth, dstDepth);
+    size_t copyBatches  = std::min(srcBatches, dstBatches);
+    size_t copyDepth    = std::min(srcDepth, dstDepth);
 
-    for (unsigned int d=0; d < copyDepth; ++d)
+    for (unsigned int d = 0; d < copyDepth; ++d)
     {
         auto srcPtrDepth = srcData;
         auto dstPtrDepth = dstData;
-        for (unsigned int b=0; b < copyBatches; ++b)
+        for (unsigned int b = 0; b < copyBatches; ++b)
         {
             auto srcPtrBatch = srcData;
             auto dstPtrBatch = dstData;
-            for (unsigned int c=0; c< copyChannels; ++c)
+            for (unsigned int c = 0; c < copyChannels; ++c)
             {
                 auto srcPtrChannel = srcData;
                 auto dstPtrChannel = dstData;
-                for (unsigned int h=0; h < copyHeight; ++h)
+                for (unsigned int h = 0; h < copyHeight; ++h)
                 {
                     copy(dstData, srcData, copyLength);
                     dstData += dstHeightStride;
@@ -136,11 +144,11 @@
                 dstData += (static_cast<long>(dstChannelStride) - (dstData - dstPtrChannel));
                 srcData += (static_cast<long>(srcChannelStride) - (srcData - srcPtrChannel));
             }
-            dstData += (static_cast<long>(dstBatchStride)-(dstData - dstPtrBatch));
-            srcData += (static_cast<long>(srcBatchStride)-(srcData - srcPtrBatch));
+            dstData += (static_cast<long>(dstBatchStride) - (dstData - dstPtrBatch));
+            srcData += (static_cast<long>(srcBatchStride) - (srcData - srcPtrBatch));
         }
-        dstData += (static_cast<long>(dstDepthStride)-(dstData - dstPtrDepth));
-        srcData += (static_cast<long>(srcDepthStride)-(srcData - srcPtrDepth));
+        dstData += (static_cast<long>(dstDepthStride) - (dstData - dstPtrDepth));
+        srcData += (static_cast<long>(srcDepthStride) - (srcData - srcPtrDepth));
     }
 
     srcTensor->Unmap();
@@ -156,10 +164,10 @@
 
     for (unsigned int i = 0; i < numInputs; ++i)
     {
-        SrcTensorHandleType* const srcTensorHandle = boost::polymorphic_downcast<SrcTensorHandleType*>(
-            descriptor.m_Inputs[i]);
-        DstTensorHandleType* const dstTensorHandle = boost::polymorphic_downcast<DstTensorHandleType*>(
-            descriptor.m_Outputs[i]);
+        SrcTensorHandleType* const srcTensorHandle =
+            boost::polymorphic_downcast<SrcTensorHandleType*>(descriptor.m_Inputs[i]);
+        DstTensorHandleType* const dstTensorHandle =
+            boost::polymorphic_downcast<DstTensorHandleType*>(descriptor.m_Outputs[i]);
 
         tensorHandlePairs.emplace_back(srcTensorHandle, dstTensorHandle);
     }
@@ -177,4 +185,4 @@
                                                      DataLayout dataLayout,
                                                      void* permuteBuffer);
 
-} //namespace armnn
+}  //namespace armnn
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index 561e526..1504806 100644
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -75,6 +75,10 @@
 // 2-channel bias used by a number of Conv2d tests.
 static std::vector<float> Bias2({0, 2});
 
+static std::vector<float> Bias4({1, 2, 3, 4});
+
+static std::vector<float> Bias8({1, 2, 3, 4, 1, 2, 3, 4});
+
 struct Simple3dSoftmaxOutputData
 {
     const std::vector<float> outputData =
@@ -121,6 +125,65 @@
     }
 }
 
+// Helper function that returns either Bias4 or an empty vector depending on whether bias is enabled.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+boost::multi_array<T, 1> GetBias4(bool biasEnabled, float qScale)
+{
+    if(biasEnabled)
+    {
+        armnn::TensorInfo biasDesc({static_cast<unsigned int>(Bias4.size())}, ArmnnType);
+        boost::multi_array<T, 1> bias = MakeTensor<T, 1>(biasDesc, QuantizedVector<T>(qScale, 0.0f, Bias4));
+        return bias;
+    }
+    else
+    {
+        return boost::multi_array<T, 1>();
+    }
+}
+
+// Helper function that returns either Bias8 or an empty vector depending on whether bias is enabled.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+boost::multi_array<T, 1> GetBias8(bool biasEnabled, float qScale)
+{
+    if(biasEnabled)
+    {
+        armnn::TensorInfo biasDesc({static_cast<unsigned int>(Bias8.size())}, ArmnnType);
+        boost::multi_array<T, 1> bias = MakeTensor<T, 1>(biasDesc, QuantizedVector<T>(qScale, 0.0f, Bias8));
+        return bias;
+    }
+    else
+    {
+        return boost::multi_array<T, 1>();
+    }
+}
+
+// Helper function that returns Bias2, Bias4 or Bias8 depending on the number of output channels,
+// or an empty vector when bias is disabled.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+boost::multi_array<T, 1> GetBias(bool biasEnabled, float qScale, armnn::TensorInfo outputInfo, armnn::DataLayout layout)
+{
+    const armnnUtils::DataLayoutIndexed dataLayoutIndexed(layout);
+    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
+    const unsigned int outputChannels = outputInfo.GetShape()[channelsIndex];
+
+    switch (outputChannels)
+    {
+        case 2:
+        default:
+        {
+            return GetBias2<ArmnnType>(biasEnabled, qScale);
+        }
+        case 4:
+        {
+            return GetBias4<ArmnnType>(biasEnabled, qScale);
+        }
+        case 8:
+        {
+            return GetBias8<ArmnnType>(biasEnabled, qScale);
+        }
+    }
+}
+
+
 template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
 LayerTestResult<T, 4> SimpleConvolution2d3x5TestCommon(
     armnn::IWorkloadFactory& workloadFactory,
@@ -1307,7 +1370,7 @@
             memoryManager,
             input,
             kernel,
-            GetBias2<ArmnnBType>(biasEnabled, qScale * qScale),
+            GetBias<ArmnnBType>(biasEnabled, qScale * qScale, outputTensorInfo, layout),
             expectedOutput,
             qScale,
             qOffset,
@@ -1454,6 +1517,166 @@
             biasEnabled);
 }
 
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T>
+LayerTestResult<T, 4> DepthwiseConvolution2dMult4Test(
+            armnn::IWorkloadFactory& workloadFactory,
+            const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+            bool biasEnabled,
+            const armnn::DataLayout layout)
+{
+    armnn::TensorInfo inputTensorInfo({1, 2, 3, 3}, ArmnnType);
+    std::vector<float> inputNoQuantizedValues =
+            {
+                    10.0, 10.0, 10.0,
+                    10.0, 10.0, 10.0,
+                    10.0, 10.0, 10.0,
+
+                    21.0, 22.0, 23.0,
+                    24.0, 25.0, 26.0,
+                    27.0, 28.0, 29.0
+            };
+
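+    // Depthwise weights are [ M, I, H, W ] = [ 4, 2, 2, 2 ]: depth multiplier 4 over 2 input channels with
+    // 2x2 kernels, giving 8 output channels and exercising the ACL channel reordering this commit adds.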
+    armnn::TensorInfo kernelTensorInfo({ 4, 2, 2, 2}, ArmnnType);
+
+    std::vector<float> kernelNoQuantizedValues =
+            {
+                    0.25f, 0.25f,
+                    0.25f, 0.25f,
+
+                    0.25f, 0.25f,
+                    0.25f, 0.25f,
+
+                    0.0f , 0.0f,
+                    0.0f , 0.1f,
+
+                    0.0f , 0.0f,
+                    0.0f , 0.1f,
+
+                    0.2f , 0.0f,
+                    0.0f , 0.0f,
+
+                    0.2f , 0.0f,
+                    0.0f , 0.0f,
+
+                    0.0f , 0.3f,
+                    0.0f , 0.0f,
+
+                    0.0f , 0.3f,
+                    0.0f , 0.0f
+            };
+
+    armnn::TensorInfo outputTensorInfo({ 1, 8, 2, 2}, ArmnnType);
+    std::vector<float> outputExpectedNoQuantizedValues =
+            {
+                    10.f, 10.f,
+                    10.f, 10.f,
+
+                    1.f, 1.f,
+                    1.f, 1.f,
+
+                    2.f, 2.f,
+                    2.f, 2.f,
+
+                    3.f, 3.f,
+                    3.f, 3.f,
+
+                    23.f, 24.f,
+                    26.f, 27.f,
+
+                    2.5f, 2.6000001f,
+                    2.8f, 2.9f,
+
+                    4.2000003f, 4.4f,
+                    4.8f, 5.f,
+
+                    6.6000004f, 6.9f,
+                    7.5000005f, 7.8f
+            };
+
+
+    return DepthwiseConvolution2d3x3DilationTestCommon<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            inputNoQuantizedValues,
+            inputTensorInfo,
+            kernelNoQuantizedValues,
+            kernelTensorInfo,
+            outputExpectedNoQuantizedValues,
+            outputTensorInfo,
+            1,
+            1,
+            layout,
+            biasEnabled);
+}
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T>
+LayerTestResult<T, 4> DepthwiseConvolution2dMult2Test(
+            armnn::IWorkloadFactory& workloadFactory,
+            const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+            bool biasEnabled,
+            const armnn::DataLayout layout)
+{
+    armnn::TensorInfo inputTensorInfo({1, 2, 3, 3}, ArmnnType);
+    std::vector<float> inputNoQuantizedValues =
+            {
+                    10.0, 10.0, 10.0,
+                    10.0, 10.0, 10.0,
+                    10.0, 10.0, 10.0,
+
+                    21.0, 22.0, 23.0,
+                    24.0, 25.0, 26.0,
+                    27.0, 28.0, 29.0
+            };
+
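+    // Depthwise weights are [ M, I, H, W ] = [ 2, 2, 2, 2 ]: depth multiplier 2 over 2 input channels with
+    // 2x2 kernels, giving 4 output channels.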
+    armnn::TensorInfo kernelTensorInfo({ 2, 2, 2, 2}, ArmnnType);
+
+    std::vector<float> kernelNoQuantizedValues =
+            {
+                    0.25f, 0.25f,
+                    0.25f, 0.25f,
+
+                    0.2f , 0.0f,
+                    0.0f , 0.0f,
+
+                    0.0f , 0.0f,
+                    0.0f , 0.1f,
+
+                    0.0f , 0.3f,
+                    0.0f , 0.0f
+
+            };
+
+    armnn::TensorInfo outputTensorInfo({ 1, 4, 2, 2}, ArmnnType);
+    std::vector<float> outputExpectedNoQuantizedValues =
+            {
+                    10.f, 10.f,
+                    10.f, 10.f,
+
+                    1.f, 1.f,
+                    1.f, 1.f,
+
+                    4.2000003f, 4.4f,
+                    4.8f, 5.f,
+
+                    6.6000004f, 6.9f,
+                    7.5000005f, 7.8f
+            };
+
+
+    return DepthwiseConvolution2d3x3DilationTestCommon<ArmnnType, ArmnnBType>(
+            workloadFactory,
+            memoryManager,
+            inputNoQuantizedValues,
+            inputTensorInfo,
+            kernelNoQuantizedValues,
+            kernelTensorInfo,
+            outputExpectedNoQuantizedValues,
+            outputTensorInfo,
+            1,
+            1,
+            layout,
+            biasEnabled);
+}
 
 template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
 DepthwiseConvolution2d3x3Dilation3x3Test<armnn::DataType::Float32, armnn::DataType::Float32>(
@@ -1497,6 +1720,20 @@
         bool,
         armnn::DataLayout);
 
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+DepthwiseConvolution2dMult4Test<armnn::DataType::Float32, armnn::DataType::Float32>(
+        armnn::IWorkloadFactory &workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr &memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+DepthwiseConvolution2dMult2Test<armnn::DataType::Float32, armnn::DataType::Float32>(
+        armnn::IWorkloadFactory &workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr &memoryManager,
+        bool biasEnabled,
+        const armnn::DataLayout layout);
+
 LayerTestResult<float, 4> DepthwiseConvolution2dTest(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index df33aa1..235c5dc 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -148,6 +148,20 @@
     bool biasEnabled,
     const armnn::DataLayout layout);
 
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2dMult4Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
+template<armnn::DataType ArmnnType, armnn::DataType ArmnnBType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> DepthwiseConvolution2dMult2Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    bool biasEnabled,
+    const armnn::DataLayout layout);
+
 LayerTestResult<float, 4> DepthwiseConvolution2dDepthNhwcTest(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,