MLCE-77 Depthwise Convolution with depth multiplier > 1 doesn't work

 * Unified ArmNN's weight format to [ M, I, H, W ] for depthwise convolution,
   regardless of the data layout (see the worked example below)
 * Added conversion utilities to permute/reshape the weights as appropriate
   when using the CL and Neon backends
 * Updated the reference implementation of depthwise convolution
 * Updated the relevant unit tests accordingly
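
   For illustration (hypothetical shapes): with inputChannels I = 2, a depth
   multiplier M = 3 and a 3x3 kernel, the weights are now always stored as
   [ 3, 2, 3, 3 ]. For ACL this is reshaped to [ 1, 6, 3, 3 ] (NCHW), or
   permuted to [ H, W, I, M ] and reshaped to [ 1, 3, 3, 6 ] (NHWC), matching
   the I * M = 6 output channels.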

!android-nn-driver:459

Change-Id: I07d0818efa9d1ca1e5dad82983aac1fe78eadb18
diff --git a/src/backends/backendsCommon/CMakeLists.txt b/src/backends/backendsCommon/CMakeLists.txt
index f295630..b120f51 100644
--- a/src/backends/backendsCommon/CMakeLists.txt
+++ b/src/backends/backendsCommon/CMakeLists.txt
@@ -27,6 +27,7 @@
     WorkloadFactory.hpp
     Workload.hpp
     WorkloadInfo.hpp
+    WorkloadUtils.cpp
     WorkloadUtils.hpp
 )
 
diff --git a/src/backends/backendsCommon/CpuTensorHandle.cpp b/src/backends/backendsCommon/CpuTensorHandle.cpp
index fe0c634..9dcd3f3 100644
--- a/src/backends/backendsCommon/CpuTensorHandle.cpp
+++ b/src/backends/backendsCommon/CpuTensorHandle.cpp
@@ -18,7 +18,7 @@
 }
 
 template <>
-const void* ConstCpuTensorHandle::GetConstTensor() const
+const void* ConstCpuTensorHandle::GetConstTensor<void>() const
 {
     return m_Memory;
 }
@@ -30,7 +30,7 @@
 }
 
 template <>
-void* CpuTensorHandle::GetTensor() const
+void* CpuTensorHandle::GetTensor<void>() const
 {
     return m_MutableMemory;
 }
diff --git a/src/backends/backendsCommon/CpuTensorHandle.hpp b/src/backends/backendsCommon/CpuTensorHandle.hpp
index ae13d6c..b88a0d3 100644
--- a/src/backends/backendsCommon/CpuTensorHandle.hpp
+++ b/src/backends/backendsCommon/CpuTensorHandle.hpp
@@ -72,6 +72,9 @@
     const void* m_Memory;
 };
 
+template<>
+const void* ConstCpuTensorHandle::GetConstTensor<void>() const;
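+// The specialization above is defined in CpuTensorHandle.cpp; declaring it in the header
+// ensures other translation units use it rather than instantiating the primary template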
+
 // Abstract specialization of ConstCpuTensorHandle that allows write access to the same data.
 class CpuTensorHandle : public ConstCpuTensorHandle
 {
@@ -99,6 +102,9 @@
     void* m_MutableMemory;
 };
 
+template <>
+void* CpuTensorHandle::GetTensor<void>() const;
+
 // A CpuTensorHandle that owns the wrapped memory region.
 class ScopedCpuTensorHandle : public CpuTensorHandle
 {
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 8847b4e..1dac498 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -593,9 +593,10 @@
 
     const unsigned int channelIndex = (m_Parameters.m_DataLayout == DataLayout::NCHW) ? 1 : 3;
 
-    //inputChannels * channelMultiplier should be equal to outputChannels.
+    // Expected weight shape: [ M, I, H, W ] - This shape does NOT depend on the data layout
+    // inputChannels * channelMultiplier should be equal to outputChannels.
     const unsigned int numWeightChannelMultiplier = m_Weight->GetTensorInfo().GetShape()[0];
-    const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[channelIndex];
+    const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[1];
     const unsigned int numWeightOutputChannels = workloadInfo.m_OutputTensorInfos[0].GetShape()[channelIndex];
     if (numWeightChannelMultiplier * numWeightInputChannels != numWeightOutputChannels)
     {
diff --git a/src/backends/backendsCommon/WorkloadUtils.cpp b/src/backends/backendsCommon/WorkloadUtils.cpp
new file mode 100644
index 0000000..fa387a7
--- /dev/null
+++ b/src/backends/backendsCommon/WorkloadUtils.cpp
@@ -0,0 +1,111 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "WorkloadUtils.hpp"
+
+namespace armnn
+{
+
+armnn::ConstTensor PermuteTensor(const ConstCpuTensorHandle* tensor,
+                                 const PermutationVector& permutationVector,
+                                 void* permuteBuffer)
+{
+    BOOST_ASSERT_MSG(tensor, "Invalid input tensor");
+    BOOST_ASSERT_MSG(permuteBuffer, "Invalid permute buffer");
+
+    TensorInfo tensorInfo = tensor->GetTensorInfo();
+
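+    // An empty permutation vector means no permutation: the tensor data is copied to the
+    // buffer unchanged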
+    if (permutationVector.GetSize() > 0)
+    {
+        tensorInfo = armnnUtils::Permuted(tensorInfo, permutationVector);
+        armnnUtils::Permute(tensorInfo.GetShape(), permutationVector,
+                            tensor->GetConstTensor<void>(), permuteBuffer,
+                            GetDataTypeSize(tensorInfo.GetDataType()));
+    }
+    else
+    {
+        ::memcpy(permuteBuffer, tensor->GetConstTensor<void>(), tensorInfo.GetNumBytes());
+    }
+
+    return ConstTensor(tensorInfo, permuteBuffer);
+}
+
+void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout)
+{
+    // Reshape the weights in-place
+    const TensorShape& weightShape = weightInfo.GetShape();
+    switch (dataLayout)
+    {
+        case DataLayout::NHWC:
+            // The data layout is NHWC, so reshape from [ H, W, I, M ] to [ 1, H, W, I * M ]
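+            // e.g. (hypothetical shapes) [ 3, 3, 2, 4 ] -> [ 1, 3, 3, 8 ]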
+            weightInfo.SetShape({ 1,
+                                  weightShape[0],
+                                  weightShape[1],
+                                  weightShape[2] * weightShape[3] });
+            break;
+        case DataLayout::NCHW:
+        default:
+            // The data layout is NCHW, so reshape from [ M, I, H, W ] to [ 1, I * M, H, W ]
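+            // e.g. (hypothetical shapes) [ 4, 2, 3, 3 ] -> [ 1, 8, 3, 3 ]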
+            weightInfo.SetShape({ 1,
+                                  weightShape[0] * weightShape[1],
+                                  weightShape[2],
+                                  weightShape[3] });
+            break;
+    }
+}
+
+TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout)
+{
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (which does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the Arm Compute Library
+
+    // 1. Permute the weights if necessary
+    // If the data layout is NCHW no permutation is necessary, as the reshape to [ 1, I * M, H, W ] can be done
+    // directly from the current shape of [ M, I, H, W ]
+    TensorInfo weightPermutedInfo(weightInfo);
+    if (dataLayout == DataLayout::NHWC)
+    {
+        // The data layout is NHWC, so permute the weights from [ M, I, H, W ] to [ H, W, I, M ]
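+        // Each element of an ArmNN PermutationVector is the destination position of the
+        // corresponding source dimension, so { 3, 2, 0, 1 } maps M->3, I->2, H->0, W->1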
+        PermutationVector permutationVector{ 3, 2, 0, 1 };
+        weightPermutedInfo = armnnUtils::Permuted(weightInfo, permutationVector);
+    }
+
+    // 2. Reshape the weights
+    ReshapeWeightsForAcl(weightPermutedInfo, dataLayout);
+
+    // 3. Return the permuted weight info
+    return weightPermutedInfo;
+}
+
+armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstCpuTensorHandle* weightTensor,
+                                                     DataLayout dataLayout,
+                                                     void* permuteBuffer)
+{
+    BOOST_ASSERT_MSG(weightTensor, "Invalid input tensor");
+    BOOST_ASSERT_MSG(permuteBuffer, "Invalid permute buffer");
+
+    // Convert the weight format from ArmNN's [ M, I, H, W ] (which does NOT depend on the data layout) to either
+    // [ 1, H, W, I * M ] (if NHWC) or [ 1, I * M, H, W ] (if NCHW), as required by the Arm Compute Library
+
+    // 1. Permute the weights if necessary
+    // If the data layout is NCHW no permutation is necessary, as the reshape to [ 1, I * M, H, W ] can be done
+    // directly from the current shape of [ M, I, H, W ]
+    // If no permutation is necessary, leave the permutation vector empty
+    PermutationVector permutationVector{};
+    if (dataLayout == DataLayout::NHWC)
+    {
+        // The data layout is NHWC, so permute the weights from [ M, I, H, W ] to [ H, W, I, M ]
+        permutationVector = { 3, 2, 0, 1 };
+    }
+    ConstTensor weightPermuted = PermuteTensor(weightTensor, permutationVector, permuteBuffer);
+
+    // 2. Reshape the weights
+    ReshapeWeightsForAcl(weightPermuted.GetInfo(), dataLayout);
+
+    // 3. Return the permuted tensor; it references the caller-provided permuteBuffer,
+    //    which must stay alive for as long as the returned tensor is in use
+    return weightPermuted;
+}
+
+} // namespace armnn
diff --git a/src/backends/backendsCommon/WorkloadUtils.hpp b/src/backends/backendsCommon/WorkloadUtils.hpp
index 2b07b2b..a1a8d2a 100644
--- a/src/backends/backendsCommon/WorkloadUtils.hpp
+++ b/src/backends/backendsCommon/WorkloadUtils.hpp
@@ -6,35 +6,42 @@
 #pragma once
 
 #include "ITensorHandle.hpp"
+#include "CpuTensorHandle.hpp"
 
 #include <armnn/Tensor.hpp>
 
+#include <Permute.hpp>
+#include <Profiling.hpp>
+#include <Half.hpp>
+
 #include <boost/cast.hpp>
 
 namespace armnn
 {
 namespace
 {
+
 template<typename ArrayType, typename Arg>
 void AssignValues(unsigned int num, unsigned int& idx, const ArrayType& array, Arg& arg)
 {
- if (idx >= num)
- {
-     return;
- }
+    if (idx >= num)
+    {
+        return;
+    }
 
- arg = array[(num - 1) - idx];
- idx++;
-};
+    arg = array[(num - 1) - idx];
+    idx++;
+}
 
 template<typename T, typename ArrayType, typename ...Args>
 void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args& ... args)
 {
- AssignValues(num, idx, array, assignee);
+    AssignValues(num, idx, array, assignee);
 
- AssignValues(num, idx, array, args...);
+    AssignValues(num, idx, array, args...);
 }
-} // namespace
+
+} // anonymous namespace
 
 template<typename CopyFunc>
 void CopyTensorContentsGeneric(const ITensorHandle* srcTensor, ITensorHandle* dstTensor, CopyFunc copy)
@@ -142,4 +149,16 @@
     }
 }
 
-} //namespace armnn
\ No newline at end of file
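+// Helpers for converting depthwise convolution weights from ArmNN's [ M, I, H, W ] layout
+// to the layout required by the CL and Neon backends (implemented in WorkloadUtils.cpp)
+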
+armnn::ConstTensor PermuteTensor(const ConstCpuTensorHandle* tensor,
+                                 const PermutationVector& permutationVector,
+                                 void* permuteBuffer);
+
+void ReshapeWeightsForAcl(TensorInfo& weightInfo, DataLayout dataLayout);
+
+TensorInfo ConvertWeightTensorInfoFromArmnnToAcl(const TensorInfo& weightInfo, DataLayout dataLayout);
+
+armnn::ConstTensor ConvertWeightTensorFromArmnnToAcl(const ConstCpuTensorHandle* weightTensor,
+                                                     DataLayout dataLayout,
+                                                     void* permuteBuffer);
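+// Typical usage (sketch): allocate a permute buffer of at least
+// weightTensor->GetTensorInfo().GetNumBytes() bytes, then call
+// ConvertWeightTensorFromArmnnToAcl(weightTensor, dataLayout, buffer)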
+
+} //namespace armnn
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index a66b5c4..4e79bfc 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -14,7 +14,8 @@
     MemCopyWorkload.cpp \
     OutputHandler.cpp \
     WorkloadData.cpp \
-    WorkloadFactory.cpp
+    WorkloadFactory.cpp \
+    WorkloadUtils.cpp
 
 # COMMON_TEST_SOURCES contains the list of files to be included
 # in the Android unit test build (armnn-tests) and it is picked
diff --git a/src/backends/backendsCommon/test/Conv2dTestImpl.hpp b/src/backends/backendsCommon/test/Conv2dTestImpl.hpp
index 37fa0f6..2ff66b0 100755
--- a/src/backends/backendsCommon/test/Conv2dTestImpl.hpp
+++ b/src/backends/backendsCommon/test/Conv2dTestImpl.hpp
@@ -327,7 +327,7 @@
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
     const boost::multi_array<T, 4>& input,
-    const boost::multi_array<T, 4>& originalKernel,
+    const boost::multi_array<T, 4>& kernel,
     const boost::multi_array<B, 1>& bias,
     const boost::multi_array<T, 4>& outputExpected,
     float qScale,
@@ -344,10 +344,10 @@
     unsigned int inputChannels  = boost::numeric_cast<unsigned int>(input.shape()[1]);
     unsigned int inputHeight    = boost::numeric_cast<unsigned int>(input.shape()[2]);
     unsigned int inputWidth     = boost::numeric_cast<unsigned int>(input.shape()[3]);
-    unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(originalKernel.shape()[0]);
-    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(originalKernel.shape()[1]);
-    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(originalKernel.shape()[2]);
-    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(originalKernel.shape()[3]);
+    unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(kernel.shape()[0]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
+    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
     unsigned int outputNum      = boost::numeric_cast<unsigned int>(outputExpected.shape()[0]);
     unsigned int outputChannels = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]);
     unsigned int outputHeight   = boost::numeric_cast<unsigned int>(outputExpected.shape()[2]);
@@ -362,8 +362,7 @@
             armnnUtils::GetTensorInfo<T>(inputNum, inputChannels, inputHeight, inputWidth, layout);
     armnn::TensorInfo outputTensorInfo =
             armnnUtils::GetTensorInfo<T>(outputNum, outputChannels, outputHeight, outputWidth, layout);
-    armnn::TensorInfo kernelDesc =
-            armnnUtils::GetTensorInfo<T>(kernelChanMul, kernelChannels, kernelHeight, kernelWidth, layout);
+    armnn::TensorInfo kernelDesc({kernelChanMul, kernelChannels, kernelHeight, kernelWidth}, armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -423,13 +422,6 @@
 
     armnn::ScopedCpuTensorHandle weightsTensor(kernelDesc);
 
-    // Permute the kernel if necessary
-    boost::multi_array<T, 4> kernel = boost::multi_array<T, 4>(originalKernel);
-    if (layout == armnn::DataLayout::NHWC)
-    {
-        armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, originalKernel.data(), kernel.data());
-    }
-
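+    // The kernel is no longer permuted here: the workloads consume ArmNN's [ M, I, H, W ]
+    // weight layout directly and convert it for the backend as needed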
     AllocateAndCopyDataToITensorHandle(&weightsTensor, &kernel[0][0][0][0]);
 
     armnn::ScopedCpuTensorHandle biasTensor(biasDesc);
@@ -484,6 +476,7 @@
     unsigned int kernelHeight = 3;
     unsigned int kernelWidth = 3;
     unsigned int kernelChannels = inputChannels;
+    unsigned int kernelDepthMultiplier = 1;
 
     unsigned int outputHeight = 1;
     unsigned int outputWidth = 1;
@@ -494,7 +487,8 @@
             armnnUtils::GetTensorInfo<T>(inputNum, inputChannels, inputHeight, inputWidth, layout);
     armnn::TensorInfo outputTensorInfo =
             armnnUtils::GetTensorInfo<T>(outputNum, outputChannels, outputHeight, outputWidth, layout);
-    armnn::TensorInfo kernelDesc = armnnUtils::GetTensorInfo<T>(1, outputChannels, kernelHeight, kernelWidth, layout);
+    armnn::TensorInfo kernelDesc({kernelDepthMultiplier, kernelChannels, kernelHeight, kernelWidth},
+                                 armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({ outputChannels }, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -543,12 +537,6 @@
                     0.f, 0.f,  0.f,
                     -1.f, 0.f, -1.f,
             }));
-    if (layout == armnn::DataLayout::NHWC)
-    {
-        std::vector<T> tmp(kernelData.size());
-        armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, kernelData.data(), tmp.data());
-        kernelData = tmp;
-    }
     auto kernel = MakeTensor<T, 4>(kernelDesc, kernelData);
 
     // Manually calculated.
@@ -642,8 +630,8 @@
             inputBatchSize, inputChannels, inputHeight, inputWidth, layout);
     armnn::TensorInfo outputTensorInfo = armnnUtils::GetTensorInfo<T>(
             outputBatchSize, outputChannels, outputHeight, outputWidth, layout);
-    armnn::TensorInfo kernelDesc = armnnUtils::GetTensorInfo<T>(
-            depthMultiplier, inputChannels, kernelHeight, kernelWidth, layout);
+    armnn::TensorInfo kernelDesc({depthMultiplier, inputChannels, kernelHeight, kernelWidth},
+                                 armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({outputChannels}, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -692,7 +680,7 @@
         {0, 2, 1, -1}));
     auto bias = MakeTensor<B, 1>(biasDesc, biasV);
 
-    std::vector<T> originalKernelData = std::vector<T>(
+    std::vector<T> kernelData = std::vector<T>(
             QuantizedVector<T>(kernelDesc.GetQuantizationScale(), kernelDesc.GetQuantizationOffset(), {
                     1, 1, 1,
                     1, -1, 1,
@@ -717,12 +705,8 @@
                     0, 1, 0,
                     0, 0, 0,
                     0, 0, 0
             }));
-    std::vector<T> kernelData = originalKernelData;
-    if (layout == armnn::DataLayout::NHWC)
-    {
-        armnnUtils::Permute(kernelDesc.GetShape(), NCHWToNHWC, originalKernelData.data(), kernelData.data());
-    }
     auto kernel = MakeTensor<T, 4>(kernelDesc, kernelData);
 
     // Manually calculated.
@@ -840,9 +824,9 @@
     unsigned int inputWidth     = boost::numeric_cast<unsigned int>(input.shape()[2]);
 
     unsigned int kernelChanMul  = boost::numeric_cast<unsigned int>(kernel.shape()[0]);
-    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
-    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
-    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+    unsigned int kernelChannels = boost::numeric_cast<unsigned int>(kernel.shape()[1]);
+    unsigned int kernelHeight   = boost::numeric_cast<unsigned int>(kernel.shape()[2]);
+    unsigned int kernelWidth    = boost::numeric_cast<unsigned int>(kernel.shape()[3]);
 
     unsigned int outputNum      = boost::numeric_cast<unsigned int>(outputExpected.shape()[0]);
     unsigned int outputChannels = boost::numeric_cast<unsigned int>(outputExpected.shape()[3]);
@@ -853,7 +837,7 @@
     armnn::TensorInfo inputTensorInfo({inputNum, inputHeight, inputWidth, inputChannels}, armnn::GetDataType<T>());
     armnn::TensorInfo outputTensorInfo({outputNum, outputHeight, outputWidth, outputChannels},
                                        armnn::GetDataType<T>());
-    armnn::TensorInfo kernelDesc({kernelChanMul, kernelHeight, kernelWidth, kernelChannels}, armnn::GetDataType<T>());
+    armnn::TensorInfo kernelDesc({kernelChanMul, kernelChannels, kernelHeight, kernelWidth}, armnn::GetDataType<T>());
     armnn::TensorInfo biasDesc({static_cast<unsigned int>(bias.size())}, armnn::GetDataType<B>());
 
     // Set quantization parameters if the requested type is a quantized type.
@@ -1068,10 +1052,10 @@
     armnn::TensorInfo kernelDesc;
     armnn::TensorInfo biasDesc;
 
-    unsigned int inputShape[]    = {inputNum, inputChannels, inputHeight, inputWidth};
-    unsigned int outputShape[]   = {outputNum, outputChannels, outputHeight, outputWidth};
-    unsigned int kernelShape[]   = {outputChannels, inputChannels, kernelHeight, kernelWidth};
-    unsigned int biasShape[]     = {outputChannels};
+    unsigned int inputShape[]  = {inputNum, inputChannels, inputHeight, inputWidth};
+    unsigned int outputShape[] = {outputNum, outputChannels, outputHeight, outputWidth};
+    unsigned int kernelShape[] = {outputChannels, inputChannels, kernelHeight, kernelWidth};
+    unsigned int biasShape[]   = {outputChannels};
 
     inputTensorInfo = armnn::TensorInfo(4, inputShape, armnn::GetDataType<T>());
     outputTensorInfo = armnn::TensorInfo(4, outputShape, armnn::GetDataType<T>());
@@ -1171,19 +1155,17 @@
 
     std::vector<unsigned int> inputShape;
     std::vector<unsigned int> outputShape;
-    std::vector<unsigned int> kernelShape;
-    std::vector<unsigned int> biasShape= { outputChannels };
+    std::vector<unsigned int> kernelShape{ channelMultiplier, inputChannels, kernelHeight, kernelWidth };
+    std::vector<unsigned int> biasShape{ outputChannels };
     switch (layout.GetDataLayout())
     {
         case armnn::DataLayout::NCHW:
             inputShape =  { inputNum, inputChannels, inputHeight, inputWidth };
             outputShape = { outputNum, outputChannels, outputHeight, outputWidth };
-            kernelShape = { channelMultiplier, inputChannels, kernelHeight, kernelWidth };
             break;
         case armnn::DataLayout ::NHWC:
             inputShape =  { inputNum, inputHeight, inputWidth, inputChannels };
             outputShape = { outputNum, outputHeight, outputWidth, outputChannels };
-            kernelShape = { channelMultiplier, kernelHeight, kernelWidth, inputChannels };
             break;
         default:
             throw armnn::InvalidArgumentException("unknown data layout ["
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index ddf0d0b..819b9d6 100755
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -661,28 +661,18 @@
             24, 49
         })));
 
-    armnn::TensorInfo kernelTensorInfo({ 1, 4, 4, 2}, armnn::GetDataType<T>());
+    armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, armnn::GetDataType<T>());
     auto kernel = MakeTensor<T, 4>(kernelTensorInfo, std::vector<T>(
         QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(), {
-             32, 16,
-             31, 15,
-             30, 14,
-             29, 13,
+             32, 31, 30, 29,
+             28, 27, 26, 25,
+             24, 23, 22, 21,
+             20, 19, 18, 17,
 
-             28, 12,
-             27, 11,
-             26, 10,
-             25,  9,
-
-             24,  8,
-             23,  7,
-             22,  6,
-             21,  5,
-
-             20,  4,
-             19,  3,
-             18,  2,
-             17,  1
+             16, 15, 14, 13,
+             12, 11, 10,  9,
+              8,  7,  6,  5,
+              4,  3,  2,  1
         })));
 
     armnn::TensorInfo outputTensorInfo({ 1, 5, 5, 2}, armnn::GetDataType<T>());