IVGCVSW-2069 - Implement NHWC Convolution2D for CpuRef
 * Calculate the input and filter indices for NHWC when computing Convolution
 * Add more unit tests for NHWC

Change-Id: I800d649b9b42be2758c445e3b3e76142888c1377
diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp
index 4946515..057b5d8 100755
--- a/src/backends/cl/test/ClLayerTests.cpp
+++ b/src/backends/cl/test/ClLayerTests.cpp
@@ -56,12 +56,15 @@
 
 ARMNN_AUTO_TEST_CASE(SimpleConvolution2d, SimpleConvolution2d3x5Test, true, armnn::DataLayout::NCHW)
 ARMNN_AUTO_TEST_CASE(SimpleConvolution2dNhwc, SimpleConvolution2d3x5Test, true, armnn::DataLayout::NHWC)
-ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3Uint8, SimpleConvolution2d3x3Uint8Test, true)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3Uint8, SimpleConvolution2d3x3Uint8Test, true, armnn::DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3Uint8Nhwc, SimpleConvolution2d3x3Uint8Test, true, armnn::DataLayout::NHWC)
 ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2d, SimpleConvolution2d3x5Test, false, armnn::DataLayout::NCHW)
 ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dNhwc, SimpleConvolution2d3x5Test, false, armnn::DataLayout::NHWC)
 
-ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dSquare, SimpleConvolution2d3x3Test, false)
-ARMNN_AUTO_TEST_CASE(SimpleConvolution2dAsymmetricPadding, Convolution2dAsymmetricPaddingTest)
+ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dSquare, SimpleConvolution2d3x3Test, false, armnn::DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2dAsymmetricPadding, Convolution2dAsymmetricPaddingTest, armnn::DataLayout::NCHW)
+
+ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dSquareNhwc, SimpleConvolution2d3x3Test, false, armnn::DataLayout::NHWC)
 
 ARMNN_AUTO_TEST_CASE(SimpleConvolution2dSquareNhwc, SimpleConvolution2d3x3NhwcTest, false)
 
diff --git a/src/backends/neon/test/NeonLayerTests.cpp b/src/backends/neon/test/NeonLayerTests.cpp
index ffd0d29..349dbf8 100644
--- a/src/backends/neon/test/NeonLayerTests.cpp
+++ b/src/backends/neon/test/NeonLayerTests.cpp
@@ -28,12 +28,18 @@
 
 ARMNN_AUTO_TEST_CASE(SimpleConvolution2d, SimpleConvolution2d3x5Test, true, armnn::DataLayout::NCHW)
 ARMNN_AUTO_TEST_CASE(SimpleConvolution2dNhwc, SimpleConvolution2d3x5Test, true, armnn::DataLayout::NHWC)
-ARMNN_AUTO_TEST_CASE(SimpleConvolution2dSquare, SimpleConvolution2d3x3Test, true)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3Uint8, SimpleConvolution2d3x3Uint8Test, true, armnn::DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3Uint8Nhwc, SimpleConvolution2d3x3Uint8Test, true, armnn::DataLayout::NHWC)
 ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2d, SimpleConvolution2d3x5Test, false, armnn::DataLayout::NCHW)
 ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dNhwc, SimpleConvolution2d3x5Test, false, armnn::DataLayout::NHWC)
 
-ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dSquare, SimpleConvolution2d3x3Test, false)
-ARMNN_AUTO_TEST_CASE(SimpleConvolution2dAsymmetricPadding, Convolution2dAsymmetricPaddingTest)
+ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dSquare, SimpleConvolution2d3x3Test, false, armnn::DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2dAsymmetricPadding, Convolution2dAsymmetricPaddingTest, armnn::DataLayout::NCHW)
+
+ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dSquareNhwc, SimpleConvolution2d3x3Test, false, armnn::DataLayout::NHWC)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2dAsymmetricPaddingNhwc,
+                     Convolution2dAsymmetricPaddingTest,
+                     armnn::DataLayout::NHWC)
 
 ARMNN_AUTO_TEST_CASE(SimpleConvolution2dSquareNhwc, SimpleConvolution2d3x3NhwcTest, false)
 namespace
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 45239be..cb5a1c4 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -20,21 +20,37 @@
 // Convolution
 ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x5, SimpleConvolution2d3x5Test, true, armnn::DataLayout::NCHW)
 ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x5Uint8, SimpleConvolution2d3x5Uint8Test, true, armnn::DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x5Nhwc, SimpleConvolution2d3x5Test, true, armnn::DataLayout::NHWC)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x5Uint8Nhwc, SimpleConvolution2d3x5Uint8Test, true, armnn::DataLayout::NHWC)
 
 ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2d, SimpleConvolution2d3x5Test, false, armnn::DataLayout::NCHW)
 ARMNN_AUTO_TEST_CASE(UnbiasedConvolutionUint8, SimpleConvolution2d3x5Uint8Test, false, armnn::DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dNhwc, SimpleConvolution2d3x5Test, false, armnn::DataLayout::NHWC)
+ARMNN_AUTO_TEST_CASE(UnbiasedConvolutionUint8Nhwc, SimpleConvolution2d3x5Uint8Test, false, armnn::DataLayout::NHWC)
 
 ARMNN_AUTO_TEST_CASE(SimpleConvolution1d, Convolution1dTest, true)
 ARMNN_AUTO_TEST_CASE(SimpleConvolution1dUint8, Convolution1dUint8Test, true)
 
-ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3, SimpleConvolution2d3x3Test, true)
-ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3Uint8, SimpleConvolution2d3x3Uint8Test, true)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3, SimpleConvolution2d3x3Test, true, armnn::DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3Uint8, SimpleConvolution2d3x3Uint8Test, true, armnn::DataLayout::NCHW)
 
-ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dSquare, SimpleConvolution2d3x3Test, false)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3Nhwc, SimpleConvolution2d3x3Test, true, armnn::DataLayout::NHWC)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2d3x3Uint8Nhwc, SimpleConvolution2d3x3Uint8Test, true, armnn::DataLayout::NHWC)
+
+ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dSquare, SimpleConvolution2d3x3Test, false, armnn::DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(UnbiasedConvolution2dSquareNhwc, SimpleConvolution2d3x3Test, false, armnn::DataLayout::NHWC)
 
 ARMNN_AUTO_TEST_CASE(SimpleConvolution2dAsymmetricPaddingLargerThanHalfKernelSize,
-    Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest)
-ARMNN_AUTO_TEST_CASE(SimpleConvolution2dAsymmetricPadding, Convolution2dAsymmetricPaddingTest)
+                     Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest,
+                     armnn::DataLayout::NCHW)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2dAsymmetricPadding, Convolution2dAsymmetricPaddingTest, armnn::DataLayout::NCHW)
+
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2dAsymmetricPaddingLargerThanHalfKernelSizeNhwc,
+                     Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest,
+                     armnn::DataLayout::NHWC)
+ARMNN_AUTO_TEST_CASE(SimpleConvolution2dAsymmetricPaddingNhwc,
+                     Convolution2dAsymmetricPaddingTest,
+                     armnn::DataLayout::NHWC)
 
 ARMNN_AUTO_TEST_CASE(SimpleConvolution2dSquareNhwc, SimpleConvolution2d3x3NhwcTest, false)
 
diff --git a/src/backends/reference/workloads/ConvImpl.hpp b/src/backends/reference/workloads/ConvImpl.hpp
index 60a3622..4b15c1d 100644
--- a/src/backends/reference/workloads/ConvImpl.hpp
+++ b/src/backends/reference/workloads/ConvImpl.hpp
@@ -6,6 +6,7 @@
 #pragma once
 
 #include "RefWorkloadUtils.hpp"
+#include "TensorBufferArrayView.hpp"
 
 #include <armnn/Tensor.hpp>
 
@@ -66,6 +67,10 @@
     const TensorInfo& inputInfo0  = GetTensorInfo(data.m_Inputs[0]);
     const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);
 
+    TensorBufferArrayView<InputType> output(outputInfo0.GetShape(),
+                                            GetOutputTensorData<InputType>(0, data),
+                                            data.m_Parameters.m_DataLayout);
+
     const DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);
     const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
@@ -123,18 +128,41 @@
                                 // Since dimensionality of kernel depends on depthwiseness, so does index.
                                 if (depthwise)
                                 {
-                                    filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter * channelsInput +
-                                                  cInput * widthFilter * heightFilter +
-                                                  yFilter * widthFilter +
-                                                  xFilter;
+                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
+                                    {
+                                        filterIndex = depthwiseMultiplierIdx * heightFilter * widthFilter
+                                                        * channelsInput +
+                                                      yFilter * widthFilter * channelsInput +
+                                                      xFilter * channelsInput +
+                                                      cInput;
+                                    }
+                                    else
+                                    {
+                                        filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter
+                                                        * channelsInput +
+                                                      cInput * widthFilter * heightFilter +
+                                                      yFilter * widthFilter +
+                                                      xFilter;
+                                    }
                                 }
                                 else
                                 {
-                                    filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
-                                                  cInput  * widthFilter * heightFilter +
-                                                  yFilter * widthFilter +
-                                                  xFilter;
+                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
+                                    {
+                                        filterIndex = cOutput * heightFilter * widthFilter * channelsInput +
+                                                      yFilter * widthFilter * channelsInput +
+                                                      xFilter * channelsInput +
+                                                      cInput;
+                                    }
+                                    else
+                                    {
+                                        filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
+                                                      cInput  * widthFilter * heightFilter +
+                                                      yFilter * widthFilter +
+                                                      xFilter;
+                                    }
                                 }
+
                                 AccumulatorType filterValue = filterData[filterIndex] -
                                     boost::numeric_cast<AccumulatorType>(filterOffset);
 
@@ -151,11 +179,27 @@
                                 }
                                 else
                                 {
-                                    inputValue = inputData[batchIdx * widthInput * heightInput * channelsInput +
-                                                                      widthInput * heightInput * cInput +
-                                                                      widthInput * (yInput - paddingTop) +
-                                                                      xInput - paddingLeft] -
-                                        boost::numeric_cast<AccumulatorType>(inputOffset);
+                                    unsigned int inputIndex;
+
+                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
+                                    {
+                                        inputIndex = batchIdx * heightInput * widthInput  * channelsInput +
+                                                     (yInput - paddingTop) * widthInput * channelsInput +
+                                                     (xInput - paddingLeft) * channelsInput +
+                                                     cInput;
+
+                                    }
+                                    else
+                                    {
+                                        inputIndex = batchIdx * widthInput * heightInput * channelsInput +
+                                                     widthInput * heightInput * cInput +
+                                                     widthInput * (yInput - paddingTop) +
+                                                     xInput - paddingLeft;
+                                    }
+
+                                    inputValue = inputData[inputIndex] -
+                                                    boost::numeric_cast<AccumulatorType>(inputOffset);
+
                                 }
                                 sum += filterValue * inputValue;
                             }
@@ -179,10 +223,7 @@
                         sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
                     }
 
-                    outputData[batchIdx * widthOutput * heightOutput * channelsOutput +
-                                          widthOutput * heightOutput * cOutput +
-                                          widthOutput * yOutput +
-                                          xOutput] = boost::numeric_cast<InputType>(sum);
+                    output.Get(batchIdx, cOutput, yOutput, xOutput) = boost::numeric_cast<InputType>(sum);
                 }
             }
         }
diff --git a/src/backends/test/LayerTests.cpp b/src/backends/test/LayerTests.cpp
index e536cc9..bc9e116 100755
--- a/src/backends/test/LayerTests.cpp
+++ b/src/backends/test/LayerTests.cpp
@@ -165,7 +165,8 @@
 LayerTestResult<T, 4> SimpleConvolution2d3x3TestCommon(armnn::IWorkloadFactory& workloadFactory,
                                                        float                    qScale,
                                                        int32_t                  qOffset,
-                                                       bool                     biasEnabled)
+                                                       bool                     biasEnabled,
+                                                       const armnn::DataLayoutIndexed& layout)
 {
     // Use a 3x3 kernel, which exercises ArmCompute's direct convolution path.
 
@@ -228,7 +229,8 @@
       GetBias2<typename FullyConnectedBiasTypeForInputType<T>::Type>(biasEnabled, qScale, qOffset),
       expectedOutput,
       qScale,
-      qOffset);
+      qOffset,
+      layout);
 }
 
 template<typename T>
@@ -294,9 +296,10 @@
 }
 
 LayerTestResult<float, 4> SimpleConvolution2d3x3Test(armnn::IWorkloadFactory& workloadFactory,
-                                                     bool                     biasEnabled)
+                                                     bool                     biasEnabled,
+                                                     const armnn::DataLayoutIndexed& layout)
 {
-    return SimpleConvolution2d3x3TestCommon<float>(workloadFactory, 0.f, 0, biasEnabled);
+    return SimpleConvolution2d3x3TestCommon<float>(workloadFactory, 0.f, 0, biasEnabled, layout);
 }
 
 LayerTestResult<float, 4> SimpleConvolution2d3x3NhwcTest(armnn::IWorkloadFactory& workloadFactory,
@@ -306,14 +309,16 @@
 }
 
 LayerTestResult<uint8_t, 4> SimpleConvolution2d3x3Uint8Test(armnn::IWorkloadFactory& workloadFactory,
-                                                            bool                     biasEnabled)
+                                                            bool                     biasEnabled,
+                                                            const armnn::DataLayoutIndexed& layout)
 {
-    return SimpleConvolution2d3x3TestCommon<uint8_t>(workloadFactory, 0.5f, 50, biasEnabled);
+    return SimpleConvolution2d3x3TestCommon<uint8_t>(workloadFactory, 0.5f, 50, biasEnabled, layout);
 }
 
 template<typename T>
 LayerTestResult<T, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTestCommon(
     armnn::IWorkloadFactory& workloadFactory,
+    const armnn::DataLayoutIndexed& layout,
     float                    qScale,
     int32_t                  qOffset)
 {
@@ -363,7 +368,7 @@
       expectedOutput,
       qScale,
       qOffset,
-      armnn::DataLayout::NCHW,
+      layout,
       1,  // Padding left.
       2,  // Padding top.
       3,  // Padding right.
@@ -372,8 +377,9 @@
 
 template<typename T>
 LayerTestResult<T, 4> SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWorkloadFactory& workloadFactory,
-    float                    qScale,
-    int32_t                  qOffset)
+                                                                     const armnn::DataLayoutIndexed& layout,
+                                                                     float qScale,
+                                                                     int32_t qOffset)
 {
     // Use a single-batch 1-channel 5x5 image as input.
     armnn::TensorInfo inputDesc({ 1, 1, 5, 5 }, armnn::GetDataType<T>());
@@ -415,7 +421,7 @@
         expectedOutput,
         qScale,
         qOffset,
-        armnn::DataLayout::NCHW,
+        layout,
         1,  // Padding left.
         1,  // Padding top.
         2,  // Padding right.
@@ -606,14 +612,16 @@
 }
 
 LayerTestResult<float, 4>
-Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest(armnn::IWorkloadFactory& workloadFactory)
+Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest(armnn::IWorkloadFactory& workloadFactory,
+                                                           const armnn::DataLayoutIndexed& layout)
 {
-    return Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTestCommon<float>(workloadFactory, 0.0f, 0);
+    return Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTestCommon<float>(workloadFactory, layout, 0.0f, 0);
 }
 
-LayerTestResult<float, 4> Convolution2dAsymmetricPaddingTest(armnn::IWorkloadFactory& workloadFactory)
+LayerTestResult<float, 4> Convolution2dAsymmetricPaddingTest(armnn::IWorkloadFactory& workloadFactory,
+                                                             const armnn::DataLayoutIndexed& layout)
 {
-    return SimpleConvolution2dAsymmetricPaddingTestCommon<float>(workloadFactory, 0.0f, 0);
+    return SimpleConvolution2dAsymmetricPaddingTestCommon<float>(workloadFactory, layout, 0.0f, 0);
 }
 
 LayerTestResult<float, 4> DepthwiseConvolution2dTest(armnn::IWorkloadFactory& workloadFactory,
diff --git a/src/backends/test/LayerTests.hpp b/src/backends/test/LayerTests.hpp
index 6c3b9e1..8846297 100644
--- a/src/backends/test/LayerTests.hpp
+++ b/src/backends/test/LayerTests.hpp
@@ -54,18 +54,23 @@
                                                      const armnn::DataLayoutIndexed& layout);
 
 LayerTestResult<float, 4> SimpleConvolution2d3x3Test(armnn::IWorkloadFactory& workloadFactory,
-                                                     bool biasEnabled);
+                                                     bool biasEnabled,
+                                                     const armnn::DataLayoutIndexed& layout);
 
 LayerTestResult<float, 4> SimpleConvolution2d3x3NhwcTest(armnn::IWorkloadFactory& workloadFactory,
                                                          bool biasEnabled);
 
 LayerTestResult<float, 4>
-Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest(armnn::IWorkloadFactory& workloadFactory);
-LayerTestResult<float, 4> Convolution2dAsymmetricPaddingTest(armnn::IWorkloadFactory& workloadFactory);
+Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest(armnn::IWorkloadFactory& workloadFactory,
+                                                           const armnn::DataLayoutIndexed& layout);
+LayerTestResult<float, 4> Convolution2dAsymmetricPaddingTest(armnn::IWorkloadFactory& workloadFactory,
+                                                             const armnn::DataLayoutIndexed& layout);
 
 
-LayerTestResult<float,   4> Convolution1dTest(armnn::IWorkloadFactory& workloadFactory, bool biasEnabled);
-LayerTestResult<uint8_t, 4> Convolution1dUint8Test(armnn::IWorkloadFactory& workloadFactory, bool biasEnabled);
+LayerTestResult<float,   4> Convolution1dTest(armnn::IWorkloadFactory& workloadFactory,
+                                              bool biasEnabled);
+LayerTestResult<uint8_t, 4> Convolution1dUint8Test(armnn::IWorkloadFactory& workloadFactory,
+                                                   bool biasEnabled);
 
 LayerTestResult<float, 4> DepthwiseConvolution2dTest(armnn::IWorkloadFactory& workloadFactory, bool biasEnabled);
 
@@ -317,7 +322,8 @@
                                                             const armnn::DataLayoutIndexed& layout);
 
 LayerTestResult<uint8_t, 4> SimpleConvolution2d3x3Uint8Test(armnn::IWorkloadFactory& workloadFactory,
-                                                            bool                     biasEnabled);
+                                                            bool                     biasEnabled,
+                                                            const armnn::DataLayoutIndexed& layout);
 
 LayerTestResult<uint8_t, 4> DepthwiseConvolution2dUint8Test(armnn::IWorkloadFactory& workloadFactory,
                                                             bool                     biasEnabled);