IVGCVSW-2264 Remove input swizzling from ParseConv2D in the TF parser

 * Removed the input swizzling when the data layout is NHWC
 * Permuted the weights depending on the data layout used (see the sketch below)
 * Added getter methods to ParsedConstTfOperation to get the tensor
   info and the storage memory area, needed for swizzling the weights
 * Added unit tests for both NHWC and NCHW data layouts

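For reference, a minimal sketch of the new weight handling, using the same
armnnUtils::Permuted/Permute helpers as the patch (the free function and its
name are hypothetical, for illustration only; it assumes the armnn/armnnUtils
headers already included by TfParser.cpp):

    // TensorFlow stores Conv2D filters as [H, W, In, Out]; ArmNN expects
    // [Out, H, W, In] for NHWC and [Out, In, H, W] for NCHW.
    armnn::ConstTensor PermuteTfWeights(const armnn::TensorInfo& tfInfo,
                                        const float* tfData,
                                        armnn::DataLayout layout,
                                        std::vector<float>& permutedStorage)
    {
        // Each entry maps a source dimension to its destination index,
        // e.g. { 1, 2, 3, 0 } sends H->1, W->2, In->3 and Out->0.
        const armnn::PermutationVector perm = layout == armnn::DataLayout::NHWC ?
            armnn::PermutationVector{ 1, 2, 3, 0 } : // [H, W, In, Out] -> [Out, H, W, In]
            armnn::PermutationVector{ 2, 3, 1, 0 };  // [H, W, In, Out] -> [Out, In, H, W]

        const armnn::TensorInfo permutedInfo = armnnUtils::Permuted(tfInfo, perm);
        permutedStorage.resize(tfInfo.GetNumElements());
        armnnUtils::Permute(permutedInfo.GetShape(), perm, tfData, permutedStorage.data());
        return armnn::ConstTensor(permutedInfo, permutedStorage);
    }

For example, a 3x3 filter with 2 input and 4 output channels (TF shape
[3, 3, 2, 4]) becomes [4, 3, 3, 2] for NHWC and [4, 2, 3, 3] for NCHW.
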
Change-Id: I6543900c594417df630b2663d8551158b93b7836
diff --git a/src/armnnTfParser/TfParser.cpp b/src/armnnTfParser/TfParser.cpp
index 53cdfa3..b40b054 100644
--- a/src/armnnTfParser/TfParser.cpp
+++ b/src/armnnTfParser/TfParser.cpp
@@ -14,6 +14,7 @@
 #include <ParserHelper.hpp>
 #include <Permute.hpp>
 #include <VerificationHelpers.hpp>
+#include <DataLayoutIndexed.hpp>
 
 #include <google/protobuf/io/zero_copy_stream_impl.h>
 #include <google/protobuf/text_format.h>
@@ -36,6 +37,7 @@
 #include <numeric>
 #include <functional>
 
+using namespace armnnUtils;
 using namespace armnn;
 
 namespace armnnTfParser
@@ -752,6 +754,18 @@
         return constTensor;
     }
 
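+    /// Returns a pointer to the tensor data held in this operation's permanent storage.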
+    const T* GetStorage() const
+    {
+        return m_Storage.data();
+    }
+
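+    /// Returns the info of the constant tensor.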
+    const TensorInfo& GetTensorInfo() const
+    {
+        return m_TensorInfo;
+    }
+
 private:
     ///< Manages the lifetime of the tensor data.
     std::vector<T> m_Storage;
@@ -1059,59 +1071,86 @@
 
     CHECK_DATA_FORMAT(nodeDef, dataFormat, "Conv2D");
 
-    if (dataFormat == "NHWC")
-    {
-        desc.m_StrideX = strides[2];
-        desc.m_StrideY = strides[1];
-        // Swizzles input to supported memory layout.
-        inputTensorInfo = armnnUtils::Permuted(inputSlot.GetTensorInfo(), NHWCToArmNN);
-    }
-    else if (dataFormat == "NCHW")
-    {
-        desc.m_StrideX = strides[3];
-        desc.m_StrideY = strides[2];
-    }
+    DataLayout dataLayout = (dataFormat == "NHWC") ? DataLayout::NHWC : DataLayout::NCHW;
 
-    uint32_t inputHeight = inputTensorInfo.GetShape()[2];
-    uint32_t inputWidth = inputTensorInfo.GetShape()[3];
+    desc.m_DataLayout = dataLayout;
 
-    std::vector<float> outputTensorData;
+    DataLayoutIndexed dataLayoutIndexed(dataLayout);
 
-    ConstTensor weightTensor = weightNode->GetConstTensor(true, outputTensorData);
+    desc.m_StrideX = strides[dataLayoutIndexed.GetWidthIndex()];
+    desc.m_StrideY = strides[dataLayoutIndexed.GetHeightIndex()];
 
-    uint32_t weightHeight = weightTensor.GetShape()[2];
-    uint32_t weightWidth = weightTensor.GetShape()[3];
+    uint32_t inputHeight = inputTensorInfo.GetShape()[dataLayoutIndexed.GetHeightIndex()];
+    uint32_t inputWidth  = inputTensorInfo.GetShape()[dataLayoutIndexed.GetWidthIndex()];
+
+    // Mapping from TensorFlow filter tensors to ArmNN filter tensors.
+    // TensorFlow weights are [H, W, In, Out].
+    // ArmNN weights have to be [Out, H, W, In] when the data layout is NHWC,
+    // and [Out, In, H, W] when the data layout is NCHW.
+    PermutationVector permutationVector =
+            dataLayout == DataLayout::NHWC ?
+                std::initializer_list<unsigned int>{ 1, 2, 3, 0 } : // NHWC: [H, W, In, Out] -> [Out, H, W, In]
+                std::initializer_list<unsigned int>{ 2, 3, 1, 0 };  // NCHW: [H, W, In, Out] -> [Out, In, H, W]
+
+    // Swizzle the tensor info using the given permutation vector.
+    const TensorInfo& weightTensorInfo = weightNode->GetTensorInfo();
+    const TensorInfo weightTensorSwizzledInfo = armnnUtils::Permuted(weightTensorInfo, permutationVector);
+
+    // Swizzle the content of the tensor's permanent storage into local storage.
+    std::vector<float> weightTensorSwizzledData(weightTensorInfo.GetNumElements());
+    armnnUtils::Permute(weightTensorSwizzledInfo.GetShape(), permutationVector,
+                        weightNode->GetStorage(), weightTensorSwizzledData.data());
+
+    // Create a weight tensor with the newly swizzled data.
+    ConstTensor weightTensor(weightTensorSwizzledInfo, weightTensorSwizzledData);
+
+    uint32_t weightHeight = weightTensor.GetShape()[dataLayoutIndexed.GetHeightIndex()];
+    uint32_t weightWidth  = weightTensor.GetShape()[dataLayoutIndexed.GetWidthIndex()];
 
     bool padding = false;
     TensorInfo outputInfo;
+    unsigned int outputHeight = 0;
+    unsigned int outputWidth = 0;
 
     CHECK_PADDING_TYPE(nodeDef, paddingString);
 
     if (paddingString == "SAME")
     {
         padding = true;
-        outputInfo = TensorInfo({ inputTensorInfo.GetShape()[0],
-                                  weightTensor.GetShape()[0],
-                                  static_cast<uint32_t>(ceil(
-                                      static_cast<float>(inputHeight) /
-                                      static_cast<float>(desc.m_StrideY))),
-                                  static_cast<uint32_t>(ceil(
-                                      static_cast<float>(inputWidth) /
-                                      static_cast<float>(desc.m_StrideX)))
-                                }, DataType::Float32);
+
+        outputHeight = static_cast<uint32_t>(ceil(static_cast<float>(inputHeight) /
+                                                  static_cast<float>(desc.m_StrideY)));
+        outputWidth  = static_cast<uint32_t>(ceil(static_cast<float>(inputWidth) /
+                                                  static_cast<float>(desc.m_StrideX)));
     }
     else if (paddingString == "VALID")
     {
         padding = false;
+
+        outputHeight = static_cast<uint32_t>(ceil(static_cast<float>(inputHeight - weightHeight + 1) /
+                                                  static_cast<float>(desc.m_StrideY)));
+        outputWidth  = static_cast<uint32_t>(ceil(static_cast<float>(inputWidth - weightWidth + 1) /
+                                                  static_cast<float>(desc.m_StrideX)));
+    }
+
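+    // Build the output shape in the selected data layout's dimension order.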
+    switch (dataLayout)
+    {
+    case DataLayout::NHWC:
+        outputInfo = TensorInfo({ inputTensorInfo.GetShape()[0],
+                                  outputHeight,
+                                  outputWidth,
+                                  weightTensor.GetShape()[0] },
+                                DataType::Float32);
+        break;
+    case DataLayout::NCHW:
+    default:
         outputInfo = TensorInfo({ inputTensorInfo.GetShape()[0],
                                   weightTensor.GetShape()[0],
-                                  static_cast<uint32_t>(ceil(
-                                      static_cast<float>(inputHeight - weightHeight + 1) /
-                                      static_cast<float>(desc.m_StrideY))),
-                                  static_cast<uint32_t>(ceil(
-                                      static_cast<float>(inputWidth - weightWidth + 1) /
-                                      static_cast<float>(desc.m_StrideX)))
-                                }, DataType::Float32);
+                                  outputHeight,
+                                  outputWidth },
+                                DataType::Float32);
+        break;
     }
 
     CalcPadding(inputHeight, weightHeight, desc.m_StrideY, desc.m_PadTop, desc.m_PadBottom, padding);
@@ -1119,15 +1157,7 @@
 
     IConnectableLayer* layer = m_Network->AddConvolution2dLayer(desc, weightTensor, nodeDef.name().c_str());
     layer->GetOutputSlot(0).SetTensorInfo(outputInfo);
-
-    if (dataFormat == "NHWC")
-    {
-        layer = SwizzleInDeswizzleOut(*m_Network, inputSlot, *layer, nodeDef.name());
-    }
-    else
-    {
-        inputSlot.Connect(layer->GetInputSlot(0));
-    }
+    inputSlot.Connect(layer->GetInputSlot(0));
 
     return std::make_unique<SingleLayerParsedTfOperation>(this, nodeDef, layer);
 }
diff --git a/src/armnnTfParser/test/Convolution2d.cpp b/src/armnnTfParser/test/Convolution2d.cpp
index cc534df..aead1fe 100644
--- a/src/armnnTfParser/test/Convolution2d.cpp
+++ b/src/armnnTfParser/test/Convolution2d.cpp
@@ -6,6 +6,8 @@
 #include <boost/test/unit_test.hpp>
 #include "armnnTfParser/ITfParser.hpp"
 #include "ParserPrototxtFixture.hpp"
+
+#include <array>
 #include <string>
 #include <iostream>
 
@@ -13,15 +15,30 @@
 
 struct Convolution2dFixture : public armnnUtils::ParserPrototxtFixture<armnnTfParser::ITfParser>
 {
-    explicit Convolution2dFixture(const char* paddingType)
-    : Convolution2dFixture(paddingType, 1)
+    explicit Convolution2dFixture(const std::string& dataLayout, const std::string& paddingType)
+    : Convolution2dFixture(dataLayout, paddingType, 1)
     {}
 
     // Dilation: 0 - dilations attribute is not included;
     // Dilation: >0 - dilations attribute set to [1,v,v,1], where v is the value of the dilation arg
-    explicit Convolution2dFixture(const char* paddingType, int stride, int dilation = 0)
+    explicit Convolution2dFixture(const std::string& dataLayout, const std::string& paddingType,
+                                  int stride, int dilation = 0)
     {
-        std::string strideString = std::to_string(stride);
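+        // TF "strides" is a 4-element list ordered to match the data layout;
+        // this fixture only strides along the width dimension.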
+        std::string strideString("        i: 1 \n"
+                                 "        i: 1 \n");
+        if (dataLayout == "NHWC")
+        {
+            strideString.append("        i: " + std::to_string(stride) + " \n"
+                                "        i: 1 \n");
+        }
+        else // dataLayout == "NCHW"
+        {
+            strideString.append("        i: 1 \n"
+                                "        i: " + std::to_string(stride) + " \n");
+        }
+
         std::string dilationString = std::to_string(dilation);
         m_Prototext = "node { \n"
             "    name: \"graphInput\" \n"
@@ -87,13 +102,15 @@
             "  attr { \n"
             "    key: \"data_format\" \n"
             "    value { \n"
-            "      s: \"NHWC\" \n"
-            "    } \n"
-            "  } \n"
-            "  attr { \n"
-            "    key: \"padding\" \n"
-            "    value { \n"
             "      s: \"";
+        m_Prototext.append(dataLayout);
+        m_Prototext.append("\"\n"
+                           "    } \n"
+                           "  } \n"
+                           "  attr { \n"
+                           "    key: \"padding\" \n"
+                           "    value { \n"
+                           "      s: \"");
         m_Prototext.append(paddingType);
         m_Prototext.append("\"\n"
                            "    } \n"
@@ -101,14 +118,10 @@
                            "  attr { \n"
                            "    key: \"strides\" \n"
                            "    value { \n"
-                           "      list { \n"
-                           "        i: 1 \n"
-                           "        i: 1 \n"
-                           "        i: ");
+                           "      list { \n");
         m_Prototext.append(strideString);
-        m_Prototext.append(" \n"
-                           "        i: 1 \n"
-                           "      } \n"
+
+        m_Prototext.append("      } \n"
                            "    } \n"
                            "  } \n");
 
@@ -139,67 +152,118 @@
                            "} \n");
 
         // Manual height computation based on stride parameter.
-        BOOST_ASSERT_MSG(stride == 1 || stride==2, "Add support for strides other than 1 or 2.");
-        unsigned int dims[] = {1,2,3,1};
-        if (stride == 2)
+        BOOST_ASSERT_MSG(stride == 1 || stride == 2, "Add support for strides other than 1 or 2.");
+        std::array<unsigned int, 4> dims;
+        if (dataLayout == "NHWC")
         {
-            dims[1]=3;
+            dims = { 1u, (stride == 2 ? 3u : 2u), 3u, 1u };
+        }
+        else // dataLayout == "NCHW"
+        {
+            dims = { 1u, 1u, (stride == 2 ? 3u : 2u), 3u };
         }
 
-        SetupSingleInputSingleOutput(armnn::TensorShape(4, dims), "graphInput", "potato");
+        SetupSingleInputSingleOutput(armnn::TensorShape(4, dims.data()), "graphInput", "potato");
     }
 };
 
 
-struct Convolution2dSameFixture : Convolution2dFixture
+struct Convolution2dNhwcSameFixture : Convolution2dFixture
 {
-    Convolution2dSameFixture() : Convolution2dFixture("SAME", 1){}
+    Convolution2dNhwcSameFixture() : Convolution2dFixture("NHWC", "SAME", 1){}
 };
-BOOST_FIXTURE_TEST_CASE(ParseConv2DSame, Convolution2dSameFixture)
+BOOST_FIXTURE_TEST_CASE(ParseConv2dNhwcSame, Convolution2dNhwcSameFixture)
 {
     RunTest<4>({1, 2, 3, 4, 5, 6}, {2, 4, 4, 6.5f, 10 , 8.5f});
 }
 
-struct Convolution2dValidFixture : Convolution2dFixture
+struct Convolution2dNchwSameFixture : Convolution2dFixture
 {
-    Convolution2dValidFixture() : Convolution2dFixture("VALID", 1){}
+    Convolution2dNchwSameFixture() : Convolution2dFixture("NCHW", "SAME", 1){}
 };
-BOOST_FIXTURE_TEST_CASE(ParseConv2DValid, Convolution2dValidFixture)
+BOOST_FIXTURE_TEST_CASE(ParseConv2dNchwSame, Convolution2dNchwSameFixture)
+{
+    RunTest<4>({1, 2, 3, 4, 5, 6}, {2, 4, 4, 6.5f, 10, 8.5f});
+}
+
+
+struct Convolution2dNhwcValidFixture : Convolution2dFixture
+{
+    Convolution2dNhwcValidFixture() : Convolution2dFixture("NHWC", "VALID", 1){}
+};
+BOOST_FIXTURE_TEST_CASE(ParseConv2dNhwcValid, Convolution2dNhwcValidFixture)
+{
+    RunTest<4>({1, 2, 3, 4, 5, 6}, {4, 10});
+}
+
+struct Convolution2dNchwValidFixture : Convolution2dFixture
+{
+    Convolution2dNchwValidFixture() : Convolution2dFixture("NCHW", "VALID", 1){}
+};
+BOOST_FIXTURE_TEST_CASE(ParseConv2dNchwValid, Convolution2dNchwValidFixture)
 {
     RunTest<4>({1, 2, 3, 4, 5, 6}, {4, 10});
 }
 
 
-struct Convolution2dStride2SameFixture : Convolution2dFixture
+struct Convolution2dStride2NhwcSameFixture : Convolution2dFixture
 {
-    Convolution2dStride2SameFixture() : Convolution2dFixture("SAME", 2){}
+    Convolution2dStride2NhwcSameFixture() : Convolution2dFixture("NHWC", "SAME", 2){}
 };
-BOOST_FIXTURE_TEST_CASE(ParseConv2DStride2Same, Convolution2dStride2SameFixture)
+BOOST_FIXTURE_TEST_CASE(ParseConv2dStride2NhwcSame, Convolution2dStride2NhwcSameFixture)
+{
+    RunTest<4>({1, 2, 3, 4, 5, 6, 7, 8, 9}, {2, 4, 6.5, 8.5, 11, 13});
+}
+
+struct Convolution2dStride2NchwSameFixture : Convolution2dFixture
+{
+    Convolution2dStride2NchwSameFixture() : Convolution2dFixture("NCHW", "SAME", 2){}
+};
+BOOST_FIXTURE_TEST_CASE(ParseConv2dStride2NchwSame, Convolution2dStride2NchwSameFixture)
 {
     RunTest<4>({1, 2, 3, 4, 5, 6, 7, 8, 9}, {2, 4, 6.5, 8.5, 11, 13});
 }
 
 
-struct Convolution2dStride2ValidFixture : Convolution2dFixture
+struct Convolution2dStride2NhwcValidFixture : Convolution2dFixture
 {
-    Convolution2dStride2ValidFixture() : Convolution2dFixture("VALID", 2){}
+    Convolution2dStride2NhwcValidFixture() : Convolution2dFixture("NHWC", "VALID", 2){}
 };
-BOOST_FIXTURE_TEST_CASE(ParseConv2DStride2Valid, Convolution2dStride2ValidFixture)
+BOOST_FIXTURE_TEST_CASE(ParseConv2dStride2NhwcValid, Convolution2dStride2NhwcValidFixture)
+{
+    RunTest<4>({1, 2, 3, 4, 5, 6, 7, 8, 9}, {4, 10, 16});
+}
+
+struct Convolution2dStride2NchwValidFixture : Convolution2dFixture
+{
+    Convolution2dStride2NchwValidFixture() : Convolution2dFixture("NCHW", "VALID", 2){}
+};
+BOOST_FIXTURE_TEST_CASE(ParseConv2dStride2NchwValid, Convolution2dStride2NchwValidFixture)
 {
     RunTest<4>({1, 2, 3, 4, 5, 6, 7, 8, 9}, {4, 10, 16});
 }
 
 
-struct Convolution2dDilation1Fixture : Convolution2dFixture
+struct Convolution2dDilation1NhwcFixture : Convolution2dFixture
 {
-    Convolution2dDilation1Fixture() : Convolution2dFixture("SAME", 1, 1){}
+    Convolution2dDilation1NhwcFixture() : Convolution2dFixture("NHWC", "SAME", 1, 1){}
 };
-BOOST_FIXTURE_TEST_CASE(ParseConv2DDilation1, Convolution2dDilation1Fixture)
+BOOST_FIXTURE_TEST_CASE(ParseConv2dDilation1Nhwc, Convolution2dDilation1NhwcFixture)
 {
     RunTest<4>({1, 2, 3, 4, 5, 6}, {2, 4, 4, 6.5f, 10 , 8.5f});
 }
 
-BOOST_AUTO_TEST_CASE(ParseConv2DDilation2)
+struct Convolution2dDilation1NchwFixture : Convolution2dFixture
+{
+    Convolution2dDilation1NchwFixture() : Convolution2dFixture("NCHW", "SAME", 1, 1){}
+};
+BOOST_FIXTURE_TEST_CASE(ParseConv2dDilation1Nchw, Convolution2dDilation1NchwFixture)
+{
+    RunTest<4>({1, 2, 3, 4, 5, 6}, {2, 4, 4, 6.5f, 10, 8.5f});
+}
+
+
+BOOST_AUTO_TEST_CASE(ParseConv2dDilation2)
 {
     const char* prototext = ""
         "node {\n"
@@ -309,8 +373,7 @@
     armnn::TensorShape tensorShape = { 1, 3, 3, 1 };
     inputShapes["graphInput"] = tensorShape;
     armnnTfParser::ITfParserPtr parser = armnnTfParser::ITfParser::Create();
-    BOOST_CHECK_THROW(parser->CreateNetworkFromString(prototext, inputShapes, { "potato" }),
-                          armnn::ParseException);
+    BOOST_CHECK_THROW(parser->CreateNetworkFromString(prototext, inputShapes, { "potato" }), armnn::ParseException);
 }
 
 
diff --git a/src/backends/reference/workloads/ConvImpl.hpp b/src/backends/reference/workloads/ConvImpl.hpp
index b8e2dea..704bc36 100644
--- a/src/backends/reference/workloads/ConvImpl.hpp
+++ b/src/backends/reference/workloads/ConvImpl.hpp
@@ -15,6 +15,8 @@
 #include <boost/assert.hpp>
 #include <boost/numeric/conversion/cast.hpp>
 
+#include <DataLayoutIndexed.hpp>
+
 #include <cmath>
 #include <limits>
 
@@ -74,6 +76,7 @@
                                             data.m_Parameters.m_DataLayout);
 
     const armnnUtils::DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);
+
     const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
@@ -91,10 +94,10 @@
     unsigned int heightFilter = filterInfo.GetShape()[heightIndex];
     unsigned int widthFilter  = filterInfo.GetShape()[widthIndex];
 
-    unsigned int paddingTop = data.m_Parameters.m_PadTop;
+    unsigned int paddingTop  = data.m_Parameters.m_PadTop;
     unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
-    unsigned int hStride  = data.m_Parameters.m_StrideY;
-    unsigned int xStride  = data.m_Parameters.m_StrideX;
+    unsigned int xStride     = data.m_Parameters.m_StrideX;
+    unsigned int yStride     = data.m_Parameters.m_StrideY;
 
     // The world's least efficient convolution.
     for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
@@ -168,7 +171,7 @@
                                 AccumulatorType filterValue = filterData[filterIndex] -
                                     boost::numeric_cast<AccumulatorType>(filterOffset);
 
-                                unsigned int yInput = yOutput * hStride + yFilter;
+                                unsigned int yInput = yOutput * yStride + yFilter;
                                 unsigned int xInput = xOutput * xStride + xFilter;
 
                                 AccumulatorType inputValue;