IVGCVSW-5826 Change weights layout for depthwise to [1,H,W,I*M]

 * This change is necessary because TfLite uses the [1,H,W,I*M] layout
   and applies per-axis quantization along the I*M dimension. Our
   previous layout [M,I,H,W] cannot represent the corresponding
   quantization scales.
 * Updates the OnnxParser, TfLiteParser and TfLite delegate
 * Updates the CpuRef, CpuAcc and GpuAcc backends
 * Adjusts unit tests
 * Adds test to ensure models with old layout can still be read and
   executed
 * Adds a conversion function from the new layout [1,H,W,I*M] back to
   the previous layout [M,I,H,W], which backend developers can use
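
   As an illustration, the conversion is a plain index remap. A minimal
   sketch, assuming the TfLite channel ordering o = i*M + m (the name
   and signature of the helper actually added by this patch may differ):

       #include <vector>

       // Convert depthwise weights [1,H,W,I*M] --> [M,I,H,W].
       template <typename T>
       std::vector<T> ConvertToOldLayout(const T* weights, unsigned int H,
                                         unsigned int W, unsigned int I,
                                         unsigned int M)
       {
           std::vector<T> out(M * I * H * W);
           for (unsigned int h = 0; h < H; ++h)
           for (unsigned int w = 0; w < W; ++w)
           for (unsigned int i = 0; i < I; ++i)
           for (unsigned int m = 0; m < M; ++m)
           {
               // Source index in [1,H,W,I*M]; the channel is i*M + m.
               unsigned int src = (h * W + w) * (I * M) + i * M + m;
               // Destination index in [M,I,H,W].
               unsigned int dst = ((m * I + i) * H + h) * W + w;
               out[dst] = weights[src];
           }
           return out;
       }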

!android-nn-driver:5553

Signed-off-by: Jan Eilers <jan.eilers@arm.com>
Change-Id: Ifef23368b8c3702cf315a5838d214f7dc13c0152
diff --git a/src/backends/reference/test/CMakeLists.txt b/src/backends/reference/test/CMakeLists.txt
index 76541cf..d7c5da8 100644
--- a/src/backends/reference/test/CMakeLists.txt
+++ b/src/backends/reference/test/CMakeLists.txt
@@ -13,6 +13,8 @@
     RefLayerTests.cpp
     RefMemoryManagerTests.cpp
     RefOptimizedNetworkTests.cpp
+    RefPerAxisIteratorTests.cpp
+    RefPerChannelDecoderTests.cpp
     RefRuntimeTests.cpp
     RefTensorHandleTests.cpp
     RefWorkloadFactoryHelper.hpp
diff --git a/src/backends/reference/test/RefPerAxisIteratorTests.cpp b/src/backends/reference/test/RefPerAxisIteratorTests.cpp
new file mode 100644
index 0000000..7da4c0f
--- /dev/null
+++ b/src/backends/reference/test/RefPerAxisIteratorTests.cpp
@@ -0,0 +1,252 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <reference/workloads/Decoders.hpp>
+#include <armnn/utility/NumericCast.hpp>
+
+#include <fmt/format.h>
+
+#include <boost/test/unit_test.hpp>
+
+
+template<typename T>
+void CompareVector(const std::vector<T>& vec1, const std::vector<T>& vec2)
+{
+    BOOST_TEST(vec1.size() == vec2.size());
+
+    bool mismatch = false;
+    for (size_t i = 0; i < vec1.size(); ++i)
+    {
+        if (vec1[i] != vec2[i])
+        {
+            BOOST_TEST_MESSAGE(fmt::format("Vector value mismatch: index={}  {} != {}",
+                                           i,
+                                           vec1[i],
+                                           vec2[i]));
+            mismatch = true;
+        }
+    }
+
+    if (mismatch)
+    {
+        BOOST_FAIL("Error in CompareVector. Vectors don't match.");
+    }
+}
+
+using namespace armnn;
+
+// Basically a per axis decoder but without any decoding/quantization
+class MockPerAxisIterator : public PerAxisIterator<const int8_t, Decoder<int8_t>>
+{
+public:
+    MockPerAxisIterator(const int8_t* data, const armnn::TensorShape& tensorShape, const unsigned int axis)
+            : PerAxisIterator(data, tensorShape, axis), m_NumElements(tensorShape.GetNumElements())
+    {}
+
+    int8_t Get() const override
+    {
+        return *m_Iterator;
+    }
+
+    virtual std::vector<float> DecodeTensor(const TensorShape &tensorShape,
+                                            bool isDepthwise = false) override
+    {
+        IgnoreUnused(tensorShape, isDepthwise);
+        return std::vector<float>{};
+    }
+
+    // Iterates over data using operator[] and returns vector
+    std::vector<int8_t> Loop()
+    {
+        std::vector<int8_t> vec;
+        for (uint32_t i = 0; i < m_NumElements; ++i)
+        {
+            this->operator[](i);
+            vec.emplace_back(Get());
+        }
+        return vec;
+    }
+
+    unsigned int GetAxisIndex()
+    {
+        return m_AxisIndex;
+    }
+    unsigned int m_NumElements;
+};
+
+BOOST_AUTO_TEST_SUITE(RefPerAxisIterator)
+
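+// The iterator computes the axis index from a flat element index as
+//   axisIndex = (index / axisFactor) % tensorShape[axis]
+// where axisFactor is the number of elements after 'axis'
+// (armnnUtils::GetNumElementsAfter). E.g. for shape {3,1,2,2} and axis=0,
+// axisFactor is 1*2*2 = 4, so index 5 maps to axis index (5/4) % 3 = 1.
+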
+// Test Loop (Equivalent to DecodeTensor) and Axis = 0
+BOOST_AUTO_TEST_CASE(PerAxisIteratorTest1)
+{
+    std::vector<int8_t> input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    TensorInfo tensorInfo ({3,1,2,2},DataType::QSymmS8);
+
+    // test axis=0
+    std::vector<int8_t> expOutput = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 0);
+    std::vector<int8_t> output = iterator.Loop();
+    CompareVector(output, expOutput);
+
+    // Set iterator to index and check if the axis index is correct
+    iterator[5];
+    BOOST_TEST(iterator.GetAxisIndex() == 1u);
+
+    iterator[1];
+    BOOST_TEST(iterator.GetAxisIndex() == 0u);
+
+    iterator[10];
+    BOOST_TEST(iterator.GetAxisIndex() == 2u);
+}
+
+// Test Axis = 1
+BOOST_AUTO_TEST_CASE(PerAxisIteratorTest2)
+{
+    std::vector<int8_t> input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    TensorInfo tensorInfo ({3,1,2,2},DataType::QSymmS8);
+
+    // test axis=1
+    std::vector<int8_t> expOutput = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 1);
+    std::vector<int8_t> output = iterator.Loop();
+    CompareVector(output, expOutput);
+
+    // Set iterator to index and check if the axis index is correct
+    iterator[5];
+    BOOST_TEST(iterator.GetAxisIndex() == 0u);
+
+    iterator[1];
+    BOOST_TEST(iterator.GetAxisIndex() == 0u);
+
+    iterator[10];
+    BOOST_TEST(iterator.GetAxisIndex() == 0u);
+}
+
+// Test Axis = 2
+BOOST_AUTO_TEST_CASE(PerAxisIteratorTest3)
+{
+    std::vector<int8_t> input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    TensorInfo tensorInfo ({3,1,2,2},DataType::QSymmS8);
+
+    // test axis=2
+    std::vector<int8_t> expOutput = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 2);
+    std::vector<int8_t> output = iterator.Loop();
+    CompareVector(output, expOutput);
+
+    // Set iterator to index and check if the axis index is correct
+    iterator[5];
+    BOOST_TEST(iterator.GetAxisIndex() == 0u);
+
+    iterator[1];
+    BOOST_TEST(iterator.GetAxisIndex() == 0u);
+
+    iterator[10];
+    BOOST_TEST(iterator.GetAxisIndex() == 1u);
+}
+
+// Test Axis = 3
+BOOST_AUTO_TEST_CASE(PerAxisIteratorTest4)
+{
+    std::vector<int8_t> input = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    TensorInfo tensorInfo ({3,1,2,2},DataType::QSymmS8);
+
+    // test axis=3
+    std::vector<int8_t> expOutput = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+    auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 3);
+    std::vector<int8_t> output = iterator.Loop();
+    CompareVector(output, expOutput);
+
+    // Set iterator to index and check if the axis index is correct
+    iterator[5];
+    BOOST_TEST(iterator.GetAxisIndex() == 1u);
+
+    iterator[1];
+    BOOST_TEST(iterator.GetAxisIndex() == 1u);
+
+    iterator[10];
+    BOOST_TEST(iterator.GetAxisIndex() == 0u);
+}
+
+
+// Test Axis = 1. Different tensor shape
+BOOST_AUTO_TEST_CASE(PerAxisIteratorTest5)
+{
+    using namespace armnn;
+    std::vector<int8_t> input =
+    {
+         0,  1,  2,  3,
+         4,  5,  6,  7,
+         8,  9, 10, 11,
+        12, 13, 14, 15
+    };
+
+    std::vector<int8_t> expOutput =
+    {
+         0,  1,  2,  3,
+         4,  5,  6,  7,
+         8,  9, 10, 11,
+        12, 13, 14, 15
+    };
+
+    TensorInfo tensorInfo ({2,2,2,2},DataType::QSymmS8);
+    auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 1);
+    std::vector<int8_t> output = iterator.Loop();
+    CompareVector(output, expOutput);
+
+    // Set iterator to index and check if the axis index is correct
+    iterator[5];
+    BOOST_TEST(iterator.GetAxisIndex() == 1u);
+
+    iterator[1];
+    BOOST_TEST(iterator.GetAxisIndex() == 0u);
+
+    iterator[10];
+    BOOST_TEST(iterator.GetAxisIndex() == 0u);
+}
+
+// Test the increment and decrement operators
+BOOST_AUTO_TEST_CASE(PerAxisIteratorTest7)
+{
+    using namespace armnn;
+    std::vector<int8_t> input =
+    {
+        0, 1,  2,  3,
+        4, 5,  6,  7,
+        8, 9, 10, 11
+    };
+
+    std::vector<int8_t> expOutput =
+    {
+        0, 1,  2,  3,
+        4, 5,  6,  7,
+        8, 9, 10, 11
+    };
+
+    TensorInfo tensorInfo ({3,1,2,2},DataType::QSymmS8);
+    auto iterator = MockPerAxisIterator(input.data(), tensorInfo.GetShape(), 2);
+
+    iterator += 3;
+    BOOST_TEST(iterator.Get() == expOutput[3]);
+    BOOST_TEST(iterator.GetAxisIndex() == 1u);
+
+    iterator += 3;
+    BOOST_TEST(iterator.Get() == expOutput[6]);
+    BOOST_TEST(iterator.GetAxisIndex() == 1u);
+
+    iterator -= 2;
+    BOOST_TEST(iterator.Get() == expOutput[4]);
+    BOOST_TEST(iterator.GetAxisIndex() == 0u);
+
+    iterator -= 1;
+    BOOST_TEST(iterator.Get() == expOutput[3]);
+    BOOST_TEST(iterator.GetAxisIndex() == 1u);
+}
+
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/test/RefPerChannelDecoderTests.cpp b/src/backends/reference/test/RefPerChannelDecoderTests.cpp
new file mode 100644
index 0000000..c2e3cee
--- /dev/null
+++ b/src/backends/reference/test/RefPerChannelDecoderTests.cpp
@@ -0,0 +1,156 @@
+//
+// Copyright © 2021 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include <reference/workloads/Decoders.hpp>
+#include <armnn/utility/NumericCast.hpp>
+
+#include <fmt/format.h>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(RefPerChannelDecoder)
+
+template<typename T>
+void CompareVector(const std::vector<T>& vec1, const std::vector<T>& vec2)
+{
+    BOOST_TEST(vec1.size() == vec2.size());
+
+    bool mismatch = false;
+    for (size_t i = 0; i < vec1.size(); ++i)
+    {
+        if (vec1[i] != vec2[i])
+        {
+            BOOST_TEST_MESSAGE(fmt::format("Vector value mismatch: index={}  {} != {}",
+                                           i,
+                                           vec1[i],
+                                           vec2[i]));
+            mismatch = true;
+        }
+    }
+
+    if (mismatch)
+    {
+        BOOST_FAIL("Error in CompareVector. Vectors don't match.");
+    }
+}
+
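+// Per-channel dequantization computes output[i] = input[i] * scales[axisIndex(i)],
+// where axisIndex(i) is the element's position along the quantization dimension.
+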
+// Ensure quantization works for non-depthwise convolutions
+BOOST_AUTO_TEST_CASE(RefPerChannelDecoderTest1)
+{
+    using namespace armnn;
+    std::vector<int8_t> input =
+    {
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23
+    };
+
+    std::vector<float> expOutput =
+    {
+        0.0f,   1.0f,  2.0f,  3.0f,  4.0f,  5.0f,  6.0f,  7.0f,  8.0f,  9.0f, 10.0f, 11.0f,
+        24.0f, 26.0f, 28.0f, 30.0f, 32.0f, 34.0f, 36.0f, 38.0f, 40.0f, 42.0f, 44.0f, 46.0f
+    };
+
+    TensorInfo tensorInfo ({2,2,2,3},DataType::QSymmS8,{1.0f, 2.0f},0);
+    auto decoder = MakeDecoder<float>(tensorInfo, input.data());
+
+    std::vector<float> output = decoder->DecodeTensor(tensorInfo.GetShape());
+
+    CompareVector(output, expOutput);
+}
+
+// Ensure quantization works for depthwise convolutions M=1
+BOOST_AUTO_TEST_CASE(RefPerChannelDecoderTest2)
+{
+    using namespace armnn;
+    std::vector<int8_t> input =
+    {
+        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+    };
+
+    std::vector<float> expOutput =
+    {
+         0.0f,  1.0f,  2.0f,  3.0f,
+         8.0f, 10.0f, 12.0f, 14.0f,
+        24.0f, 27.0f, 30.0f, 33.0f,
+        48.0f, 52.0f, 56.0f, 60.0f
+    };
+
+    // [O,1,H,W] = [I*M,1,H,W] = [4*1,1,2,2]
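+    // Quantization dim is 0, so each output channel (a contiguous block of
+    // H*W = 4 elements) is dequantized with its own scale.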
+    TensorInfo tensorInfo ({4,1,2,2},DataType::QSymmS8,{1.0f, 2.0f, 3.0f, 4.0f},0);
+    auto decoder = MakeDecoder<float>(tensorInfo, input.data());
+
+    std::vector<float> output = decoder->DecodeTensor(tensorInfo.GetShape(), true);
+
+    CompareVector(output, expOutput);
+}
+
+// Ensure quantization works for depthwise convolutions M=2
+BOOST_AUTO_TEST_CASE(RefPerChannelDecoderTest3)
+{
+    using namespace armnn;
+    std::vector<int8_t> input =
+    {
+        0, 1, 2, 3,
+        4, 5, 6, 7,
+        8, 9, 10, 11,
+        12, 13, 14, 15,
+        16, 17, 18, 19,
+        20, 21, 22, 23
+    };
+
+    std::vector<float> expOutput =
+    {
+         0.0f,  1.0f,  2.0f,  3.0f,
+         8.0f, 10.0f, 12.0f, 14.0f,
+        24.0f, 27.0f, 30.0f, 33.0f,
+        48.0f, 52.0f, 56.0f, 60.0f,
+        80.0f, 85.0f, 90.0f, 95.0f,
+        120.0f, 126.0f, 132.0f, 138.0f
+    };
+
+    // [O,1,H,W] = [I*M,1,H,W] = [3*2,1,2,2]
+    TensorInfo tensorInfo ({6,1,2,2},DataType::QSymmS8,{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},0);
+    auto decoder = MakeDecoder<float>(tensorInfo, input.data());
+
+    std::vector<float> output = decoder->DecodeTensor(tensorInfo.GetShape(), true);
+
+    CompareVector(output, expOutput);
+}
+
+// Ensure quantization works for depthwise convolutions M=2 for int32
+BOOST_AUTO_TEST_CASE(RefPerChannelDecoderTest4)
+{
+    using namespace armnn;
+    std::vector<int32_t> input =
+    {
+        0, 1, 2, 3,
+        4, 5, 6, 7,
+        8, 9, 10, 11,
+        12, 13, 14, 15,
+        16, 17, 18, 19,
+        20, 21, 22, 23
+    };
+
+    std::vector<float> expOutput =
+    {
+         0.0f,  1.0f,  2.0f,  3.0f,
+         8.0f, 10.0f, 12.0f, 14.0f,
+        24.0f, 27.0f, 30.0f, 33.0f,
+        48.0f, 52.0f, 56.0f, 60.0f,
+        80.0f, 85.0f, 90.0f, 95.0f,
+        120.0f, 126.0f, 132.0f, 138.0f
+    };
+
+    // [O,1,H,W] = [I*M,1,H,W] = [3*2,1,2,2]
+    TensorInfo tensorInfo ({6,1,2,2},DataType::Signed32,{1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f},0);
+    auto decoder = MakeDecoder<float>(tensorInfo, input.data());
+
+    std::vector<float> output = decoder->DecodeTensor(tensorInfo.GetShape(), true);
+
+    CompareVector(output, expOutput);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/workloads/BaseIterator.hpp b/src/backends/reference/workloads/BaseIterator.hpp
index 73e2469..483ef72 100644
--- a/src/backends/reference/workloads/BaseIterator.hpp
+++ b/src/backends/reference/workloads/BaseIterator.hpp
@@ -8,7 +8,9 @@
 #include <armnn/TypesUtils.hpp>
 #include <armnn/utility/Assert.hpp>
 #include <armnn/utility/IgnoreUnused.hpp>
+#include <armnn/utility/NumericCast.hpp>
 #include <armnnUtils/FloatingPointConverter.hpp>
+#include <armnnUtils/TensorUtils.hpp>
 
 #include <ResolveType.hpp>
 
@@ -22,8 +24,6 @@
 
     virtual ~BaseIterator() {}
 
-    virtual BaseIterator& SetIndex(unsigned int index, unsigned int axisIndex = 0) = 0;
-
     virtual BaseIterator& operator++() = 0;
 
     virtual BaseIterator& operator+=(const unsigned int increment) = 0;
@@ -47,7 +47,6 @@
 
     virtual std::vector<float>
     DecodeTensor(const TensorShape &tensorShape,
-                 const unsigned int channelMultiplier = 1,
                  bool isDepthwise = false) = 0;
 };
 
@@ -108,14 +107,6 @@
         return *this;
     }
 
-    TypedIterator& SetIndex(unsigned int index, unsigned int axisIndex = 0) override
-    {
-        IgnoreUnused(axisIndex);
-        ARMNN_ASSERT(m_Iterator);
-        m_Iterator = m_Start + index;
-        return *this;
-    }
-
 protected:
     T* m_Iterator;
     T* m_Start;
@@ -135,10 +126,9 @@
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -173,10 +163,9 @@
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -211,10 +200,9 @@
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -249,10 +237,9 @@
         return armnn::Dequantize(*m_Iterator, m_Scale, m_Offset);
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -289,10 +276,9 @@
         return val;
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -328,10 +314,9 @@
         return val;
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -365,10 +350,9 @@
         return *m_Iterator;
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
 
@@ -393,10 +377,9 @@
         return static_cast<float>(*m_Iterator) * m_Scale;
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -430,10 +413,9 @@
         return static_cast<float>(*m_Iterator);
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -463,10 +445,9 @@
         return *m_Iterator;
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -496,10 +477,9 @@
         return *m_Iterator;
     }
     std::vector<float> DecodeTensor (const TensorShape& tensorShape,
-                                     const unsigned int channelMultiplier,
                                      const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -530,10 +510,9 @@
     }
 
     std::vector<float> DecodeTensor(const TensorShape& tensorShape,
-                                    const unsigned int channelMultiplier,
                                     const bool isDepthwise) override
     {
-        IgnoreUnused(channelMultiplier, isDepthwise);
+        IgnoreUnused(isDepthwise);
 
         const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
@@ -769,23 +748,33 @@
     }
 };
 
-// PerAxisIterator for per-axis quantization
+/// PerAxisIterator for per-axis quantization. Iterates over a tensor as laid out in memory and keeps track
+/// of the axis index.
 template<typename T, typename Base>
 class PerAxisIterator : public Base
 {
 public:
-    // axisFactor is used to calculate channelStep
-    PerAxisIterator(T* data = nullptr, unsigned int axisFactor = 0)
-        : m_Iterator(data), m_Start(data), m_AxisIndex(0), m_AxisFactor(axisFactor)
+    PerAxisIterator(T* data = nullptr,
+                    unsigned int axisFactor = 0,
+                    unsigned int axisDimensionality = 0)
+        : m_Iterator(data),
+          m_Start(data),
+          m_AxisIndex(0), // iterates over the dimension of axis
+          m_AxisDimensionality(axisDimensionality), // tensorShape[quantization_dim]
+          m_AxisFactor(axisFactor),
+          m_Index(0)
     {}
 
-    // This should be called to set index for per-axis Encoder/Decoder
-    PerAxisIterator& SetIndex(unsigned int index, unsigned int axisIndex) override
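+    /// Constructs the iterator from the tensor shape and the quantization axis;
+    /// axisFactor (the number of elements after the axis) and axisDimensionality
+    /// (tensorShape[axis]) are derived from them.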
+    PerAxisIterator(T* data,
+                    const armnn::TensorShape& tensorShape,
+                    const unsigned int axis)
+        : m_Iterator(data),
+          m_Start(data),
+          m_AxisIndex(0),
+          m_Index(0)
     {
-         ARMNN_ASSERT(m_Iterator);
-         m_Iterator = m_Start + index;
-         m_AxisIndex = axisIndex;
-         return *this;
+        m_AxisDimensionality = tensorShape[axis];
+        m_AxisFactor = armnnUtils::GetNumElementsAfter(tensorShape, axis);
     }
 
     void Reset(void* data) override
@@ -793,37 +782,50 @@
         m_Iterator = reinterpret_cast<T*>(data);
         m_Start = m_Iterator;
         m_AxisIndex = 0;
+        m_Index = 0;
     }
 
     PerAxisIterator& operator++() override
     {
-        ARMNN_ASSERT(m_Iterator);
-        ++m_Iterator;
-        m_AxisIndex = static_cast<unsigned int>(*m_Iterator) % m_AxisFactor;
+        ++m_Index;
+        this->operator[](m_Index);
         return *this;
     }
 
     PerAxisIterator& operator+=(const unsigned int increment) override
     {
-        ARMNN_ASSERT(m_Iterator);
-        m_Iterator += increment;
-        m_AxisIndex = static_cast<unsigned int>(*m_Iterator) % m_AxisFactor;
+        m_Index += increment;
+        this->operator[](m_Index);
         return *this;
     }
 
     PerAxisIterator& operator-=(const unsigned int decrement) override
     {
+        m_Index -= decrement;
+        this->operator[](m_Index);
+        return *this;
+    }
+
+
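+    /// Sets the iterator to the given flat index and updates the axis index.
+    /// E.g. for depthwise weights [1,H,W,I*M] quantized along dim 3, m_AxisFactor
+    /// is 1 and m_AxisIndex is simply index % (I*M), i.e. the output channel.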
+    inline PerAxisIterator& SetIndexOnMem(const unsigned int index)
+    {
         ARMNN_ASSERT(m_Iterator);
-        m_Iterator -= decrement;
-        m_AxisIndex = static_cast<unsigned int>(*m_Iterator) % m_AxisFactor;
+        m_Iterator = m_Start + index;
+        if (index < m_AxisFactor)
+        {
+            m_AxisIndex = 0;
+        }
+        else
+        {
+            m_AxisIndex = (index / m_AxisFactor) % m_AxisDimensionality;
+        }
+        m_Index = index;
         return *this;
     }
 
     PerAxisIterator& operator[](const unsigned int index) override
     {
-        ARMNN_ASSERT(m_Iterator);
-        m_Iterator = m_Start + index;
-        m_AxisIndex = static_cast<unsigned int>(*m_Iterator) % m_AxisFactor;
+        SetIndexOnMem(index);
         return *this;
     }
 
@@ -831,18 +833,22 @@
         T* m_Iterator;
         T* m_Start;
         unsigned int m_AxisIndex;
+        unsigned int m_AxisDimensionality; // tensorShape[quantization_dim]
         unsigned int m_AxisFactor;
+        unsigned int m_Index;
 };
 
 class QSymm8PerAxisDecoder : public PerAxisIterator<const int8_t, Decoder<float>>
 {
 public:
-    QSymm8PerAxisDecoder(const int8_t* data, const std::vector<float>& scale, unsigned int axisFactor)
-        : PerAxisIterator(data, axisFactor), m_Scales(scale) {}
+    QSymm8PerAxisDecoder(const int8_t* data, const armnn::TensorInfo& tensorInfo)
+            : PerAxisIterator(data, tensorInfo.GetShape(), tensorInfo.GetQuantizationDim().value()),
+              m_Scales(tensorInfo.GetQuantizationScales())
+    {}
 
     float Get() const override
     {
-        return armnn::Dequantize(*m_Iterator, m_Scales[m_AxisIndex], 0);
+        return armnn::Dequantize(*m_Iterator, GetScale(), 0);
     }
 
     // Get scale of the current value
@@ -852,37 +858,18 @@
     }
 
     std::vector<float> DecodeTensor(const TensorShape &tensorShape,
-                                    const unsigned int channelMultiplier,
                                     bool isDepthwise) override
     {
-        const uint32_t size = tensorShape.GetNumElements();
-        const uint32_t scaleSize = static_cast<uint32_t>(m_Scales.size());
+        IgnoreUnused(isDepthwise);
 
-        const uint32_t stepSize = isDepthwise ?
-                                  tensorShape[2] * tensorShape[3] : tensorShape.GetNumElements() / tensorShape[0];
-
-        const uint32_t stepNum = size / (stepSize * channelMultiplier);
-        uint32_t scale;
-
+        const unsigned int size = tensorShape.GetNumElements();
         std::vector<float> decodedTensor;
         decodedTensor.reserve(size);
 
-        // channelMultiplier is only used in depthwise convolutions and in other cases will have no effect
-        // stepSize is the length of a contiguous area sharing a quantization scale within a tensor
-        // stepNum is the number of those steps/blocks in the tensor
-        for (uint32_t mult = 0; mult < channelMultiplier; ++mult)
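+        // The per-axis iterator tracks the quantization-axis index internally,
+        // so each element is dequantized with its own per-channel scale.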
+        for (uint32_t i = 0; i < size; ++i)
         {
-            for (uint32_t step = 0; step < stepNum; ++step)
-            {
-                scale = (channelMultiplier * step + mult) % scaleSize;
-                for (uint32_t i = 0; i < stepSize; ++i)
-                {
-                    unsigned int index = mult * stepSize * channelMultiplier +
-                                         step * stepSize + i;
-                    this->operator[](index);
-                    decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scales[scale], 0));
-                }
-            }
+            SetIndexOnMem(i);
+            decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, GetScale(), 0));
         }
         return decodedTensor;
     }
@@ -920,8 +907,10 @@
 class ScaledInt32PerAxisDecoder : public PerAxisIterator<const int32_t, Decoder<float>>
 {
 public:
-    ScaledInt32PerAxisDecoder(const int32_t* data, const std::vector<float>& scales, unsigned int axisFactor)
-        : PerAxisIterator(data, axisFactor), m_Scales(scales) {}
+    ScaledInt32PerAxisDecoder(const int32_t* data, const armnn::TensorInfo& tensorInfo)
+    : PerAxisIterator(data, tensorInfo.GetShape(), tensorInfo.GetQuantizationDim().value()),
+      m_Scales(tensorInfo.GetQuantizationScales())
+    {}
 
     float Get() const override
     {
@@ -935,17 +924,14 @@
     }
 
     std::vector<float> DecodeTensor(const TensorShape &tensorShape,
-                                    const unsigned int channelMultiplier,
                                     bool isDepthwise) override
     {
         const uint32_t size = tensorShape.GetNumElements();
-        const uint32_t scaleSize = static_cast<uint32_t>(m_Scales.size());
 
         const uint32_t stepSize = isDepthwise ?
                                   tensorShape[2] * tensorShape[3] : tensorShape.GetNumElements() / tensorShape[0];
 
-        const uint32_t stepNum = size / (stepSize * channelMultiplier);
-        uint32_t scale;
+        const uint32_t stepNum = size / stepSize;
 
         std::vector<float> decodedTensor;
         decodedTensor.reserve(size);
@@ -953,18 +939,14 @@
-        // channelMultiplier is only used in depthwise convolutions and in other cases will have no effect
         // stepSize is the length of a contiguous area sharing a quantization scale within a tensor
         // stepNum is the number of those steps/blocks in the tensor
-        for (uint32_t mult = 0; mult < channelMultiplier; ++mult)
+        for (uint32_t step = 0; step < stepNum; ++step)
         {
-            for (uint32_t step = 0; step < stepNum; ++step)
+            for (uint32_t i = 0; i < stepSize; ++i)
             {
-                scale = (channelMultiplier * step + mult) % scaleSize;
-                for (uint32_t i = 0; i < stepSize; ++i)
-                {
-                    unsigned int index = mult * stepSize * channelMultiplier +
-                                         step * stepSize + i;
-                    this->operator[](index);
-                    decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scales[scale], 0));
-                }
+                unsigned int index = step * stepSize + i;
+                this->operator[](index);
+                decodedTensor.emplace_back(armnn::Dequantize(*m_Iterator, m_Scales[step], 0));
             }
         }
         return decodedTensor;
diff --git a/src/backends/reference/workloads/ConvImpl.cpp b/src/backends/reference/workloads/ConvImpl.cpp
index d784553..e1bbc6b 100644
--- a/src/backends/reference/workloads/ConvImpl.cpp
+++ b/src/backends/reference/workloads/ConvImpl.cpp
@@ -95,9 +95,12 @@
     const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
     const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
 
-    const unsigned int depthMultiplier = depthwise ? rFilterShape[0] : 1;
-    const unsigned int inputChannels   = depthwise ? rFilterShape[1] : rFilterShape[channelsIndex];
-    const unsigned int outputChannels  = depthwise ? inputChannels * depthMultiplier : rFilterShape[0];
+    // Weights layout:
+    // Conv2d:    [O,H,W,I]
+    // Depthwise: [1,H,W,O]
+    const unsigned int inputChannels   = rInputShape[channelsIndex];
+    const unsigned int outputChannels  = rOutputShape[channelsIndex];
+    const unsigned int depthMultiplier = depthwise ? outputChannels / inputChannels : 1;
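+    // e.g. a depthwise convolution with 3 input channels and depth multiplier 2
+    // has 6 output channels, so the multiplier is recovered as 6 / 3 = 2.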
 
     const unsigned int batchSize    = rOutputShape[0];
     const unsigned int outputHeight = rOutputShape[heightIndex];
@@ -105,16 +108,15 @@
     const unsigned int inputHeight  = rInputShape[heightIndex];
     const unsigned int inputWidth   = rInputShape[widthIndex];
 
-    const unsigned int filterHeight = depthwise ? rFilterShape[2] : rFilterShape[heightIndex];
-    const unsigned int filterWidth  = depthwise ? rFilterShape[3] : rFilterShape[widthIndex];
+    const unsigned int filterHeight = depthwise ? rFilterShape[1] : rFilterShape[heightIndex];
+    const unsigned int filterWidth  = depthwise ? rFilterShape[2] : rFilterShape[widthIndex];
 
     const std::vector<float> inputVec = rInputDecoder.DecodeTensor(rInputShape);
-    const std::vector<float> filterVec = rFilterDecoder.DecodeTensor(rFilterShape, depthMultiplier, depthwise);
+    const std::vector<float> filterVec = rFilterDecoder.DecodeTensor(rFilterShape, depthwise);
 
     const TensorShape biasShape{outputChannels};
     const std::vector<float> biasVec = biasEnabled ? pBiasDecoder->DecodeTensor(biasShape) : std::vector<float>();
 
-    unsigned int depthwiseMultiplierIdx = 0;
     for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
     {
         for (unsigned int cOutput = 0; cOutput < outputChannels; cOutput++)
@@ -130,13 +132,6 @@
                     // For normal, must loop over each input channel.
                     for (unsigned int cInput = 0; cInput < (depthwise ? 1 : inputChannels); cInput++)
                     {
-                        if (depthwise)
-                        {
-                            depthwiseMultiplierIdx = 0;
-                            cInput = cOutput / depthMultiplier;
-                            depthwiseMultiplierIdx = cOutput % depthMultiplier;
-                        }
-
                         for (unsigned int yFilter = 0; yFilter < filterHeight; yFilter++)
                         {
                             for (unsigned int xFilter = 0; xFilter < filterWidth; xFilter++)
@@ -147,10 +142,10 @@
                                 // Since dimensionality of kernel depends on depthwiseness, so does index.
                                 if (depthwise)
                                 {
-                                    filterIndex = depthwiseMultiplierIdx * filterWidth * filterHeight * inputChannels +
-                                                  cInput * filterWidth * filterHeight +
-                                                  yFilter * filterWidth +
-                                                  xFilter;
+                                    cInput = cOutput / depthMultiplier;
+                                    // Filter layout [1,H,W,O]: the output channel is the
+                                    // innermost dimension, so filterDepth == outputChannels.
+                                    filterIndex = yFilter * filterWidth * outputChannels +
+                                                  xFilter * outputChannels +
+                                                  cOutput;
                                 }
                                 else
                                 {
diff --git a/src/backends/reference/workloads/Decoders.hpp b/src/backends/reference/workloads/Decoders.hpp
index 0b3f360..cd0dc5d 100644
--- a/src/backends/reference/workloads/Decoders.hpp
+++ b/src/backends/reference/workloads/Decoders.hpp
@@ -20,11 +20,7 @@
 
 inline std::unique_ptr<Decoder<float>> MakeSigned32PerAxisDecoder(const TensorInfo& info, const void* data)
 {
-    auto params = armnnUtils::GetPerAxisParams(info);
-    return std::make_unique<ScaledInt32PerAxisDecoder>(
-        static_cast<const int32_t*>(data),
-        params.second,
-        params.first);
+    return std::make_unique<ScaledInt32PerAxisDecoder>(static_cast<const int32_t*>(data), info);
 }
 
 inline std::unique_ptr<Decoder<float>> MakeSigned32Decoder(const TensorInfo& info, const void* data)
@@ -75,10 +71,7 @@
         case armnn::DataType::QuantizedSymm8PerAxis:
         {
-            std::pair<unsigned int, std::vector<float>> params = armnnUtils::GetPerAxisParams(info);
-            return std::make_unique<QSymm8PerAxisDecoder>(
-                static_cast<const int8_t*>(data),
-                params.second,
-                params.first);
+            return std::make_unique<QSymm8PerAxisDecoder>(static_cast<const int8_t*>(data), info);
         }
         ARMNN_NO_DEPRECATE_WARN_END
         case DataType::QAsymmS8:
@@ -123,10 +116,7 @@
             if (info.HasPerAxisQuantization())
             {
-                std::pair<unsigned int, std::vector<float>> params = armnnUtils::GetPerAxisParams(info);
-                return std::make_unique<QSymm8PerAxisDecoder>(
-                    static_cast<const int8_t*>(data),
-                    params.second,
-                    params.first);
+                return std::make_unique<QSymm8PerAxisDecoder>(static_cast<const int8_t*>(data), info);
             }
             else
             {
diff --git a/src/backends/reference/workloads/TransposeConvolution2d.cpp b/src/backends/reference/workloads/TransposeConvolution2d.cpp
index 7408e92..a1a6cba 100644
--- a/src/backends/reference/workloads/TransposeConvolution2d.cpp
+++ b/src/backends/reference/workloads/TransposeConvolution2d.cpp
@@ -137,7 +137,7 @@
         {
             for (unsigned int dOutput = 0u; dOutput < outputDepth; ++dOutput)
             {
-                rBiasesDecoder.SetIndex(dOutput, dOutput);
+                rBiasesDecoder[dOutput];
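+                // operator[] also updates the decoder's internal axis index,
+                // replacing the removed SetIndex(index, axisIndex).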
                 for (unsigned int yOutput = 0u; yOutput < outputHeight; ++yOutput)
                 {
                     for (unsigned int xOutput = 0u; xOutput < outputWidth; ++xOutput)