GitHub #667: Neon fold padding into average pool 2D quantization bug fix.

  * Originated from a GitHub issue: https://github.com/ARM-software/armnn/issues/667
  * Previously, Arm NN only supported the pool 2D operation when there was no
    padding on the pool2d; a Neon failure occurred when a pad layer was followed
    by an average pool 2D layer, because the fold-pad optimization merged them.
  * Here we prevent that folding optimization from firing for this special case
    and instead add it as a backend-specific optimization.

Signed-off-by: Cathal Corbett <cathal.corbett@arm.com>
Change-Id: Ia0fd90c3a6b4b9d29c81106f154617d2e893e26b
diff --git a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
index a26442c..766bf2d 100644
--- a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -9,6 +9,7 @@
 #include <armnn/utility/Assert.hpp>
 
 #include <aclCommon/ArmComputeUtils.hpp>
+#include <backendsCommon/SubgraphUtils.hpp>
 
 namespace armnn
 {
@@ -20,36 +21,6 @@
 // this helper only works if all layers where the inputs connect to are not selected
 //
 
-SubgraphView::IInputSlots CreateIInputsFrom(const std::vector<armnn::IConnectableLayer*>& layers)
-{
-    SubgraphView::IInputSlots result;
-    for (auto&& layer : layers)
-    {
-        for (unsigned int i = 0 ; i < layer->GetNumInputSlots(); ++i)
-        {
-            result.push_back(&(layer->GetInputSlot(i)));
-        }
-    }
-    return result;
-}
-
-//
-// this helper only works if all layers where the outputs connect to are not selected
-//
-
-SubgraphView::IOutputSlots CreateIOutputsFrom(const std::vector<armnn::IConnectableLayer*>& layers)
-{
-    SubgraphView::IOutputSlots result;
-    for (auto &&layer: layers)
-    {
-        for (unsigned int i = 0; i < layer->GetNumOutputSlots(); ++i)
-        {
-            result.push_back(&(layer->GetOutputSlot(i)));
-        }
-    }
-    return result;
-}
-
 bool checkDataTypeInputandOutput(const Layer& layer)
 {
     auto inputInfo = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
@@ -79,19 +50,6 @@
 
 } // namespace
 
-inline void ReportUntouchedLayers(OptimizationViews& optimizationViews, std::map<LayerGuid, Layer*> untouched)
-{
-    std::vector<Layer*> untouchedVector;
-    for (const auto& pair : untouched)
-    {
-        Layer* layer = pair.second;
-        SubgraphView subgraphView({layer},
-                                  CreateIInputsFrom({layer}),
-                                  CreateIOutputsFrom({layer}));
-        optimizationViews.AddUntouchedSubgraph(std::move(subgraphView));
-    }
-}
-
 template<typename LayerType>
 LayerType* FuseLayer(OptimizationViews& optimizationViews,
                      LayerType* baseLayer,
diff --git a/src/backends/backendsCommon/SubgraphUtils.hpp b/src/backends/backendsCommon/SubgraphUtils.hpp
new file mode 100644
index 0000000..bd3d698
--- /dev/null
+++ b/src/backends/backendsCommon/SubgraphUtils.hpp
@@ -0,0 +1,99 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <optimizations/FoldPadIntoLayer2d.hpp>
+
+namespace armnn
+{
+
+namespace
+{
+
+//
+// this helper only works if all layers where the inputs connect to are not selected
+//
+
+SubgraphView::IInputSlots CreateIInputsFrom(const std::vector<armnn::IConnectableLayer*>& layers)
+{
+    SubgraphView::IInputSlots result;
+    for (auto&& layer : layers)
+    {
+        for (unsigned int i = 0 ; i < layer->GetNumInputSlots(); ++i)
+        {
+            result.push_back(&(layer->GetInputSlot(i)));
+        }
+    }
+    return result;
+}
+
+//
+// this helper only works if all layers where the outputs connect to are not selected
+//
+
+SubgraphView::IOutputSlots CreateIOutputsFrom(const std::vector<armnn::IConnectableLayer*>& layers)
+{
+    SubgraphView::IOutputSlots result;
+    for (auto &&layer: layers)
+    {
+        for (unsigned int i = 0; i < layer->GetNumOutputSlots(); ++i)
+        {
+            result.push_back(&(layer->GetOutputSlot(i)));
+        }
+    }
+    return result;
+}
+
+}
+
+inline void ReportUntouchedLayers(OptimizationViews& optimizationViews, std::map<LayerGuid, Layer*> untouched)
+{
+    std::vector<Layer*> untouchedVector;
+    for (const auto& pair : untouched)
+    {
+        Layer* layer = pair.second;
+        SubgraphView subgraphView({layer},
+                                  CreateIInputsFrom({layer}),
+                                  CreateIOutputsFrom({layer}));
+        optimizationViews.AddUntouchedSubgraph(std::move(subgraphView));
+    }
+}
+
+template<typename LayerType>
+LayerType* FoldPadLayer(OptimizationViews& optimizationViews,
+                        LayerType* baseLayer,
+                        LayerType* replacementLayer,
+                        PadLayer* padLayer)
+{
+    SubgraphView substitutionSubgraph({padLayer, baseLayer},
+                                      CreateIInputsFrom({padLayer}),
+                                      CreateIOutputsFrom({baseLayer}));
+    SubgraphView replacementSubgraph(replacementLayer);
+
+    optimizationViews.AddSubstitution({substitutionSubgraph, replacementSubgraph});
+
+    return replacementLayer;
+}
+
+template<typename LayerType>
+LayerType* FoldPadIntoAveragePool2d(OptimizationViews& optimizationViews,
+                                    Pooling2dLayer* baseLayer,
+                                    Pooling2dDescriptor& poolDescriptor,
+                                    PadLayer* padLayer)
+{
+    IConnectableLayer* replacement =
+        optimizationViews.GetINetwork()->AddPooling2dLayer(poolDescriptor, "folded-pad-into-pool2d");
+    LayerType* replacementLayer = PolymorphicDowncast<LayerType*>(replacement);
+
+    // Substitute the original [pad, pool2d] pair with the single replacement layer.
+    FoldPadLayer(optimizationViews,
+                 baseLayer,
+                 replacementLayer,
+                 padLayer);
+
+    return replacementLayer;
+}
+
+} // namespace armnn
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index 1fe53de..d2e8fbf 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2022 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -566,6 +566,31 @@
                 untouched.erase(baseLayer->GetGuid());
             }
         }
+
+        // Special case to fold padding into average pooling 2d for quantized datatypes.
+        // Done as a backend-specific optimization because Neon does not support this special case.
+        if (base.GetType() == LayerType::Pooling2d)
+        {
+            Pooling2dLayer* baseLayer = PolymorphicDowncast<Pooling2dLayer*>(&base);
+            Pooling2dDescriptor poolingDescriptor = baseLayer->GetParameters();
+
+            if (baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayer().GetType() == LayerType::Pad)
+            {
+                PadLayer* padLayer = PolymorphicDowncast<PadLayer*>(
+                    &baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayer());
+                if (padLayer->GetOutputSlot(0).GetNumConnections() == 1 &&
+                    optimizations::pad_fold::TryFoldPadIntoLayer2d(padLayer->GetParameters(),
+                                                                   poolingDescriptor,
+                                                                   padLayer->GetOutputSlot().GetTensorInfo(),
+                                                                   true))
+                {
+                    FoldPadIntoAveragePool2d<Pooling2dLayer>(optimizationViews, baseLayer,
+                                                             poolingDescriptor, padLayer);
+                    untouched.erase(baseLayer->GetGuid());
+                    untouched.erase(padLayer->GetGuid());
+                }
+            }
+        }
     }
 
     if (optimizationViews.GetSubstitutions().empty())
diff --git a/src/backends/reference/RefBackend.cpp b/src/backends/reference/RefBackend.cpp
index a33a775..8c8879c 100644
--- a/src/backends/reference/RefBackend.cpp
+++ b/src/backends/reference/RefBackend.cpp
@@ -14,6 +14,7 @@
 #include <armnn/backends/IMemoryManager.hpp>
 #include <armnn/utility/PolymorphicDowncast.hpp>
 #include <backendsCommon/DefaultAllocator.hpp>
+#include <backendsCommon/SubgraphUtils.hpp>
 
 #include <Optimizer.hpp>
 
@@ -70,11 +71,61 @@
     return layerSupport;
 }
 
-OptimizationViews RefBackend::OptimizeSubgraphView(const SubgraphView& subgraph) const
+OptimizationViews RefBackend::OptimizeSubgraphView(const SubgraphView& subgraph,
+                                                   const ModelOptions& modelOptions) const
 {
-    OptimizationViews optimizationViews;
+    OptimizationViews optimizationViews(modelOptions);
 
-    optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
+    auto it = subgraph.endIConnectable();
+    std::map<LayerGuid, Layer*> untouched;
+
+    while (it != subgraph.beginIConnectable())
+    {
+        --it;
+        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
+        untouched.insert({base.GetGuid(), &base});
+    }
+
+    it = subgraph.endIConnectable();
+    while (it != subgraph.beginIConnectable())
+    {
+        --it;
+        Layer& base = *(PolymorphicDowncast<Layer*>(*it));
+
+        // Special case to fold padding into average pooling 2d for quantized datatypes.
+        // Done as a backend-specific optimization because Neon does not support this special case.
+        if (base.GetType() == LayerType::Pooling2d)
+        {
+            Pooling2dLayer* baseLayer = PolymorphicDowncast<Pooling2dLayer*>(&base);
+            Pooling2dDescriptor poolingDescriptor = baseLayer->GetParameters();
+
+            if (baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayer().GetType() == LayerType::Pad)
+            {
+                PadLayer* padLayer = PolymorphicDowncast<PadLayer*>(
+                    &baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetOwningLayer());
+                if (padLayer->GetOutputSlot(0).GetNumConnections() == 1 &&
+                    optimizations::pad_fold::TryFoldPadIntoLayer2d(padLayer->GetParameters(),
+                                                                   poolingDescriptor,
+                                                                   padLayer->GetOutputSlot().GetTensorInfo(),
+                                                                   true))
+                {
+                    FoldPadIntoAveragePool2d<Pooling2dLayer>(optimizationViews, baseLayer,
+                                                             poolingDescriptor, padLayer);
+                    untouched.erase(baseLayer->GetGuid());
+                    untouched.erase(padLayer->GetGuid());
+                }
+            }
+        }
+    }
+
+    if (optimizationViews.GetSubstitutions().empty())
+    {
+        optimizationViews.AddUntouchedSubgraph(SubgraphView(subgraph));
+    }
+    else
+    {
+        ReportUntouchedLayers(optimizationViews, untouched);
+    }
 
     return optimizationViews;
 }
diff --git a/src/backends/reference/RefBackend.hpp b/src/backends/reference/RefBackend.hpp
index 9828d09..ecbe4d5 100644
--- a/src/backends/reference/RefBackend.hpp
+++ b/src/backends/reference/RefBackend.hpp
@@ -50,7 +50,8 @@
 
     IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override;
 
-    OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph) const override;
+    OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph,
+                                           const ModelOptions& modelOptions) const override;
 
     std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;