MLCE-418 Reduce layer does not support multiple axes

 * Added backend-specific optimization to chain a new Reduce layer
   for each axis, simulating the behaviour of a single layer with
   multiple axes (see the usage sketch below).
 * Added function to calculate reduced output shape.
 * Added unit tests.

Signed-off-by: Matthew Sloyan <matthew.sloyan@arm.com>
Change-Id: I180b0b111b7bcf3d0c283f1db0b82d5f17757682
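
For context, here is a minimal sketch of the user-facing behaviour this enables
(shapes, layer names and the Sum operation are illustrative, not taken from the
patch): a single Reduce layer with several axes can now run on the ACL backends,
which decompose it into a chain of single-axis reduces during optimization.

```cpp
#include <armnn/Descriptors.hpp>
#include <armnn/INetwork.hpp>
#include <armnn/Tensor.hpp>

armnn::INetworkPtr BuildMultiAxisReduce()
{
    using namespace armnn;
    INetworkPtr network = INetwork::Create();

    ReduceDescriptor descriptor;
    descriptor.m_vAxis           = { 1, 2 };   // multiple axes on one layer
    descriptor.m_KeepDims        = false;
    descriptor.m_ReduceOperation = ReduceOperation::Sum;

    IConnectableLayer* input  = network->AddInputLayer(0);
    IConnectableLayer* reduce = network->AddReduceLayer(descriptor, "reduce");
    IConnectableLayer* output = network->AddOutputLayer(0);

    input->GetOutputSlot(0).SetTensorInfo(TensorInfo(TensorShape({ 2, 3, 4 }), DataType::Float32));
    // Reducing axes {1,2} of {2,3,4} with keepDims=false leaves shape {2}.
    reduce->GetOutputSlot(0).SetTensorInfo(TensorInfo(TensorShape({ 2 }), DataType::Float32));

    input->GetOutputSlot(0).Connect(reduce->GetInputSlot(0));
    reduce->GetOutputSlot(0).Connect(output->GetInputSlot(0));

    return network;
}
```
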
diff --git a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
index a0fca46..9439ddb 100644
--- a/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeSubgraphUtils.hpp
@@ -6,6 +6,9 @@
 #pragma once
 
 #include <armnn/backends/OptimizationViews.hpp>
+#include <armnn/utility/Assert.hpp>
+
+#include <aclCommon/ArmComputeUtils.hpp>
 
 namespace armnn
 {
@@ -147,4 +150,86 @@
     return replacementLayer;
 }
 
+//
+// If a Reduce layer has multiple axes, add a new Reduce layer for each axis to simulate the
+// same behaviour, as only one axis is currently supported.
+//
+template<typename LayerType>
+void ChainReduceLayers(OptimizationViews& optimizationViews,
+                       LayerType* baseLayer,
+                       ReduceDescriptor& reduceDescriptor)
+{
+    // If the layer has a single axis, don't chain layers.
+    if (!reduceDescriptor.m_vAxis.empty() && reduceDescriptor.m_vAxis.size() > 1)
+    {
+        // Save base layer output shape to compare against the output of the final layer added.
+        const TensorInfo baseLayerInfo = baseLayer->GetOutputSlot(0).GetTensorInfo();
+
+        // Vector of new chained layers, used for substitution.
+        std::vector<Layer*> layers;
+
+        // Vector of axes so each layer is reshaped correctly.
+        std::vector<uint32_t> reduceAxis;
+        unsigned int recalculateAxis = 0;
+
+        for (unsigned int i = 0; i != reduceDescriptor.m_vAxis.size(); ++i)
+        {
+            // Get TensorInfo to populate subsequent layers with.
+            TensorInfo layerInfoToModify = baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo();
+
+            reduceAxis.emplace_back(reduceDescriptor.m_vAxis[i]);
+
+            // Calculate new shape based on the axes.
+            const TensorShape& reducedShape = ComputeReductionTensorShape(layerInfoToModify,
+                                                                          reduceAxis,
+                                                                          reduceDescriptor.m_KeepDims);
+            layerInfoToModify.SetShape(reducedShape);
+
+            // Create a vector for the single axis to be assigned to the descriptor.
+            // If keepDims is not set, adjust the axis for the dimensions removed by earlier chained layers.
+            std::vector<uint32_t> singleAxis(1, reduceDescriptor.m_vAxis[i] - recalculateAxis);
+
+            // Create a descriptor and assign single axis.
+            ReduceDescriptor newReduceDescriptor = baseLayer->GetParameters();
+            newReduceDescriptor.m_vAxis.assign(singleAxis.begin(), singleAxis.end());
+
+            // Add new layer to graph.
+            std::string layerName = "reduce_layer_" + std::to_string(i);
+            Layer* replacementLayer = optimizationViews.GetGraph().AddLayer<LayerType>(newReduceDescriptor,
+                                                                                       layerName.c_str());
+
+            // Connect previous layer with new layer.
+            // The first and last layer will be connected when the subgraph is replaced.
+            if (!layers.empty())
+            {
+                layers[i - 1]->GetOutputSlot(0).Connect(replacementLayer->GetInputSlot(0));
+            }
+
+            // Set updated tensorInfo for new layer.
+            replacementLayer->GetOutputSlot(0).SetTensorInfo(layerInfoToModify);
+
+            if (!reduceDescriptor.m_KeepDims)
+            {
+                recalculateAxis++;
+            }
+
+            layers.emplace_back(replacementLayer);
+        }
+
+        // Check if the TensorInfo from the last layer equals the inferred output from the original layer.
+        ARMNN_ASSERT(baseLayerInfo == layers.back()->GetOutputSlot().GetTensorInfo());
+
+        std::list<Layer*> replacementLayers(layers.begin(), layers.end());
+
+        // Substitute new chained subgraph for original reduce layer.
+        SubgraphView substitutionSubgraph(baseLayer);
+        SubgraphView replacementSubgraph(CreateInputsFrom({replacementLayers.front()}),
+                                         CreateOutputsFrom({replacementLayers.back()}),
+                                         std::move(replacementLayers));
+
+        optimizationViews.AddSubstitution({substitutionSubgraph, replacementSubgraph});
+
+    }
+}
+
 } // namespace armnn
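
The axis arithmetic in ChainReduceLayers is the subtle part: when keepDims is
false, each chained reduce removes a dimension, so every later axis must be
shifted down by one. A standalone sketch of just that adjustment (plain C++,
not part of the patch; it assumes m_vAxis is sorted ascending, as the
optimization does):

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

int main()
{
    const std::vector<uint32_t> axes = { 1, 2, 3 }; // original multi-axis descriptor
    const bool keepDims = false;

    uint32_t recalculateAxis = 0;
    for (uint32_t axis : axes)
    {
        // The axis handed to the i-th chained layer.
        std::cout << "chained reduce on axis " << axis - recalculateAxis << "\n";
        if (!keepDims)
        {
            ++recalculateAxis; // one dimension fewer from here on
        }
    }
    // Prints 1, 1, 1: each chained layer reduces what has become axis 1.
}
```
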
diff --git a/src/backends/aclCommon/ArmComputeUtils.hpp b/src/backends/aclCommon/ArmComputeUtils.hpp
index d9efab2..5bc5abc 100644
--- a/src/backends/aclCommon/ArmComputeUtils.hpp
+++ b/src/backends/aclCommon/ArmComputeUtils.hpp
@@ -7,6 +7,7 @@
 #include <armnn/Descriptors.hpp>
 #include <armnn/Tensor.hpp>
 #include <armnn/utility/Assert.hpp>
+#include <armnn/utility/NumericCast.hpp>
 #include <backendsCommon/WorkloadData.hpp>
 
 #include <arm_compute/core/Types.h>
@@ -267,4 +268,58 @@
     }
 }
 
+/// Function to compute the output tensor shape based on the axes and if keepDims is set.
+inline const TensorShape ComputeReductionTensorShape(const armnn::TensorInfo& input,
+                                                     const std::vector<uint32_t>& vAxis,
+                                                     const bool keepDims)
+{
+    unsigned int rank = input.GetNumDimensions();
+    unsigned int outputRank = 0;
+
+    // Calculate the output rank.
+    if (keepDims)
+    {
+        outputRank = rank;
+    }
+    else if (vAxis.empty())
+    {
+        outputRank = 1;
+    }
+    else if (vAxis.size() > input.GetNumDimensions())
+    {
+        throw LayerValidationException("ReduceLayer: Dimensions to reduce cannot be bigger than input dimensions");
+    }
+    else
+    {
+        outputRank = input.GetNumDimensions() - armnn::numeric_cast<unsigned int>(vAxis.size());
+        if (outputRank == 0)
+        {
+            outputRank = 1;
+        }
+    }
+
+    std::vector<unsigned int> dimSizes(outputRank, 1);
+    if (!vAxis.empty())
+    {
+        // Skip the dimension that has been reduced unless keepDims is true.
+        unsigned int outputIndex = 0;
+        for (unsigned int i = 0; i < input.GetNumDimensions(); ++i)
+        {
+            if (std::find(vAxis.begin(), vAxis.end(), i) == vAxis.end())
+            {
+                dimSizes[outputIndex] = armnn::numeric_cast<unsigned int>(input.GetShape()[i]);
+                ++outputIndex;
+            }
+            else if (keepDims)
+            {
+                dimSizes[outputIndex] = 1;
+                ++outputIndex;
+            }
+        }
+    }
+
+    const TensorShape inferredShape = TensorShape(outputRank, dimSizes.data());
+    return inferredShape;
+}
+
 } // namespace armnn
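
A hedged usage sketch for ComputeReductionTensorShape, with the expected shapes
worked out by hand from the logic above (not taken from the patch's unit tests):

```cpp
#include <aclCommon/ArmComputeUtils.hpp>
#include <armnn/Tensor.hpp>

void ShapeExamples()
{
    using namespace armnn;
    const TensorInfo input(TensorShape({ 2, 3, 4 }), DataType::Float32);

    // keepDims = true: reduced axes are kept as size-1 dimensions.
    TensorShape a = ComputeReductionTensorShape(input, { 1 }, true);        // {2, 1, 4}

    // keepDims = false: reduced axes are removed entirely.
    TensorShape b = ComputeReductionTensorShape(input, { 1 }, false);       // {2, 4}

    // Reducing every axis with keepDims = false collapses to rank 1.
    TensorShape c = ComputeReductionTensorShape(input, { 0, 1, 2 }, false); // {1}
}
```
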
diff --git a/src/backends/cl/ClBackend.cpp b/src/backends/cl/ClBackend.cpp
index f97cb4b..92a06aa 100644
--- a/src/backends/cl/ClBackend.cpp
+++ b/src/backends/cl/ClBackend.cpp
@@ -29,6 +29,7 @@
 #include "workloads/ClDivisionWorkload.hpp"
 #include "workloads/ClFullyConnectedWorkload.hpp"
 #include "workloads/ClMultiplicationWorkload.hpp"
+#include "workloads/ClReduceWorkload.hpp"
 #include "workloads/ClSubtractionWorkload.hpp"
 
 #include <Optimizer.hpp>
@@ -188,7 +189,8 @@
         if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d
             || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected
             || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication
-            || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division)
+            || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division
+            || base.GetType() == LayerType::Reduce)
             && (base.GetAdditionalInformation<ActivationDescriptor>() == nullptr))
         {
             for (auto output = base.BeginOutputSlots(); output != base.EndOutputSlots(); ++output)
@@ -412,6 +414,26 @@
                                 }
                             }
                         }
+
+                        // Separate check for Reduce, as it isn't fused with an activation layer.
+                        if (base.GetType() == LayerType::Reduce)
+                        {
+                            ReduceLayer* baseLayer = PolymorphicDowncast<ReduceLayer*>(&base);
+
+                            // Get params from base layer
+                            ReduceDescriptor reduceDescriptor = baseLayer->GetParameters();
+
+                            arm_compute::Status status = ClReduceWorkloadValidate(
+                                    baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
+                                    baseLayer->GetOutputSlot(0).GetTensorInfo(),
+                                    reduceDescriptor);
+
+                            if (status)
+                            {
+                                ChainReduceLayers<ReduceLayer>(optimizationViews, baseLayer, reduceDescriptor);
+                                untouched.erase(baseLayer->GetGuid());
+                            }
+                        }
                     }
                 }
             }
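
The hook above runs during backend optimization; a minimal sketch (error
handling omitted) of selecting GpuAcc so that ClBackend::OptimizeSubgraphView,
and with it the new Reduce handling, is exercised:

```cpp
#include <armnn/BackendId.hpp>
#include <armnn/INetwork.hpp>
#include <armnn/IRuntime.hpp>

armnn::IOptimizedNetworkPtr OptimizeForGpuAcc(armnn::INetwork& network)
{
    using namespace armnn;
    IRuntime::CreationOptions options;
    IRuntimePtr runtime = IRuntime::Create(options);

    // "GpuAcc" selects the Cl backend; "CpuAcc" would exercise the
    // equivalent Neon path added further down in this patch.
    std::vector<BackendId> backends = { BackendId("GpuAcc") };
    return Optimize(network, backends, runtime->GetDeviceSpec());
}
```
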
diff --git a/src/backends/cl/workloads/ClReduceWorkload.cpp b/src/backends/cl/workloads/ClReduceWorkload.cpp
index 6f594ff..0ad6259 100644
--- a/src/backends/cl/workloads/ClReduceWorkload.cpp
+++ b/src/backends/cl/workloads/ClReduceWorkload.cpp
@@ -20,23 +20,52 @@
                                              const ReduceDescriptor& desc)
 {
     const arm_compute::TensorInfo aclInputInfo  = armcomputetensorutils::BuildArmComputeTensorInfo(input);
-    const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
-    if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1)
-    {
-        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
-                                   "ClReduceWorkload: Reduction is supported only on 1 axis.");
-    }
 
     arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(),
                                                                           input.GetNumDimensions(),
                                                                           desc.m_vAxis);
 
+    // As ACL only supports one axis, validate the layer for each axis if more than one is present.
+    if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1)
+    {
+        arm_compute::Status status;
 
-    return arm_compute::CLReductionOperation::validate(&aclInputInfo,
-                                                       &aclOutputInfo,
-                                                       static_cast<unsigned int>(coords[0]),
-                                                       ConvertReductionOperationToAcl(desc),
-                                                       desc.m_KeepDims);
+        for (unsigned int i = 0; i != desc.m_vAxis.size(); ++i)
+        {
+            TensorInfo inputToModify = input;
+            std::vector<uint32_t> singleAxis(1, desc.m_vAxis[i]);
+
+            // Calculate the output shape of the input when reduced over this single axis.
+            // The output TensorInfo inferred for the original layer is reduced over multiple
+            // axes, which would fail validation, as only one axis is supported.
+            const TensorShape& reducedShape = ComputeReductionTensorShape(inputToModify, singleAxis, desc.m_KeepDims);
+            inputToModify.SetShape(reducedShape);
+
+            const arm_compute::TensorInfo aclOutputInfoModified =
+                    armcomputetensorutils::BuildArmComputeTensorInfo(inputToModify);
+
+            status = arm_compute::CLReductionOperation::validate(&aclInputInfo,
+                                                                 &aclOutputInfoModified,
+                                                                 static_cast<unsigned int>(coords[i]),
+                                                                 ConvertReductionOperationToAcl(desc),
+                                                                 desc.m_KeepDims);
+            if (!status)
+            {
+                break;
+            }
+        }
+        return status;
+    }
+    else
+    {
+        const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+        return arm_compute::CLReductionOperation::validate(&aclInputInfo,
+                                                           &aclOutputInfo,
+                                                           static_cast<unsigned int>(coords[0]),
+                                                           ConvertReductionOperationToAcl(desc),
+                                                           desc.m_KeepDims);
+    }
 }
 
 ClReduceWorkload::ClReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info)
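
A sketch of probing the new multi-axis validation directly (the include path
for this backend-internal header is an assumption, and the shapes are
illustrative):

```cpp
#include <armnn/Descriptors.hpp>
#include <armnn/Tensor.hpp>
#include <cl/workloads/ClReduceWorkload.hpp> // backend-internal header (assumed path)

bool MultiAxisReduceSupported()
{
    using namespace armnn;
    const TensorInfo input(TensorShape({ 2, 3, 4 }), DataType::Float32);
    const TensorInfo output(TensorShape({ 2 }), DataType::Float32); // axes {1,2}, keepDims=false

    ReduceDescriptor descriptor;
    descriptor.m_vAxis           = { 1, 2 };
    descriptor.m_KeepDims        = false;
    descriptor.m_ReduceOperation = ReduceOperation::Sum;

    // Each axis is validated individually against ACL; the layer is only
    // chainable if every single-axis reduction passes.
    arm_compute::Status status = ClReduceWorkloadValidate(input, output, descriptor);
    return static_cast<bool>(status);
}
```
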
diff --git a/src/backends/neon/NeonBackend.cpp b/src/backends/neon/NeonBackend.cpp
index a1299fb..6d5eab0 100644
--- a/src/backends/neon/NeonBackend.cpp
+++ b/src/backends/neon/NeonBackend.cpp
@@ -29,6 +29,7 @@
 #include "workloads/NeonDivisionWorkload.hpp"
 #include "workloads/NeonFullyConnectedWorkload.hpp"
 #include "workloads/NeonMultiplicationWorkload.hpp"
+#include "workloads/NeonReduceWorkload.hpp"
 #include "workloads/NeonSubtractionWorkload.hpp"
 
 #include <Optimizer.hpp>
@@ -164,7 +165,8 @@
         if ((base.GetType() == LayerType::DepthwiseConvolution2d || base.GetType() == LayerType::Convolution2d
              || base.GetType() == LayerType::BatchNormalization || base.GetType() == LayerType::FullyConnected
              || base.GetType() == LayerType::Addition || base.GetType() == LayerType::Multiplication
-             || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division)
+             || base.GetType() == LayerType::Subtraction || base.GetType() == LayerType::Division
+             || base.GetType() == LayerType::Reduce)
             && (base.GetAdditionalInformation<ActivationDescriptor>() == nullptr))
         {
             for (auto output = base.BeginOutputSlots(); output != base.EndOutputSlots(); ++output)
@@ -389,6 +391,26 @@
                                 }
                             }
                         }
+
+                        // Separate check for Reduce, as it isn't fused with an activation layer.
+                        if (base.GetType() == LayerType::Reduce)
+                        {
+                            ReduceLayer* baseLayer = PolymorphicDowncast<ReduceLayer*>(&base);
+
+                            // Get params from base layer
+                            ReduceDescriptor reduceDescriptor = baseLayer->GetParameters();
+
+                            arm_compute::Status status = NeonReduceWorkloadValidate(
+                                    baseLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(),
+                                    baseLayer->GetOutputSlot(0).GetTensorInfo(),
+                                    reduceDescriptor);
+
+                            if (status)
+                            {
+                                ChainReduceLayers<ReduceLayer>(optimizationViews, baseLayer, reduceDescriptor);
+                                untouched.erase(baseLayer->GetGuid());
+                            }
+                        }
                     }
                 }
             }
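
The Neon workload changes below mirror the Cl ones. As a final standalone
sketch (not from the patch's tests), here is why the chaining is numerically
equivalent for Sum: reducing axis 0 and then the remaining axis matches one
reduction over both axes:

```cpp
#include <array>
#include <iostream>

int main()
{
    // Input {2,2}; reduce-sum over axes {0,1} should give 1+2+3+4 = 10.
    std::array<std::array<float, 2>, 2> input = {{{ 1.f, 2.f }, { 3.f, 4.f }}};

    // First chained layer: sum over axis 0 -> shape {2}.
    std::array<float, 2> afterAxis0 = { input[0][0] + input[1][0],
                                        input[0][1] + input[1][1] };

    // Second chained layer: the old axis 1 is now axis 0 -> scalar.
    float afterAxis1 = afterAxis0[0] + afterAxis0[1];

    std::cout << afterAxis1 << "\n"; // 10
}
```
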
diff --git a/src/backends/neon/workloads/NeonReduceWorkload.cpp b/src/backends/neon/workloads/NeonReduceWorkload.cpp
index 0e1b46a..6125f36 100644
--- a/src/backends/neon/workloads/NeonReduceWorkload.cpp
+++ b/src/backends/neon/workloads/NeonReduceWorkload.cpp
@@ -21,22 +21,52 @@
                                                const ReduceDescriptor& desc)
 {
     const arm_compute::TensorInfo aclInputInfo  = armcomputetensorutils::BuildArmComputeTensorInfo(input);
-    const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
-    if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1)
-    {
-        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
-                                   "NeonReduceWorkload: Reduction is supported only on 1 axis.");
-    }
 
     arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclInputInfo.num_dimensions(),
                                                                           input.GetNumDimensions(),
                                                                           desc.m_vAxis);
 
-    return arm_compute::NEReductionOperation::validate(&aclInputInfo,
-                                                       &aclOutputInfo,
-                                                       static_cast<unsigned int>(coords[0]),
-                                                       ConvertReductionOperationToAcl(desc),
-                                                       desc.m_KeepDims);
+    // As ACL only supports one axis, validate the layer for each axis if more than one is present.
+    if (!desc.m_vAxis.empty() && desc.m_vAxis.size() > 1)
+    {
+        arm_compute::Status status;
+
+        for (unsigned int i = 0; i != desc.m_vAxis.size(); ++i)
+        {
+            TensorInfo inputToModify = input;
+            std::vector<uint32_t> singleAxis(1, desc.m_vAxis[i]);
+
+            // Calculate the output shape of the input when reduced over this single axis.
+            // The output TensorInfo inferred for the original layer is reduced over multiple
+            // axes, which would fail validation, as only one axis is supported.
+            const TensorShape& reducedShape = ComputeReductionTensorShape(inputToModify, singleAxis, desc.m_KeepDims);
+            inputToModify.SetShape(reducedShape);
+
+            const arm_compute::TensorInfo aclOutputInfoModified =
+                    armcomputetensorutils::BuildArmComputeTensorInfo(inputToModify);
+
+            status = arm_compute::NEReductionOperation::validate(&aclInputInfo,
+                                                                 &aclOutputInfoModified,
+                                                                 static_cast<unsigned int>(coords[i]),
+                                                                 ConvertReductionOperationToAcl(desc),
+                                                                 desc.m_KeepDims);
+            if (!status)
+            {
+                break;
+            }
+        }
+        return status;
+    }
+    else
+    {
+        const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output);
+
+        return arm_compute::NEReductionOperation::validate(&aclInputInfo,
+                                                           &aclOutputInfo,
+                                                           static_cast<unsigned int>(coords[0]),
+                                                           ConvertReductionOperationToAcl(desc),
+                                                           desc.m_KeepDims);
+    }
 }
 
 NeonReduceWorkload::NeonReduceWorkload(const ReduceQueueDescriptor& descriptor, const WorkloadInfo& info)
@@ -50,6 +80,7 @@
     arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(input.info()->num_dimensions(),
                                                                           info.m_InputTensorInfos[0].GetNumDimensions(),
                                                                           m_Data.m_Parameters.m_vAxis);
+
     m_Layer.configure(&input,
                       &output,
                       static_cast<unsigned int>(coords[0]),