COMPMID-2579: Fuse batch normalization with convolution and depthwise convolution at graph level on NEON
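
For reference, this patch lifts the CL-only restriction in NodeFusionMutator so
that convolution + batch normalization and depthwise convolution + batch
normalization are also fused on NEON, and it passes the graph memory manager
down to the fused backend functions. The fusion itself amounts to folding the
batch normalization parameters into the convolution weights and bias. A minimal
standalone sketch of that folding (illustrative only; the names, signature and
data layout are assumptions, not the library's actual kernel code):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Fold per-channel batch normalization (mean, var, gamma, beta, epsilon)
    // into convolution weights and bias:
    //   w'[c] = w[c] * gamma[c] / sqrt(var[c] + epsilon)
    //   b'[c] = (b[c] - mean[c]) * gamma[c] / sqrt(var[c] + epsilon) + beta[c]
    void fold_bn_into_conv(std::vector<float>       &weights, // flattened [C_out][elems_per_filter]
                           std::vector<float>       &bias,    // [C_out]
                           const std::vector<float> &mean,
                           const std::vector<float> &var,
                           const std::vector<float> &gamma,
                           const std::vector<float> &beta,
                           float epsilon, std::size_t elems_per_filter)
    {
        for(std::size_t c = 0; c < bias.size(); ++c)
        {
            const float scale = gamma[c] / std::sqrt(var[c] + epsilon);
            for(std::size_t i = 0; i < elems_per_filter; ++i)
            {
                weights[c * elems_per_filter + i] *= scale;
            }
            bias[c] = (bias[c] - mean[c]) * scale + beta[c];
        }
    }

Once the parameters are folded, the BatchNormalizationLayerNode can be dropped
from the graph and the (depthwise) convolution runs with the adjusted weights
and bias, which is why the mutator no longer needs to gate the fusion on the
target.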

Change-Id: Ib263a680bbd2dc1a4947102ee8d6da76b95f02bf
Signed-off-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2252
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index ee257e3..02bfe9d 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -174,11 +174,12 @@
  * @tparam TargetInfo                      Target-specific information
  *
  * @param[in] node Node to create the backend function for
+ * @param[in] ctx  Graph context
  *
  * @return Backend batch normalization layer function
  */
 template <typename FusedLayerTypes, typename TargetInfo>
-std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_layer(FusedConvolutionBatchNormalizationNode &node)
+std::unique_ptr<IFunction> create_fused_convolution_batch_normalization_layer(FusedConvolutionBatchNormalizationNode &node, GraphContext &ctx)
 {
     validate_node<TargetInfo>(node, 7 /* expected inputs */, 1 /* expected outputs */);
 
@@ -199,9 +200,16 @@
     const ActivationLayerInfo fused_act  = node.fused_activation();
     const float               epsilon    = node.epsilon();
 
+    // Create and configure function (we assume that functions have been validated before creation)
+    std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, TargetInfo::TargetType);
+    std::unique_ptr<IFunction>      func;
+    std::string                     func_name;
+
+    using FType = FusedConvolutionBatchNormalizationFunction<TargetInfo, FusedLayerTypes>;
+
     // Create and configure function
-    auto func = support::cpp14::make_unique<FusedConvolutionBatchNormalizationFunction<TargetInfo, FusedLayerTypes>>();
-    func->configure(input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, num_groups, fast_math, fused_act);
+    std::tie(func, func_name) = create_named_memory_managed_function<FType>(
+                                    std::string("FusedConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, num_groups, fast_math, fused_act);
 
     // Log info
     ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
@@ -214,7 +222,7 @@
                                << " Output shape: " << output->info()->tensor_shape()
                                << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
                                << std::endl);
-    return std::move(func);
+    return func;
 }
 
 /** Create a backend fused depthwise convolution batch normalization layer function
@@ -223,11 +231,12 @@
  * @tparam TargetInfo                  Target-specific information
  *
  * @param[in] node Node to create the backend function for
+ * @param[in] ctx  Graph context
  *
  * @return Backend fused depthwise convolution batch normalization layer function
  */
 template <typename FusedLayerTypes, typename TargetInfo>
-std::unique_ptr<IFunction> create_fused_depthwise_convolution_batch_normalization_layer(FusedDepthwiseConvolutionBatchNormalizationNode &node)
+std::unique_ptr<IFunction> create_fused_depthwise_convolution_batch_normalization_layer(FusedDepthwiseConvolutionBatchNormalizationNode &node, GraphContext &ctx)
 {
     validate_node<TargetInfo>(node, 7 /* expected inputs */, 1 /* expected outputs */);
 
@@ -247,9 +256,16 @@
     const ActivationLayerInfo fused_act        = node.fused_activation();
     const float               epsilon          = node.epsilon();
 
+    // Create and configure function (we assume that functions have been validated before creation)
+    std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, TargetInfo::TargetType);
+    std::unique_ptr<IFunction>      func;
+    std::string                     func_name;
+
+    using FType = FusedDepthwiseConvolutionBatchNormalizationFunction<TargetInfo, FusedLayerTypes>;
+
     // Create and configure function
-    auto func = support::cpp14::make_unique<FusedDepthwiseConvolutionBatchNormalizationFunction<TargetInfo, FusedLayerTypes>>();
-    func->configure(input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, depth_multiplier, fused_act);
+    std::tie(func, func_name) = create_named_memory_managed_function<FType>(
+                                    std::string("FusedDepthwiseConvolutionBatchNormalizationLayer"), mm, input, weights, biases, output, mean, var, beta, gamma, epsilon, conv_info, depth_multiplier, fused_act);
 
     // Log info
     ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
@@ -262,7 +278,7 @@
                                << " Output shape: " << output->info()->tensor_shape()
                                << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
                                << std::endl);
-    return std::move(func);
+    return func;
 }
 
 /** Create a backend bounding box transform layer function
diff --git a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
index a6da76b..0af3abc 100644
--- a/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
+++ b/arm_compute/graph/backends/FusedConvolutionBatchNormalizationFunction.h
@@ -42,8 +42,8 @@
     using TensorType         = typename TargetInfo::TensorType;
     using TensorConcreteType = typename TargetInfo::TensorConcreteType;
 
-    FusedConvolutionBatchNormalizationFunction()
-        : _conv_layer(), _fused_batch_norm_layer(), _fused_bias(), _is_prepared(false)
+    FusedConvolutionBatchNormalizationFunction(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
+        : _conv_layer(memory_manager), _fused_batch_norm_layer(), _fused_bias(), _is_prepared(false)
     {
     }
 
diff --git a/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h b/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
index 6f70d3c..14474f4 100644
--- a/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
+++ b/arm_compute/graph/backends/FusedDepthwiseConvolutionBatchNormalizationFunction.h
@@ -42,8 +42,8 @@
     using TensorType         = typename TargetInfo::TensorType;
     using TensorConcreteType = typename TargetInfo::TensorConcreteType;
 
-    FusedDepthwiseConvolutionBatchNormalizationFunction()
-        : _depth_conv_layer(), _fused_batch_norm_layer(), _fused_bias(), _is_prepared(false)
+    FusedDepthwiseConvolutionBatchNormalizationFunction(std::shared_ptr<IMemoryManager> memory_manager = nullptr)
+        : _depth_conv_layer(memory_manager), _fused_batch_norm_layer(), _fused_bias(), _is_prepared(false)
     {
     }
 
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index d53b634..ca6c837 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/backends/FunctionHelpers.h"
 #include "arm_compute/runtime/CL/CLFunctions.h"
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
@@ -254,9 +255,9 @@
         case NodeType::FullyConnectedLayer:
             return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
         case NodeType::FusedConvolutionBatchNormalizationLayer:
-            return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node));
+            return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
         case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
-            return detail::create_fused_depthwise_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node));
+            return detail::create_fused_depthwise_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
         case NodeType::GenerateProposalsLayer:
             return detail::create_generate_proposals_layer<CLGenerateProposalsLayer, CLTargetInfo>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node), ctx);
         case NodeType::NormalizationLayer:
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
index 13543db..b9562c7 100644
--- a/src/graph/backends/GLES/GCFunctionsFactory.cpp
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/backends/FunctionHelpers.h"
 #include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
 
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index 12f44e3..e0fd32f 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -218,9 +218,9 @@
         case NodeType::FullyConnectedLayer:
             return detail::create_fully_connected_layer<NEFullyConnectedLayer, NETargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
         case NodeType::FusedConvolutionBatchNormalizationLayer:
-            return detail::create_fused_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node));
+            return detail::create_fused_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
         case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
-            return detail::create_fused_depthwise_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node));
+            return detail::create_fused_depthwise_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
         case NodeType::NormalizationLayer:
             return detail::create_normalization_layer<NENormalizationLayer, NETargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
         case NodeType::PermuteLayer:
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 61d9479..abd6436 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -309,21 +309,12 @@
         return (output_qasymm8 && same_qinfo) || !output_qasymm8;
     };
 
-    Target target = g.nodes()[0].get()->output(0)->desc().target;
-
     // Fusion mutations
     detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
     detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
     detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
-
-    // Currently fuse batch normalization brings performance uplift only on OpenCL with FP32 data type
-    // TODO (COMPMID-2524): Fuse batch normalization with convolution and depthwise convolution at graph level for NEON - FP32
-    if(target == Target::CL)
-    {
-        //Depthwise Convolution and Batch Normalization Fusion active only for CL
-        detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
-        detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
-    }
+    detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
+    detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
 }
 } // namespace graph
 } // namespace arm_compute