COMPMID-1451: Fuse RELU, LU_BOUNDED_RELU with requantization in NEGEMMConvolutionLayer.

Change-Id: Iea5f2c5bcac8051c4c7655a6eabb2c43772eb31f
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/154104
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
diff --git a/src/graph/backends/GLES/GCFunctionsFactory.cpp b/src/graph/backends/GLES/GCFunctionsFactory.cpp
index 6268583..02a0567 100644
--- a/src/graph/backends/GLES/GCFunctionsFactory.cpp
+++ b/src/graph/backends/GLES/GCFunctionsFactory.cpp
@@ -120,8 +120,9 @@
         biases->info()->set_data_type(DataType::S32);
     }
 
-    const PadStrideInfo     conv_info      = node.convolution_info();
-    const ConvolutionMethod conv_algorithm = node.convolution_method();
+    const PadStrideInfo       conv_info      = node.convolution_info();
+    const ConvolutionMethod   conv_algorithm = node.convolution_method();
+    const ActivationLayerInfo fused_act      = node.fused_activation();
 
     // Create and configure function (we assume that functions have been validated before creation)
     std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, GCTargetInfo::TargetType);
@@ -132,13 +133,13 @@
     {
         std::tie(func, func_name) = create_named_function<GCConvolutionLayerFunctions::DirectConvolutionLayer>(
                                         std::string("DirectConvolutionLayer"),
-                                        input, weights, biases, output, conv_info);
+                                        input, weights, biases, output, conv_info, fused_act);
     }
     else
     {
         std::tie(func, func_name) = create_named_memory_managed_function<GCConvolutionLayerFunctions::GenericConvolutionLayer>(
                                         std::string("ConvolutionLayer"), mm,
-                                        input, weights, biases, output, conv_info);
+                                        input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1U, 1U), fused_act);
     }
 
     // Log info
@@ -149,6 +150,7 @@
                                << " Input shape: " << input->info()->tensor_shape()
                                << " Weights shape: " << weights->info()->tensor_shape()
                                << " Output shape: " << output->info()->tensor_shape()
+                               << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
                                << std::endl);
     return func;
 }
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index 286c890..e967c1b 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -97,8 +97,9 @@
         biases->info()->set_data_type(DataType::S32);
     }
 
-    const PadStrideInfo     conv_info      = node.convolution_info();
-    const ConvolutionMethod conv_algorithm = node.convolution_method();
+    const PadStrideInfo       conv_info      = node.convolution_info();
+    const ConvolutionMethod   conv_algorithm = node.convolution_method();
+    const ActivationLayerInfo fused_act      = node.fused_activation();
 
     // Create and configure function (we assume that functions have been validated before creation)
     std::shared_ptr<IMemoryManager> mm = get_memory_manager(ctx, Target::NEON);
@@ -107,22 +108,22 @@
     if(conv_algorithm == ConvolutionMethod::Direct)
     {
         std::tie(func, func_name) = create_named_memory_managed_function<NEDirectConvolutionLayer>(
-                                        std::string("DirectConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+                                        std::string("DirectConvolutionLayer"), mm, input, weights, biases, output, conv_info, fused_act);
     }
     else if(conv_algorithm == ConvolutionMethod::GEMM)
     {
         std::tie(func, func_name) = create_named_memory_managed_function<NEGEMMConvolutionLayer>(
-                                        std::string("GEMMConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+                                        std::string("GEMMConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1, 1), fused_act);
     }
     else if(conv_algorithm == ConvolutionMethod::Winograd)
     {
         std::tie(func, func_name) = create_named_memory_managed_function<NEWinogradConvolutionLayer>(
-                                        std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+                                        std::string("WinogradConvolutionLayer"), mm, input, weights, biases, output, conv_info, fused_act);
     }
     else
     {
         std::tie(func, func_name) = create_named_memory_managed_function<NEConvolutionLayer>(
-                                        std::string("ConvolutionLayer"), mm, input, weights, biases, output, conv_info);
+                                        std::string("ConvolutionLayer"), mm, input, weights, biases, output, conv_info, WeightsInfo(), Size2D(1, 1), fused_act);
     }
 
     // Log info
@@ -140,6 +141,7 @@
                                << " Input shape: " << input->info()->tensor_shape()
                                << " Weights shape: " << weights->info()->tensor_shape()
                                << " Output shape: " << output->info()->tensor_shape()
+                               << (fused_act.enabled() ? " " + to_string(fused_act.activation()) : "")
                                << std::endl);
     return func;
 }
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 82bfe25..7e66ce0 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -38,26 +38,24 @@
 {
 namespace detail
 {
-void fuse_batch_norm_with_activation(Graph &g)
+template <typename N>
+void fuse_node_with_activation(Graph &g, const std::set<Activation> &supported_fused_activations)
 {
-    // Supported activations when fusing
-    const std::set<Activation> supported_fused_activations = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU };
-
     // Not interested in the order of nodes
     for(auto &node : g.nodes())
     {
         // Check if the node is batch norm and not a branching node
-        if(node && node->type() == NodeType::BatchNormalizationLayer && node->output_edges().size() == 1)
+        if(node && node->type() == N::node_type && node->output_edges().size() == 1)
         {
             auto output_edge_id = *node->output_edges().begin();
             auto output_edge    = g.edge(output_edge_id);
             // Check if following node is an activation layer node
             if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == NodeType::ActivationLayer))
             {
-                auto *bn_node  = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->producer());
+                auto *n_node   = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->producer());
                 auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(output_edge->consumer());
 
-                ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || bn_node->output(0) == nullptr);
+                ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr);
 
                 // Check if activation is supported for fusion
                 if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
@@ -65,17 +63,17 @@
                     continue;
                 }
 
-                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing Batch Normalization node with ID : " << output_edge->producer_id()
+                ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id()
                                               << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
 
                 // Prevent fusion if batch normalization node has an output accessor
-                if(bn_node->output(0)->accessor() == nullptr)
+                if(n_node->output(0)->accessor() == nullptr)
                 {
                     // Get driving nodes of activation node
                     std::vector<NodeIdxPair> act_driving_nodes = get_driving_nodes(*act_node);
 
                     // Set activation info to batch normalization
-                    bn_node->set_fused_activation(act_node->activation_info());
+                    n_node->set_fused_activation(act_node->activation_info());
 
                     // Extract activation node accessor if any
                     auto act_node_accessor = act_node->output(0)->extract_accessor();
@@ -86,15 +84,15 @@
                     // Update batch normalization node outputs
                     for(auto &driving_node : act_driving_nodes)
                     {
-                        g.add_connection(bn_node->id(), 0, driving_node.node_id, driving_node.index);
+                        g.add_connection(n_node->id(), 0, driving_node.node_id, driving_node.index);
                     }
 
                     // Update accessor to batch normalization node
-                    bn_node->output(0)->set_accessor(std::move(act_node_accessor));
+                    n_node->output(0)->set_accessor(std::move(act_node_accessor));
                 }
                 else
                 {
-                    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion as batch normalization node has an output accessor\n");
+                    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n");
                 }
             }
         }
@@ -109,7 +107,11 @@
 
 void NodeFusionMutator::mutate(Graph &g)
 {
-    detail::fuse_batch_norm_with_activation(g);
+    // Supported activations when fusing
+    const std::set<Activation> supported_fused_activations = { Activation::RELU, Activation::BOUNDED_RELU, Activation::LU_BOUNDED_RELU };
+
+    detail::fuse_node_with_activation<BatchNormalizationLayerNode>(g, supported_fused_activations);
+    detail::fuse_node_with_activation<ConvolutionLayerNode>(g, supported_fused_activations);
 }
 } // namespace graph
 } // namespace arm_compute
diff --git a/src/graph/nodes/BatchNormalizationLayerNode.cpp b/src/graph/nodes/BatchNormalizationLayerNode.cpp
index 3ae11fc..3d392bd 100644
--- a/src/graph/nodes/BatchNormalizationLayerNode.cpp
+++ b/src/graph/nodes/BatchNormalizationLayerNode.cpp
@@ -78,7 +78,7 @@
 
 NodeType BatchNormalizationLayerNode::type() const
 {
-    return NodeType::BatchNormalizationLayer;
+    return BatchNormalizationLayerNode::node_type;
 }
 
 void BatchNormalizationLayerNode::accept(INodeVisitor &v)
diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp
index e9cb039..15c7ff6 100644
--- a/src/graph/nodes/ConvolutionLayerNode.cpp
+++ b/src/graph/nodes/ConvolutionLayerNode.cpp
@@ -37,7 +37,7 @@
                                            ConvolutionMethod method,
                                            FastMathHint      fast_math_hint,
                                            QuantizationInfo  out_quant_info)
-    : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(out_quant_info)
+    : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(out_quant_info), _fused_activation()
 {
     _input_edges.resize(3, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -73,6 +73,16 @@
     return _num_groups;
 }
 
+ActivationLayerInfo ConvolutionLayerNode::fused_activation() const
+{
+    return _fused_activation;
+}
+
+void ConvolutionLayerNode::set_fused_activation(ActivationLayerInfo fused_activation)
+{
+    _fused_activation = fused_activation;
+}
+
 TensorDescriptor ConvolutionLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
                                                                  const TensorDescriptor &weights_descriptor,
                                                                  const PadStrideInfo    &info)
@@ -126,7 +136,7 @@
 
 NodeType ConvolutionLayerNode::type() const
 {
-    return NodeType::ConvolutionLayer;
+    return ConvolutionLayerNode::node_type;
 }
 
 void ConvolutionLayerNode::accept(INodeVisitor &v)