COMPMID-915: Create ResNet50 example

ResidualLayer node (COMPMID-916) also created as required for the ResNet
architecture.

Change-Id: I4fb4d2e08a8d3ce206f96f7946f5afc3e244676a
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/121185
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/graph/Nodes.h b/arm_compute/graph/Nodes.h
index 4390c59..3009a24 100644
--- a/arm_compute/graph/Nodes.h
+++ b/arm_compute/graph/Nodes.h
@@ -39,6 +39,7 @@
 #include "arm_compute/graph/nodes/PoolingLayer.h"
 #include "arm_compute/graph/nodes/QuantizationLayer.h"
 #include "arm_compute/graph/nodes/ReshapeLayer.h"
+#include "arm_compute/graph/nodes/ResidualLayer.h"
 #include "arm_compute/graph/nodes/SoftmaxLayer.h"
 
 #endif /* __ARM_COMPUTE_GRAPH_NODES_H__ */
diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h
index 3ec0c7a..a5d6ae8 100644
--- a/arm_compute/graph/Types.h
+++ b/arm_compute/graph/Types.h
@@ -91,6 +91,7 @@
 enum class OperationType
 {
     ActivationLayer,
+    ArithmeticAddition,
     BatchNormalizationLayer,
     ConvolutionLayer,
     DepthConvertLayer,
diff --git a/arm_compute/graph/nodes/ResidualLayer.h b/arm_compute/graph/nodes/ResidualLayer.h
new file mode 100644
index 0000000..1eecf6f
--- /dev/null
+++ b/arm_compute/graph/nodes/ResidualLayer.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_RESIDUAL_LAYER_H__
+#define __ARM_COMPUTE_GRAPH_RESIDUAL_LAYER_H__
+
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/ITensorObject.h"
+#include "arm_compute/graph/SubGraph.h"
+#include "arm_compute/graph/Types.h"
+
+#include "arm_compute/core/utils/misc/utility.h"
+
+#include <vector>
+
+namespace arm_compute
+{
+namespace graph
+{
+/** Residual Layer node: adds together the outputs of its sub-graph branches */
+class ResidualLayer final : public INode
+{
+public:
+    /** Construct a residual layer from two branches whose outputs are added together
+     *
+     * Both sub-graphs are moved into the node, which takes ownership of them.
+     * @param[in] sub_graph1 First graph branch
+     * @param[in] sub_graph2 Second graph branch
+     */
+    ResidualLayer(SubGraph &&sub_graph1, SubGraph &&sub_graph2)
+        : _sub_graphs()
+    {
+        _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph1)));
+        _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph2)));
+    }
+    /** Construct a residual layer from one branch whose output is added to the node input
+     *
+     * The sub-graph is moved into the node, which takes ownership of it.
+     * @param[in] sub_graph Sub graph
+     */
+    ResidualLayer(SubGraph &&sub_graph)
+        : _sub_graphs()
+    {
+        _sub_graphs.push_back(arm_compute::support::cpp14::make_unique<SubGraph>(std::move(sub_graph)));
+    }
+
+    // Inherited methods overridden:
+    std::unique_ptr<arm_compute::IFunction> instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output) override;
+
+private:
+    std::vector<std::unique_ptr<SubGraph>> _sub_graphs;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_RESIDUAL_LAYER_H__ */
diff --git a/examples/graph_resnet50.cpp b/examples/graph_resnet50.cpp
new file mode 100644
index 0000000..88f58bf
--- /dev/null
+++ b/examples/graph_resnet50.cpp
@@ -0,0 +1,233 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Nodes.h"
+#include "support/ToolchainSupport.h"
+#include "utils/GraphUtils.h"
+#include "utils/Utils.h"
+
+#include <cstdlib>
+
+using namespace arm_compute::utils;
+using namespace arm_compute::graph;
+using namespace arm_compute::graph_utils;
+
+/** Example demonstrating how to implement Microsoft's ResNet50 network using the Compute Library's graph API
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels )
+ */
+class GraphResNet50Example : public Example
+{
+public:
+    void do_setup(int argc, char **argv) override
+    {
+        std::string data_path; /* Path to the trainable data */
+        std::string image;     /* Image data */
+        std::string label;     /* Label data */
+
+        // Create a preprocessor object from the per-channel mean values below
+        const std::array<float, 3> mean_rgb{ { 122.68f, 116.67f, 104.01f } };
+        std::unique_ptr<IPreprocessor> preprocessor = arm_compute::support::cpp14::make_unique<CaffePreproccessor>(mean_rgb,
+                                                                                                                   false /* Do not convert to BGR */);
+
+        // Set target. 0 (NEON), 1 (OpenCL), 2 (OpenCL with Tuner). By default it is NEON
+        const int  int_target_hint = argc > 1 ? std::strtol(argv[1], nullptr, 10) : 0;
+        TargetHint target_hint     = set_target_hint(int_target_hint);
+
+        // Parse the optional positional arguments; each missing one is reported and its string is left empty
+        if(argc < 2)
+        {
+            // Print the full usage line: no argument at all was given
+            std::cout << "Usage: " << argv[0] << " [target] [path_to_data] [image] [labels]\n\n";
+            std::cout << "No data folder provided: using random values\n\n";
+        }
+        else if(argc == 2)
+        {
+            std::cout << "Usage: " << argv[0] << " " << argv[1] << " [path_to_data] [image] [labels]\n\n";
+            std::cout << "No data folder provided: using random values\n\n";
+        }
+        else if(argc == 3)
+        {
+            data_path = argv[2];
+            std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " [image] [labels]\n\n";
+            std::cout << "No image provided: using random values\n\n";
+        }
+        else if(argc == 4)
+        {
+            data_path = argv[2];
+            image     = argv[3];
+            std::cout << "Usage: " << argv[0] << " " << argv[1] << " " << argv[2] << " " << argv[3] << " [labels]\n\n";
+            std::cout << "No text file with labels provided: skipping output accessor\n\n";
+        }
+        else
+        {
+            data_path = argv[2];
+            image     = argv[3];
+            label     = argv[4];
+        }
+
+        // Initialize the graph, enabling the CL tuner only when target 2 was requested
+        graph.graph_init(int_target_hint == 2);
+
+        graph << target_hint
+              << Tensor(TensorInfo(TensorShape(224U, 224U, 3U, 1U), 1, DataType::F32),
+                        get_input_accessor(image, std::move(preprocessor), false /* Do not convert to BGR */))
+              << ConvolutionLayer(
+                  7U, 7U, 64U,
+                  get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_weights.npy"),
+                  std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+                  PadStrideInfo(2, 2, 3, 3))
+              << BatchNormalizationLayer(
+                  get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_moving_mean.npy"),
+                  get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_moving_variance.npy"),
+                  get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_gamma.npy"),
+                  get_weights_accessor(data_path, "/cnn_data/resnet50_model/conv1_BatchNorm_beta.npy"),
+                  0.0000100099996416f)
+              << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+              << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 3, PadStrideInfo(2, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR)));
+
+        add_residual_block(data_path, "block1", 64, 3, 2);
+        add_residual_block(data_path, "block2", 128, 4, 2);
+        add_residual_block(data_path, "block3", 256, 6, 2);
+        add_residual_block(data_path, "block4", 512, 3, 1);
+
+        graph << PoolingLayer(PoolingLayerInfo(PoolingType::AVG))
+              << ConvolutionLayer(
+                  1U, 1U, 1000U,
+                  get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_weights.npy"),
+                  get_weights_accessor(data_path, "/cnn_data/resnet50_model/logits_biases.npy"),
+                  PadStrideInfo(1, 1, 0, 0))
+              << FlattenLayer()
+              << SoftmaxLayer()
+              << Tensor(get_output_accessor(label, 5));
+    }
+    void do_run() override
+    {
+        // Execute the graph assembled in do_setup()
+        graph.run();
+    }
+
+private:
+    Graph graph{};
+
+    /** Append @p num_units bottleneck residual units (1x1 -> 3x3 -> 1x1 convolutions, the last with 4 * @p base_depth
+     *  outputs, added to a shortcut) to the graph. Only the last unit applies @p stride; @p name selects the weight
+     *  files of the block and @p data_path is forwarded to get_weights_accessor().
+     */
+    void add_residual_block(const std::string &data_path, const std::string &name, unsigned int base_depth, unsigned int num_units, unsigned int stride)
+    {
+        for(unsigned int i = 0; i < num_units; ++i)
+        {
+            std::stringstream unit;
+            unit << "/cnn_data/resnet50_model/" << name << "_unit_" << (i + 1) << "_bottleneck_v1_";
+            std::string unit_name = unit.str();
+
+            // Only the last unit of the block subsamples; every other unit keeps stride 1
+            const unsigned int middle_stride = (i == (num_units - 1)) ? stride : 1;
+
+            SubGraph right;
+            right << ConvolutionLayer(
+                      1U, 1U, base_depth,
+                      get_weights_accessor(data_path, unit_name + "conv1_weights.npy"),
+                      std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+                      PadStrideInfo(1, 1, 0, 0))
+                  << BatchNormalizationLayer(
+                      get_weights_accessor(data_path, unit_name + "conv1_BatchNorm_moving_mean.npy"),
+                      get_weights_accessor(data_path, unit_name + "conv1_BatchNorm_moving_variance.npy"),
+                      get_weights_accessor(data_path, unit_name + "conv1_BatchNorm_gamma.npy"),
+                      get_weights_accessor(data_path, unit_name + "conv1_BatchNorm_beta.npy"),
+                      0.0000100099996416f)
+                  << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+
+                  << ConvolutionLayer(
+                      3U, 3U, base_depth,
+                      get_weights_accessor(data_path, unit_name + "conv2_weights.npy"),
+                      std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+                      PadStrideInfo(middle_stride, middle_stride, 1, 1))
+                  << BatchNormalizationLayer(
+                      get_weights_accessor(data_path, unit_name + "conv2_BatchNorm_moving_mean.npy"),
+                      get_weights_accessor(data_path, unit_name + "conv2_BatchNorm_moving_variance.npy"),
+                      get_weights_accessor(data_path, unit_name + "conv2_BatchNorm_gamma.npy"),
+                      get_weights_accessor(data_path, unit_name + "conv2_BatchNorm_beta.npy"),
+                      0.0000100099996416f)
+                  << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU))
+
+                  << ConvolutionLayer(
+                      1U, 1U, base_depth * 4,
+                      get_weights_accessor(data_path, unit_name + "conv3_weights.npy"),
+                      std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+                      PadStrideInfo(1, 1, 0, 0))
+                  << BatchNormalizationLayer(
+                      get_weights_accessor(data_path, unit_name + "conv3_BatchNorm_moving_mean.npy"),
+                      get_weights_accessor(data_path, unit_name + "conv3_BatchNorm_moving_variance.npy"),
+                      get_weights_accessor(data_path, unit_name + "conv3_BatchNorm_gamma.npy"),
+                      get_weights_accessor(data_path, unit_name + "conv3_BatchNorm_beta.npy"),
+                      0.0000100099996416f);
+
+            if(i == 0)
+            {
+                SubGraph left;
+                left << ConvolutionLayer(
+                         1U, 1U, base_depth * 4,
+                         get_weights_accessor(data_path, unit_name + "shortcut_weights.npy"),
+                         std::unique_ptr<arm_compute::graph::ITensorAccessor>(nullptr),
+                         PadStrideInfo(middle_stride, middle_stride, 0, 0)) // Subsample like the main branch when the first unit is also the last
+                     << BatchNormalizationLayer(
+                         get_weights_accessor(data_path, unit_name + "shortcut_BatchNorm_moving_mean.npy"),
+                         get_weights_accessor(data_path, unit_name + "shortcut_BatchNorm_moving_variance.npy"),
+                         get_weights_accessor(data_path, unit_name + "shortcut_BatchNorm_gamma.npy"),
+                         get_weights_accessor(data_path, unit_name + "shortcut_BatchNorm_beta.npy"),
+                         0.0000100099996416f);
+
+                graph << ResidualLayer(std::move(left), std::move(right));
+            }
+            else if(middle_stride > 1)
+            {
+                SubGraph left;
+                left << PoolingLayer(PoolingLayerInfo(PoolingType::MAX, 1, PadStrideInfo(middle_stride, middle_stride, 0, 0), true))
+                     // TODO (alegil01) : Remove once we understand why a single node graph does not run in CL
+                     << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, 1.f, 0.f));
+
+                graph << ResidualLayer(std::move(left), std::move(right));
+            }
+            else
+            {
+                graph << ResidualLayer(std::move(right));
+            }
+
+            graph << ActivationLayer(ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+        }
+    }
+};
+
+/** Main program for ResNet50: forwards the command line to run_example() with GraphResNet50Example
+ *
+ * @param[in] argc Number of arguments
+ * @param[in] argv Arguments ( [optional] Target (0 = NEON, 1 = OpenCL, 2 = OpenCL with Tuner), [optional] Path to the weights folder, [optional] image, [optional] labels )
+ */
+int main(int argc, char **argv)
+{
+    return arm_compute::utils::run_example<GraphResNet50Example>(argc, argv);
+}
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 98d9590..b6c6822 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -77,6 +77,7 @@
 Graph::Graph()
     : _pimpl{ new Private() }
 {
+    graph_init(); // Initialise on construction. NOTE(review): graph_resnet50.cpp calls graph_init() again afterwards - confirm a second call is safe
 }
 
 void Graph::graph_init(const bool use_cl_tuner)
diff --git a/src/graph/nodes/ResidualLayer.cpp b/src/graph/nodes/ResidualLayer.cpp
new file mode 100644
index 0000000..87404f9
--- /dev/null
+++ b/src/graph/nodes/ResidualLayer.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (c) 2017-2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/ResidualLayer.h"
+
+#include "arm_compute/graph/Error.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/NodeContext.h"
+#include "arm_compute/graph/OperationRegistry.h"
+#include "arm_compute/graph/SubGraph.h"
+#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "support/ToolchainSupport.h"
+#include "utils/Utils.h"
+
+#include <memory>
+#include <tuple>
+#include <vector>
+
+using namespace arm_compute::graph;
+
+/** Residual function: runs the registered sub-graphs and then the arithmetic addition configured over their outputs */
+class ResidualFunction final : public arm_compute::IFunction
+{
+public:
+    /** Constructor: keeps a copy of the graph context and a non-owning pointer to the output tensor object */
+    ResidualFunction(GraphContext &ctx, ITensorObject *output)
+        : _ctx(ctx), _input(nullptr), _output(output), _func(nullptr), _graphs(), _graph_outputs()
+    {
+    }
+
+    /** Prevent instances from being copy constructed */
+    ResidualFunction(const ResidualFunction &) = delete;
+    /** Prevent instances from being copy assigned */
+    const ResidualFunction &operator=(const ResidualFunction &) = delete;
+    /** Prevent instances from being move constructed */
+    ResidualFunction(ResidualFunction &&) = delete;
+    /** Prevent instances from being move assigned */
+    ResidualFunction &operator=(ResidualFunction &&) = delete;
+    /** Default destructor */
+    ~ResidualFunction() override = default;
+
+    /** Set the input (when using only one sub graph)
+     *
+     * @param[in] input Input to set; ownership is transferred to the function
+     */
+    void set_input(std::unique_ptr<ITensorObject> input)
+    {
+        _input = std::move(input);
+    }
+
+    /** Registers graph to be executed by the residual function
+     *
+     * @param[in] graph  Graph to register; ownership is transferred to the function
+     * @param[in] output Output to register; used as an input of the addition in configure()
+     */
+    void register_graph(std::unique_ptr<Graph> graph, std::unique_ptr<ITensorObject> output)
+    {
+        _graphs.push_back(std::move(graph));
+        _graph_outputs.push_back(std::move(output));
+    }
+
+    /** Configure the addition over the stored input (single-graph case) and the registered graph outputs, then allocate those outputs */
+    void configure()
+    {
+        ARM_COMPUTE_ERROR_ON(_graphs.size() < 1 || _graphs.size() > 2);
+        TargetHint target_hint = _ctx.hints().target_hint();
+
+        // Create the node context describing the addition for the hinted target
+        NodeContext node_ctx(OperationType::ArithmeticAddition);
+        node_ctx.set_target(target_hint);
+
+        if(_graphs.size() == 1)
+        {
+            arm_compute::ITensor *in = _input->tensor();
+            node_ctx.add_input(in);
+        }
+
+        for(auto &o : _graph_outputs)
+        {
+            arm_compute::ITensor *in = o->tensor();
+            node_ctx.add_input(in);
+        }
+
+        arm_compute::ITensor *out = _output->tensor();
+        auto_init_if_empty(*out->info(), *_graph_outputs[0]->tensor()->info());
+        node_ctx.add_output(out);
+
+        _func = OperationRegistry::get().find_operation(OperationType::ArithmeticAddition, target_hint)->configure(node_ctx);
+
+        for(auto &o : _graph_outputs)
+        {
+            o->allocate();
+        }
+    }
+
+    // Inherited methods overridden:
+    void run() override
+    {
+        ARM_COMPUTE_ERROR_ON(_graphs.size() < 1 || _graphs.size() > 2);
+
+        for(auto &g : _graphs)
+        {
+            ARM_COMPUTE_ERROR_ON(g.get() == nullptr);
+            g->run();
+        }
+
+        _func->run();
+    }
+
+private:
+    GraphContext                                _ctx;
+    std::unique_ptr<ITensorObject>              _input;
+    ITensorObject                              *_output;
+    std::unique_ptr<arm_compute::IFunction>     _func;
+    std::vector<std::unique_ptr<Graph>>         _graphs;
+    std::vector<std::unique_ptr<ITensorObject>> _graph_outputs;
+};
+
+std::unique_ptr<arm_compute::IFunction> ResidualLayer::instantiate_node(GraphContext &ctx, ITensorObject *input, ITensorObject *output)
+{
+    ARM_COMPUTE_ERROR_ON_UNALLOCATED_TENSOR_OBJECT(input, output);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(input) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<Tensor *>(output) == nullptr);
+
+    // Create the residual function that will own the sub-graphs and perform the addition
+    auto func = arm_compute::support::cpp14::make_unique<ResidualFunction>(ctx, output);
+
+    if(_sub_graphs.size() == 1)
+    {
+        std::unique_ptr<ITensorObject> original_in;
+        original_in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
+                                                                          input->tensor()->info()->tensor_shape(),
+                                                                          Coordinates());
+        func->set_input(std::move(original_in));
+    }
+
+    // Construct all sub-graphs given the input/output
+    for(auto &sg : _sub_graphs)
+    {
+        ARM_COMPUTE_ERROR_ON(sg.get() == nullptr);
+
+        // IO buffers: sub-graph input, sub-graph output and the tensor handed to the addition
+        std::unique_ptr<ITensorObject> in;
+        std::unique_ptr<ITensorObject> out;
+        std::unique_ptr<ITensorObject> func_in;
+
+        // Create input sub-tensor spanning the full shape of the node input when the sub-graph has no input of its own
+        if(!sg->has_input())
+        {
+            in = arm_compute::support::cpp14::make_unique<SubTensor>(*dynamic_cast<Tensor *>(input),
+                                                                     input->tensor()->info()->tensor_shape(),
+                                                                     Coordinates());
+        }
+
+        // Create an intermediate tensor for the branch result and a sub-tensor over it to act as the sub-graph output
+        if(!sg->has_output())
+        {
+            ITensorInfo *info = input->tensor()->info();
+            func_in           = arm_compute::support::cpp14::make_unique<Tensor>(TensorInfo(info->num_channels(), info->data_type(), info->fixed_point_position()));
+            func_in->set_target(ctx.hints().target_hint());
+            out = arm_compute::support::cpp14::make_unique<SubTensor>(func_in->tensor(),
+                                                                      TensorShape(),
+                                                                      Coordinates(0, 0, 0),
+                                                                      func_in->target(),
+                                                                      true);
+        }
+
+        // Construct sub_graph
+        auto g = sg->construct(ctx, std::move(in), std::move(out));
+
+        // Register graph to function. NOTE(review): func_in stays null if the sub-graph already has an output - confirm configure() never sees that case
+        func->register_graph(std::move(g), std::move(func_in));
+    }
+
+    func->configure();
+
+    return std::move(func);
+}
diff --git a/src/graph/operations/CLSimpleOperations.cpp b/src/graph/operations/CLSimpleOperations.cpp
index 94e3fe1..fe56122 100644
--- a/src/graph/operations/CLSimpleOperations.cpp
+++ b/src/graph/operations/CLSimpleOperations.cpp
@@ -66,6 +66,34 @@
     return std::move(activation);
 }
 
+/* Arithmetic addition: expects exactly two inputs and one output, configured with ConvertPolicy::SATURATE */
+REGISTER_SIMPLE_OPERATION(CLArithmeticAdditionOperation, OPENCL, OperationType::ArithmeticAddition)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0)) == nullptr);
+
+    // Extract the two inputs and the output as OpenCL tensors
+    auto *in1 = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(0));
+    auto *in2 = dynamic_cast<arm_compute::ICLTensor *>(ctx.input(1));
+    auto *out = dynamic_cast<arm_compute::ICLTensor *>(ctx.output(0));
+
+    auto addition = arm_compute::support::cpp14::make_unique<arm_compute::CLArithmeticAddition>();
+    addition->configure(in1, in2, out, ConvertPolicy::SATURATE);
+
+    // Log the data type and the input/output shapes
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating CLArithmeticAddition"
+                               << " Data Type: " << in1->info()->data_type()
+                               << " Input 1 shape: " << in1->info()->tensor_shape()
+                               << " Input 2 shape: " << in2->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(addition);
+}
+
 /* Batch Normalization Layer */
 REGISTER_SIMPLE_OPERATION(CLBatchNormalizationLayerOperation, OPENCL, OperationType::BatchNormalizationLayer)
 {
@@ -464,4 +492,4 @@
                                << std::endl);
 
     return std::move(smx);
-}
\ No newline at end of file
+}
diff --git a/src/graph/operations/NESimpleOperations.cpp b/src/graph/operations/NESimpleOperations.cpp
index 265bed6..4154b9a 100644
--- a/src/graph/operations/NESimpleOperations.cpp
+++ b/src/graph/operations/NESimpleOperations.cpp
@@ -66,6 +66,34 @@
     return std::move(activation);
 }
 
+/* Arithmetic addition: expects exactly two inputs and one output, configured with ConvertPolicy::SATURATE */
+REGISTER_SIMPLE_OPERATION(NEArithmeticAdditionOperation, NEON, OperationType::ArithmeticAddition)
+{
+    ARM_COMPUTE_ERROR_ON(ctx.num_inputs() != 2);
+    ARM_COMPUTE_ERROR_ON(ctx.num_outputs() != 1);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(0)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.input(1)) == nullptr);
+    ARM_COMPUTE_ERROR_ON(dynamic_cast<arm_compute::ITensor *>(ctx.output(0)) == nullptr);
+
+    // Extract the two inputs and the output tensors
+    auto *in1 = dynamic_cast<arm_compute::ITensor *>(ctx.input(0));
+    auto *in2 = dynamic_cast<arm_compute::ITensor *>(ctx.input(1));
+    auto *out = dynamic_cast<arm_compute::ITensor *>(ctx.output(0));
+
+    auto addition = arm_compute::support::cpp14::make_unique<arm_compute::NEArithmeticAddition>();
+    addition->configure(in1, in2, out, ConvertPolicy::SATURATE);
+
+    // Log the data type and the input/output shapes
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiating NEArithmeticAddition"
+                               << " Data Type: " << in1->info()->data_type()
+                               << " Input 1 shape: " << in1->info()->tensor_shape()
+                               << " Input 2 shape: " << in2->info()->tensor_shape()
+                               << " Output shape: " << out->info()->tensor_shape()
+                               << std::endl);
+
+    return std::move(addition);
+}
+
 /* Batch Normalization Layer */
 REGISTER_SIMPLE_OPERATION(NEBatchNormalizationLayerOperation, NEON, OperationType::BatchNormalizationLayer)
 {
@@ -464,4 +492,4 @@
                                << std::endl);
 
     return std::move(smx);
-}
\ No newline at end of file
+}