COMPMID-1808: Add Detection Output Layer to the GraphAPI
COMPMID-1710: Integrate Detection output in MobilenetSSD graph example

Change-Id: I384d1eb492ef14ece58f2023ad7bbc16f834450b
Reviewed-on: https://review.mlplatform.org/356
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/graph/GraphBuilder.h b/arm_compute/graph/GraphBuilder.h
index 33a13f1..cb905e7 100644
--- a/arm_compute/graph/GraphBuilder.h
+++ b/arm_compute/graph/GraphBuilder.h
@@ -201,6 +201,18 @@
      * @return Node ID of the created node, EmptyNodeID in case of error
      */
     static NodeID add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation);
+    /** Adds a detection output layer node to the graph
+     *
+     * @param[in] g              Graph to add the node to
+     * @param[in] params         Common node parameters
+     * @param[in] input_loc      Location input to the detection output layer node as a NodeID-Index pair (connected to node input 0)
+     * @param[in] input_conf     Confidence input to the detection output layer node as a NodeID-Index pair (connected to node input 1)
+     * @param[in] input_priorbox PriorBox input to the detection output layer node as a NodeID-Index pair (connected to node input 2)
+     * @param[in] detect_info    Detection output layer parameters
+     *
+     * @return Node ID of the created node, EmptyNodeID in case of error
+     */
+    static NodeID add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, DetectionOutputLayerInfo detect_info);
     /** Adds a Dummy node to the graph
      *
      * @note this node if for debugging purposes. Just alters the shape of the graph pipeline as requested.
diff --git a/arm_compute/graph/INodeVisitor.h b/arm_compute/graph/INodeVisitor.h
index 2df2574..573d642 100644
--- a/arm_compute/graph/INodeVisitor.h
+++ b/arm_compute/graph/INodeVisitor.h
@@ -71,6 +71,11 @@
      * @param[in] n Node to visit.
      */
     virtual void visit(DepthwiseConvolutionLayerNode &n) = 0;
+    /** Visit DetectionOutputLayerNode.
+     *
+     * @param[in] n Node to visit.
+     */
+    virtual void visit(DetectionOutputLayerNode &n) = 0;
     /** Visit EltwiseLayerNode.
      *
      * @param[in] n Node to visit.
@@ -170,6 +175,10 @@
     {
         default_visit();
     }
+    virtual void visit(DetectionOutputLayerNode &n) override
+    {
+        default_visit();
+    }
     virtual void visit(DepthwiseConvolutionLayerNode &n) override
     {
         default_visit();
diff --git a/arm_compute/graph/TypePrinter.h b/arm_compute/graph/TypePrinter.h
index d633091..e33c984 100644
--- a/arm_compute/graph/TypePrinter.h
+++ b/arm_compute/graph/TypePrinter.h
@@ -83,6 +83,9 @@
         case NodeType::DeconvolutionLayer:
             os << "DeconvolutionLayer";
             break;
+        case NodeType::DetectionOutputLayer:
+            os << "DetectionOutputLayer";
+            break;
         case NodeType::DepthwiseConvolutionLayer:
             os << "DepthwiseConvolutionLayer";
             break;
diff --git a/arm_compute/graph/Types.h b/arm_compute/graph/Types.h
index b6803c8..60fe0a8 100644
--- a/arm_compute/graph/Types.h
+++ b/arm_compute/graph/Types.h
@@ -45,6 +45,7 @@
 using arm_compute::PermutationVector;
 
 using arm_compute::ActivationLayerInfo;
+using arm_compute::DetectionOutputLayerInfo;
 using arm_compute::NormType;
 using arm_compute::NormalizationLayerInfo;
 using arm_compute::FullyConnectedLayerInfo;
@@ -133,6 +134,7 @@
     ConvolutionLayer,
     DeconvolutionLayer,
     DepthwiseConvolutionLayer,
+    DetectionOutputLayer,
     EltwiseLayer,
     FlattenLayer,
     FullyConnectedLayer,
diff --git a/arm_compute/graph/backends/FunctionHelpers.h b/arm_compute/graph/backends/FunctionHelpers.h
index 3e71e39..96adffe 100644
--- a/arm_compute/graph/backends/FunctionHelpers.h
+++ b/arm_compute/graph/backends/FunctionHelpers.h
@@ -489,6 +489,51 @@
     return func;
 }
 
+/** Create a backend detection output layer function
+ *
+ * @tparam DetectionOutputLayerFunction Backend detection output function
+ * @tparam TargetInfo                   Target-specific information
+ *
+ * @param[in] node Node to create the backend function for
+ *
+ * @return Backend detection output layer function
+ */
+template <typename DetectionOutputLayerFunction, typename TargetInfo>
+std::unique_ptr<IFunction> create_detection_output_layer(DetectionOutputLayerNode &node)
+{
+    validate_node<TargetInfo>(node, 3 /* expected inputs */, 1 /* expected outputs */);
+
+    // Extract IO and info
+    typename TargetInfo::TensorType *input0      = get_backing_tensor<TargetInfo>(node.input(0));
+    typename TargetInfo::TensorType *input1      = get_backing_tensor<TargetInfo>(node.input(1));
+    typename TargetInfo::TensorType *input2      = get_backing_tensor<TargetInfo>(node.input(2));
+    typename TargetInfo::TensorType *output      = get_backing_tensor<TargetInfo>(node.output(0));
+    const DetectionOutputLayerInfo   detect_info = node.detection_output_info();
+
+    ARM_COMPUTE_ERROR_ON(input0 == nullptr);
+    ARM_COMPUTE_ERROR_ON(input1 == nullptr);
+    ARM_COMPUTE_ERROR_ON(input2 == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure function
+    auto func = support::cpp14::make_unique<DetectionOutputLayerFunction>();
+    func->configure(input0, input1, input2, output, detect_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << node.type()
+                               << " Target: " << TargetInfo::TargetType
+                               << " Data Type: " << input0->info()->data_type()
+                               << " Input0 shape: " << input0->info()->tensor_shape()
+                               << " Input1 shape: " << input1->info()->tensor_shape()
+                               << " Input2 shape: " << input2->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << " DetectionOutputLayer info: " << detect_info
+                               << std::endl);
+
+    return std::move(func);
+}
 /** Create a backend element-wise operation layer function
  *
  * @tparam EltwiseFunctions Backend element-wise function
diff --git a/arm_compute/graph/backends/ValidateHelpers.h b/arm_compute/graph/backends/ValidateHelpers.h
index 75e2363..f1e5361 100644
--- a/arm_compute/graph/backends/ValidateHelpers.h
+++ b/arm_compute/graph/backends/ValidateHelpers.h
@@ -203,6 +203,30 @@
 
     return status;
 }
+/** Validates a detection output layer node
+ *
+ * @tparam DetectionOutputLayer DetectionOutput layer type
+ *
+ * @param[in] node Node to validate (expected to have 3 inputs and 1 output)
+ *
+ * @return Result of DetectionOutputLayer::validate() on the node's backing tensor infos and detection info
+ */
+template <typename DetectionOutputLayer>
+Status validate_detection_output_layer(DetectionOutputLayerNode &node)
+{
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Validating DetectionOutputLayer node with ID : " << node.id() << " and Name: " << node.name() << std::endl);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_inputs() != 3);
+    ARM_COMPUTE_RETURN_ERROR_ON(node.num_outputs() != 1);
+
+    // Extract IO and info
+    arm_compute::ITensorInfo      *input0      = get_backing_tensor_info(node.input(0));
+    arm_compute::ITensorInfo      *input1      = get_backing_tensor_info(node.input(1));
+    arm_compute::ITensorInfo      *input2      = get_backing_tensor_info(node.input(2));
+    arm_compute::ITensorInfo      *output      = get_backing_tensor_info(node.output(0));
+    const DetectionOutputLayerInfo detect_info = node.detection_output_info();
+
+    return DetectionOutputLayer::validate(input0, input1, input2, output, detect_info);
+}
 
 /** Validates a Generate Proposals layer node
  *
diff --git a/arm_compute/graph/frontend/Layers.h b/arm_compute/graph/frontend/Layers.h
index d070331..72353a2 100644
--- a/arm_compute/graph/frontend/Layers.h
+++ b/arm_compute/graph/frontend/Layers.h
@@ -458,7 +458,35 @@
     int                    _depth_multiplier;
     const QuantizationInfo _quant_info;
 };
+/** DetectionOutput Layer (the location input is the tail node of the stream the layer is added to) */
+class DetectionOutputLayer final : public ILayer
+{
+public:
+    /** Construct a detection output layer.
+     *
+     * @param[in] sub_stream_conf  Confidence graph sub-stream.
+     * @param[in] sub_stream_prior PriorBox graph sub-stream.
+     * @param[in] detect_info      DetectionOutput parameters.
+     */
+    DetectionOutputLayer(SubStream &&sub_stream_conf, SubStream &&sub_stream_prior, DetectionOutputLayerInfo detect_info)
+        : _ss_conf(std::move(sub_stream_conf)), _ss_prior(std::move(sub_stream_prior)), _detect_info(detect_info)
+    {
+    }
 
+    NodeID create_layer(IStream &s) override
+    {
+        NodeParams  common_params  = { name(), s.hints().target_hint };
+        NodeIdxPair input_loc      = { s.tail_node(), 0 };
+        NodeIdxPair input_conf     = { _ss_conf.tail_node(), 0 };
+        NodeIdxPair input_priorbox = { _ss_prior.tail_node(), 0 };
+        return GraphBuilder::add_detection_output_node(s.graph(), common_params, input_loc, input_conf, input_priorbox, _detect_info);
+    }
+
+private:
+    SubStream                _ss_conf;
+    SubStream                _ss_prior;
+    DetectionOutputLayerInfo _detect_info;
+};
 /** Dummy Layer */
 class DummyLayer final : public ILayer
 {
diff --git a/arm_compute/graph/nodes/DetectionOutputLayerNode.h b/arm_compute/graph/nodes/DetectionOutputLayerNode.h
new file mode 100644
index 0000000..da1b051
--- /dev/null
+++ b/arm_compute/graph/nodes/DetectionOutputLayerNode.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_GRAPH_DETECTION_OUTPUT_LAYER_NODE_H__
+#define __ARM_COMPUTE_GRAPH_DETECTION_OUTPUT_LAYER_NODE_H__
+
+#include "arm_compute/graph/INode.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+/** DetectionOutput Layer node */
+class DetectionOutputLayerNode final : public INode
+{
+public:
+    /** Constructor
+     *
+     * @param[in] detection_info DetectionOutput Layer information
+     */
+    DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info);
+    /** DetectionOutput metadata accessor
+     *
+     * @return DetectionOutput Layer info
+     */
+    DetectionOutputLayerInfo detection_output_info() const;
+    /** Computes the output descriptor of the detection output layer
+     *
+     * @param[in] input_descriptor Input descriptor
+     * @param[in] info             DetectionOutput operation attributes
+     *
+     * @return Output descriptor
+     */
+    static TensorDescriptor compute_output_descriptor(const TensorDescriptor &input_descriptor, const DetectionOutputLayerInfo &info);
+
+    // Inherited overridden methods:
+    NodeType         type() const override;
+    bool             forward_descriptors() override;
+    TensorDescriptor configure_output(size_t idx) const override;
+    void accept(INodeVisitor &v) override;
+
+private:
+    DetectionOutputLayerInfo _info;
+
+    // Each detection holds 7 values: the index of the image it belongs to, a label, a confidence and the bounding box coordinates xmin, ymin, xmax, ymax
+    static const int detection_size = 7;
+};
+} // namespace graph
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_GRAPH_DETECTION_OUTPUT_LAYER_NODE_H__ */
diff --git a/arm_compute/graph/nodes/Nodes.h b/arm_compute/graph/nodes/Nodes.h
index 5c7599f..c85c4dc 100644
--- a/arm_compute/graph/nodes/Nodes.h
+++ b/arm_compute/graph/nodes/Nodes.h
@@ -33,6 +33,7 @@
 #include "arm_compute/graph/nodes/ConvolutionLayerNode.h"
 #include "arm_compute/graph/nodes/DeconvolutionLayerNode.h"
 #include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h"
+#include "arm_compute/graph/nodes/DetectionOutputLayerNode.h"
 #include "arm_compute/graph/nodes/DummyNode.h"
 #include "arm_compute/graph/nodes/EltwiseLayerNode.h"
 #include "arm_compute/graph/nodes/FlattenLayerNode.h"
diff --git a/arm_compute/graph/nodes/NodesFwd.h b/arm_compute/graph/nodes/NodesFwd.h
index f956b54..542c129 100644
--- a/arm_compute/graph/nodes/NodesFwd.h
+++ b/arm_compute/graph/nodes/NodesFwd.h
@@ -39,6 +39,7 @@
 class ConvolutionLayerNode;
 class DeconvolutionLayerNode;
 class DepthwiseConvolutionLayerNode;
+class DetectionOutputLayerNode;
 class DummyNode;
 class EltwiseLayerNode;
 class FlattenLayerNode;
diff --git a/examples/graph_ssd_mobilenet.cpp b/examples/graph_ssd_mobilenet.cpp
index 95a4dcc..676c5e9 100644
--- a/examples/graph_ssd_mobilenet.cpp
+++ b/examples/graph_ssd_mobilenet.cpp
@@ -39,12 +39,9 @@
     GraphSSDMobilenetExample()
         : cmd_parser(), common_opts(cmd_parser), common_params(), graph(0, "MobileNetSSD")
     {
-        mbox_loc_opt = cmd_parser.add_option<SimpleOption<std::string>>("mbox_loc_opt", "");
-        mbox_loc_opt->set_help("Filename containing the reference values for the graph branch mbox_loc_opt.");
-        mbox_conf_flatten_opt = cmd_parser.add_option<SimpleOption<std::string>>("mbox_conf_flatten", "");
-        mbox_conf_flatten_opt->set_help("Filename containing the reference values for the graph branch mbox_conf_flatten.");
-        mbox_priorbox_opt = cmd_parser.add_option<SimpleOption<std::string>>("mbox_priorbox", "");
-        mbox_priorbox_opt->set_help("Filename containing the reference values for the graph branch mbox_priorbox.");
+        // Add topk option
+        keep_topk_opt = cmd_parser.add_option<SimpleOption<int>>("topk", 100);
+        keep_topk_opt->set_help("Top k detections results per image.");
     }
     GraphSSDMobilenetExample(const GraphSSDMobilenetExample &) = delete;
     GraphSSDMobilenetExample &operator=(const GraphSSDMobilenetExample &) = delete;
@@ -162,8 +159,6 @@
         mbox_loc << ConcatLayer(std::move(conv_11_mbox_loc), std::move(conv_13_mbox_loc), conv_14_2_mbox_loc, std::move(conv_15_2_mbox_loc),
                                 std::move(conv_16_2_mbox_loc), std::move(conv_17_2_mbox_loc));
 
-        mbox_loc << OutputLayer(get_npy_output_accessor(mbox_loc_opt->value(), TensorShape(7668U), DataType::F32));
-
         //mbox_conf
         SubStream conv_11_mbox_conf(conv_11);
         conv_11_mbox_conf << get_node_C(conv_11, data_path, "conv11_mbox_conf", 63, PadStrideInfo(1, 1, 0, 0));
@@ -190,8 +185,6 @@
         mbox_conf << SoftmaxLayer().set_name("mbox_conf/softmax");
         mbox_conf << FlattenLayer().set_name("mbox_conf/flat");
 
-        mbox_conf << OutputLayer(get_npy_output_accessor(mbox_conf_flatten_opt->value(), TensorShape(40257U), DataType::F32));
-
         const std::vector<float> priorbox_variances     = { 0.1f, 0.1f, 0.2f, 0.2f };
         const float              priorbox_offset        = 0.5f;
         const std::vector<float> priorbox_aspect_ratios = { 2.f, 3.f };
@@ -235,7 +228,19 @@
                           std::move(conv_11_mbox_priorbox), std::move(conv_13_mbox_priorbox), std::move(conv_14_2_mbox_priorbox),
                           std::move(conv_15_2_mbox_priorbox), std::move(conv_16_2_mbox_priorbox), std::move(conv_17_2_mbox_priorbox));
 
-        mbox_priorbox << OutputLayer(get_npy_output_accessor(mbox_priorbox_opt->value(), TensorShape(7668U, 2U, 1U), DataType::F32));
+        const int                          num_classes         = 21;
+        const bool                         share_location      = true;
+        const DetectionOutputLayerCodeType detection_type      = DetectionOutputLayerCodeType::CENTER_SIZE;
+        const int                          keep_top_k          = keep_topk_opt->value();
+        const float                        nms_threshold       = 0.45f;
+        const int                          label_id_background = 0;
+        const float                        conf_thrs           = 0.25f;
+        const int                          top_k               = 100;
+
+        SubStream detection_ouput(mbox_loc);
+        detection_ouput << DetectionOutputLayer(std::move(mbox_conf), std::move(mbox_priorbox),
+                                                DetectionOutputLayerInfo(num_classes, share_location, detection_type, keep_top_k, nms_threshold, top_k, label_id_background, conf_thrs));
+        detection_ouput << OutputLayer(get_detection_output_accessor(common_params, { tensor_shape }));
 
         // Finalize graph
         GraphConfig config;
@@ -256,13 +261,9 @@
 private:
     CommandLineParser  cmd_parser;
     CommonGraphOptions common_opts;
-
-    SimpleOption<std::string> *mbox_loc_opt{ nullptr };
-    SimpleOption<std::string> *mbox_conf_flatten_opt{ nullptr };
-    SimpleOption<std::string> *mbox_priorbox_opt{ nullptr };
-
-    CommonGraphParams common_params;
-    Stream            graph;
+    SimpleOption<int> *keep_topk_opt{ nullptr };
+    CommonGraphParams  common_params;
+    Stream             graph;
 
     ConcatLayer get_node_A(IStream &master_graph, const std::string &data_path, std::string &&param_path,
                            unsigned int  conv_filt,
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
index 3fc258d..d09002d 100644
--- a/src/graph/GraphBuilder.cpp
+++ b/src/graph/GraphBuilder.cpp
@@ -362,6 +362,22 @@
 
     return conv_nid;
 }
+NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, DetectionOutputLayerInfo detect_info)
+{
+    CHECK_NODEIDX_PAIR(input_loc, g);
+    CHECK_NODEIDX_PAIR(input_conf, g);
+    CHECK_NODEIDX_PAIR(input_priorbox, g);
+
+    // Create detection_output node and connect the location, confidence and priorbox inputs to its input indices 0, 1 and 2
+    NodeID detect_nid = g.add_node<DetectionOutputLayerNode>(detect_info);
+    g.add_connection(input_loc.node_id, input_loc.index, detect_nid, 0);
+    g.add_connection(input_conf.node_id, input_conf.index, detect_nid, 1);
+    g.add_connection(input_priorbox.node_id, input_priorbox.index, detect_nid, 2);
+
+    set_node_params(g, detect_nid, params);
+
+    return detect_nid;
+}
 
 NodeID GraphBuilder::add_dummy_node(Graph &g, NodeParams params, NodeIdxPair input, TensorShape shape)
 {
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index c37a137..5b329c0 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/backends/FunctionHelpers.h"
 #include "arm_compute/runtime/CL/CLFunctions.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
 
 using namespace arm_compute::utils::cast;
 
@@ -68,6 +69,94 @@
     using Subtraction    = CLArithmeticSubtraction;
     using Multiplication = CLPixelWiseMultiplication;
 };
+// TODO (isagot01): Remove once we support heterogeneous scheduling at function level
+/** Wrapper for the CPP Function in the OpenCL backend: maps the registered CL tensors, runs the wrapped function, then unmaps them **/
+class CPPWrapperFunction : public IFunction
+{
+public:
+    /* Default constructor: no tensors registered and no wrapped function set */
+    CPPWrapperFunction()
+        : _tensors(), _func(nullptr)
+    {
+    }
+
+    void run() override
+    {
+        for(auto &tensor : _tensors)
+        {
+            tensor->map(CLScheduler::get().queue());
+        }
+        _func->run(); // NOTE(review): _func is dereferenced without a null check; register_function() must have been called first
+
+        for(auto &tensor : _tensors)
+        {
+            tensor->unmap(CLScheduler::get().queue());
+        }
+    }
+
+    void register_tensor(ICLTensor *tensor)
+    {
+        _tensors.push_back(tensor);
+    }
+
+    void register_function(std::unique_ptr<IFunction> function)
+    {
+        _func = std::move(function);
+    }
+
+private:
+    std::vector<arm_compute::ICLTensor *> _tensors;
+    std::unique_ptr<IFunction>            _func;
+};
+
+namespace detail
+{
+// Specialized functions
+template <>
+std::unique_ptr<IFunction> create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(DetectionOutputLayerNode &node)
+{
+    validate_node<CLTargetInfo>(node, 3 /* expected inputs */, 1 /* expected outputs */);
+
+    // Extract IO and info
+    CLTargetInfo::TensorType      *input0      = get_backing_tensor<CLTargetInfo>(node.input(0));
+    CLTargetInfo::TensorType      *input1      = get_backing_tensor<CLTargetInfo>(node.input(1));
+    CLTargetInfo::TensorType      *input2      = get_backing_tensor<CLTargetInfo>(node.input(2));
+    CLTargetInfo::TensorType      *output      = get_backing_tensor<CLTargetInfo>(node.output(0));
+    const DetectionOutputLayerInfo detect_info = node.detection_output_info();
+
+    ARM_COMPUTE_ERROR_ON(input0 == nullptr);
+    ARM_COMPUTE_ERROR_ON(input1 == nullptr);
+    ARM_COMPUTE_ERROR_ON(input2 == nullptr);
+    ARM_COMPUTE_ERROR_ON(output == nullptr);
+
+    // Create and configure the CPP function directly on the CL tensors
+    auto func = support::cpp14::make_unique<CPPDetectionOutputLayer>();
+    func->configure(input0, input1, input2, output, detect_info);
+
+    // Log info
+    ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
+                               << node.name()
+                               << " Type: " << node.type()
+                               << " Target: " << CLTargetInfo::TargetType
+                               << " Data Type: " << input0->info()->data_type()
+                               << " Input0 shape: " << input0->info()->tensor_shape()
+                               << " Input1 shape: " << input1->info()->tensor_shape()
+                               << " Input2 shape: " << input2->info()->tensor_shape()
+                               << " Output shape: " << output->info()->tensor_shape()
+                               << " DetectionOutputLayer info: " << detect_info
+                               << std::endl);
+
+    // Wrap the CPP function and register every CL tensor it touches with the wrapper
+    auto wrap_function = support::cpp14::make_unique<CPPWrapperFunction>();
+    wrap_function->register_function(std::move(func));
+    wrap_function->register_tensor(input0);
+    wrap_function->register_tensor(input1);
+    wrap_function->register_tensor(input2);
+    wrap_function->register_tensor(output);
+
+    return std::move(wrap_function);
+}
+} // namespace detail
 
 std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &ctx)
 {
@@ -95,6 +184,8 @@
             return detail::create_concatenate_layer<CLConcatenateLayer, CLTargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
             return detail::create_depthwise_convolution_layer<CLDepthwiseConvolutionLayerFunctions, CLTargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::DetectionOutputLayer:
+            return detail::create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::EltwiseLayer:
             return detail::create_eltwise_layer<CLEltwiseFunctions, CLTargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
         case NodeType::FlattenLayer:
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index a070973..85ac1f5 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -28,6 +28,7 @@
 
 #include "arm_compute/core/utils/misc/Cast.h"
 #include "arm_compute/runtime/CL/CLFunctions.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
 
 using namespace arm_compute::utils::cast;
 
@@ -59,6 +60,8 @@
         case NodeType::DepthwiseConvolutionLayer:
             return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer,
                    CLDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::DetectionOutputLayer:
+            return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::GenerateProposalsLayer:
             return detail::validate_generate_proposals_layer<CLGenerateProposalsLayer>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node));
         case NodeType::NormalizePlanarYUVLayer:
diff --git a/src/graph/backends/GLES/GCNodeValidator.cpp b/src/graph/backends/GLES/GCNodeValidator.cpp
index fe69c7a..95bb44f 100644
--- a/src/graph/backends/GLES/GCNodeValidator.cpp
+++ b/src/graph/backends/GLES/GCNodeValidator.cpp
@@ -111,6 +111,8 @@
             return validate_convolution_layer(*polymorphic_downcast<ConvolutionLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
             return validate_depthwise_convolution_layer(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::DetectionOutputLayer:
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : DetectionOutputLayer");
         case NodeType::FlattenLayer:
             return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : FlattenLayer");
         case NodeType::GenerateProposalsLayer:
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index ca8d485..dc987dd 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/graph/backends/FunctionHelpers.h"
 #include "arm_compute/graph/backends/Utils.h"
 #include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
 #include "support/ToolchainSupport.h"
 
@@ -77,7 +78,7 @@
 
 namespace detail
 {
-// Specialize functions
+// Specialized functions
 template <>
 std::unique_ptr<IFunction> create_convolution_layer<NEConvolutionLayerFunctions, NETargetInfo>(ConvolutionLayerNode &node,
                                                                                                GraphContext &ctx)
@@ -201,6 +202,8 @@
             return detail::create_concatenate_layer<NEConcatenateLayer, NETargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
             return detail::create_depthwise_convolution_layer<NEDepthwiseConvolutionLayerFunctions, NETargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::DetectionOutputLayer:
+            return detail::create_detection_output_layer<CPPDetectionOutputLayer, NETargetInfo>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::EltwiseLayer:
             return detail::create_eltwise_layer<NEEltwiseFunctions, NETargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
         case NodeType::FlattenLayer:
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index a2abc83..db6af5e 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/graph/nodes/Nodes.h"
 
 #include "arm_compute/core/utils/misc/Cast.h"
+#include "arm_compute/runtime/CPP/CPPFunctions.h"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
 
 using namespace arm_compute::utils::cast;
@@ -59,6 +60,8 @@
         case NodeType::DepthwiseConvolutionLayer:
             return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer,
                    NEDepthwiseConvolutionLayer3x3>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+        case NodeType::DetectionOutputLayer:
+            return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::GenerateProposalsLayer:
             return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer");
         case NodeType::NormalizePlanarYUVLayer:
diff --git a/src/graph/nodes/DetectionOutputLayerNode.cpp b/src/graph/nodes/DetectionOutputLayerNode.cpp
new file mode 100644
index 0000000..c2d9f24
--- /dev/null
+++ b/src/graph/nodes/DetectionOutputLayerNode.cpp
@@ -0,0 +1,92 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/graph/nodes/DetectionOutputLayerNode.h"
+
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/Utils.h"
+
+namespace arm_compute
+{
+namespace graph
+{
+DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info)
+    : _info(detection_info)
+{
+    _input_edges.resize(3, EmptyEdgeID); // three input slots
+    _outputs.resize(1, NullTensorID);    // a single output slot
+}
+
+DetectionOutputLayerInfo DetectionOutputLayerNode::detection_output_info() const
+{
+    return _info;
+}
+
+TensorDescriptor DetectionOutputLayerNode::compute_output_descriptor(const TensorDescriptor         &input_descriptor,
+                                                                     const DetectionOutputLayerInfo &info)
+{
+    const unsigned int max_size = info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1);
+
+    TensorDescriptor output_descriptor = input_descriptor;
+    output_descriptor.shape.set(0, detection_size); // dimension 0: number of values stored per detection
+    output_descriptor.shape.set(1, max_size);       // dimension 1: keep_top_k times dimension 1 of the input (times 1 if the input has a single dimension)
+
+    return output_descriptor;
+}
+
+bool DetectionOutputLayerNode::forward_descriptors()
+{
+    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID))
+    {
+        Tensor *dst = output(0);
+        ARM_COMPUTE_ERROR_ON(dst == nullptr);
+        dst->desc() = configure_output(0); // the output descriptor is derived from input 0 and the layer info only
+        return true;
+    }
+    return false;
+}
+
+TensorDescriptor DetectionOutputLayerNode::configure_output(size_t idx) const
+{
+    ARM_COMPUTE_UNUSED(idx); // presumably avoids an unused-parameter warning when the check below is compiled out - TODO confirm
+    ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
+
+    const Tensor *input0 = input(0);
+    ARM_COMPUTE_ERROR_ON(input0 == nullptr);
+
+    return compute_output_descriptor(input0->desc(), _info);
+}
+
+NodeType DetectionOutputLayerNode::type() const
+{
+    return NodeType::DetectionOutputLayer;
+}
+
+void DetectionOutputLayerNode::accept(INodeVisitor &v)
+{
+    v.visit(*this);
+}
+} // namespace graph
+} // namespace arm_compute
diff --git a/utils/GraphUtils.cpp b/utils/GraphUtils.cpp
index 2f1df7a..ab2c753 100644
--- a/utils/GraphUtils.cpp
+++ b/utils/GraphUtils.cpp
@@ -420,6 +420,77 @@
     _output_stream << "Accuracy : " << accuracy << std::endl;
 }
 
+DetectionOutputAccessor::DetectionOutputAccessor(const std::string &labels_path, std::vector<TensorShape> &imgs_tensor_shapes, std::ostream &output_stream)
+    : _labels(), _tensor_shapes(std::move(imgs_tensor_shapes)), _output_stream(output_stream)
+{
+    // NOTE: imgs_tensor_shapes has been moved from above, so the caller's vector is left in a moved-from state.
+    std::ifstream labels_file;
+
+    try
+    {
+        // Only badbit raises an exception; a file that cannot be opened simply yields an empty label list.
+        labels_file.exceptions(std::ifstream::badbit);
+        labels_file.open(labels_path, std::ios::in | std::ios::binary);
+        std::string label;
+        while(!std::getline(labels_file, label).fail())
+        {
+            _labels.emplace_back(label); // one label per line of the file
+        }
+    }
+    catch(const std::ifstream::failure &e)
+    {
+        ARM_COMPUTE_ERROR("Accessing %s: %s", labels_path.c_str(), e.what());
+    }
+}
+
+template <typename T>
+void DetectionOutputAccessor::access_predictions_tensor(ITensor &tensor)
+{
+    // Prints one line per detection. A detection is 7 values: image, label, confidence, xmin, ymin, xmax, ymax.
+    constexpr size_t row_size      = 7;
+    const size_t     num_detection = tensor.info()->valid_region().shape.y();
+    const auto       output_prt    = reinterpret_cast<T *>(tensor.buffer() + tensor.info()->offset_first_element_in_bytes());
+    if(num_detection == 0)
+    {
+        _output_stream << "No detection found." << std::endl;
+        return;
+    }
+    _output_stream << "---------------------- Detections ----------------------" << std::endl << std::endl;
+    _output_stream << std::left << std::setprecision(4) << std::setw(8) << "Image | " << std::setw(8) << "Label | " << std::setw(12) << "Confidence | "
+                   << "[ xmin, ymin, xmax, ymax ]" << std::endl;
+    for(size_t i = 0; i < num_detection; ++i)
+    {
+        const T   *row      = output_prt + i * row_size;
+        const bool in_range = (row[0] >= 0) && (row[0] < _tensor_shapes.size()) && (row[1] >= 0) && (row[1] < _labels.size());
+        if(!in_range) // also true for NaN: never index _tensor_shapes / _labels with a value outside their bounds
+        {
+            ARM_COMPUTE_ERROR("Detection refers to an image or label index that is out of range");
+        }
+        const auto im    = static_cast<size_t>(row[0]); // index into _tensor_shapes
+        const auto label = static_cast<size_t>(row[1]); // index into _labels
+        _output_stream << std::setw(8) << im << std::setw(8) << _labels[label] << std::setw(12) << row[2]
+                       << " [" << (row[3] * _tensor_shapes[im].x()) << ", " << (row[4] * _tensor_shapes[im].y())
+                       << ", " << (row[5] * _tensor_shapes[im].x()) << ", " << (row[6] * _tensor_shapes[im].y())
+                       << "]" << std::endl; // box values are multiplied by the x()/y() extents of the image shape
+    }
+}
+
+bool DetectionOutputAccessor::access_tensor(ITensor &tensor)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&tensor, 1, DataType::F32);
+    // Dispatch on the element type: F32 is printed as float, every other data type is reported as an error.
+    if(tensor.info()->data_type() == DataType::F32)
+    {
+        access_predictions_tensor<float>(tensor);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("NOT SUPPORTED!");
+    }
+    // NOTE(review): always returns false; the meaning of the flag is defined by ITensorAccessor - confirm there.
+    return false;
+}
+
 TopNPredictionsAccessor::TopNPredictionsAccessor(const std::string &labels_path, size_t top_n, std::ostream &output_stream)
     : _labels(), _output_stream(output_stream), _top_n(top_n)
 {
diff --git a/utils/GraphUtils.h b/utils/GraphUtils.h
index d7f24af..131378e 100644
--- a/utils/GraphUtils.h
+++ b/utils/GraphUtils.h
@@ -283,6 +283,36 @@
     size_t           _positive_samples_top5;
 };
 
+/** Detection output accessor class: prints the detections held by an accessed tensor to an output stream */
+class DetectionOutputAccessor final : public graph::ITensorAccessor
+{
+public:
+    /** Constructor
+     *
+     * @param[in]  labels_path        Path to labels text file (read one label per line).
+     * @param[in]  imgs_tensor_shapes Network input images tensor shapes. The vector is moved from by the constructor.
+     * @param[out] output_stream      (Optional) Output stream (default = std::cout)
+     */
+    DetectionOutputAccessor(const std::string &labels_path, std::vector<TensorShape> &imgs_tensor_shapes, std::ostream &output_stream = std::cout);
+    /** Allow instances of this class to be move constructed */
+    DetectionOutputAccessor(DetectionOutputAccessor &&) = default;
+    /** Prevent instances of this class from being copied (As this class holds a reference to the output stream) */
+    DetectionOutputAccessor(const DetectionOutputAccessor &) = delete;
+    /** Prevent instances of this class from being copied (As this class holds a reference to the output stream) */
+    DetectionOutputAccessor &operator=(const DetectionOutputAccessor &) = delete;
+
+    // Inherited methods overridden:
+    bool access_tensor(ITensor &tensor) override;
+
+private:
+    template <typename T>
+    void access_predictions_tensor(ITensor &tensor); /**< Prints the detections of @p tensor, reading its buffer as elements of type T */
+
+    std::vector<std::string> _labels;        /**< Labels read from the labels file, indexed by the label value of a detection */
+    std::vector<TensorShape> _tensor_shapes; /**< Input image shapes, indexed by the image value of a detection */
+    std::ostream            &_output_stream; /**< Stream the detections are written to (not owned) */
+};
+
 /** Result accessor class */
 class TopNPredictionsAccessor final : public graph::ITensorAccessor
 {
@@ -472,6 +502,39 @@
         return arm_compute::support::cpp14::make_unique<TopNPredictionsAccessor>(graph_parameters.labels, top_n, output_stream);
     }
 }
+/** Generates appropriate detection output accessor according to the specified graph parameters
+ *
+ * @note If graph_parameters.validation_file is not empty a ValidationOutputAccessor is generated,
+ *       else if graph_parameters.labels is empty a DummyAccessor is generated,
+ *       else a DetectionOutputAccessor reading its labels from graph_parameters.labels is generated
+ *
+ * @param[in]  graph_parameters Graph parameters
+ * @param[in]  tensor_shapes    Network input images tensor shapes.
+ *                              Taken by value; the DetectionOutputAccessor constructor moves from it.
+ * @param[in]  is_validation    (Optional) Validation flag (default = false). Not used by this function.
+ * @param[out] output_stream    (Optional) Output stream (default = std::cout)
+ *
+ * @return An appropriate tensor accessor
+ */
+inline std::unique_ptr<graph::ITensorAccessor> get_detection_output_accessor(const arm_compute::utils::CommonGraphParams &graph_parameters,
+                                                                             std::vector<TensorShape>                     tensor_shapes,
+                                                                             bool                                         is_validation = false,
+                                                                             std::ostream                                &output_stream = std::cout)
+{
+    ARM_COMPUTE_UNUSED(is_validation); // the choice below depends on graph_parameters only
+    if(!graph_parameters.validation_file.empty())
+    {
+        return arm_compute::support::cpp14::make_unique<ValidationOutputAccessor>(graph_parameters.validation_file,
+                                                                                  output_stream,
+                                                                                  graph_parameters.validation_range_start,
+                                                                                  graph_parameters.validation_range_end);
+    }
+    if(graph_parameters.labels.empty())
+    {
+        return arm_compute::support::cpp14::make_unique<DummyAccessor>(0);
+    }
+    return arm_compute::support::cpp14::make_unique<DetectionOutputAccessor>(graph_parameters.labels, tensor_shapes, output_stream);
+}
 /** Generates appropriate npy output accessor according to the specified npy_path
  *
  * @note If npy_path is empty will generate a DummyAccessor else will generate a NpyAccessor