COMPMID-1766: Implemented CPP Non Max Suppression

Change-Id: I2d2b684d464f7b3bb1f91cfd29952f612d65f11f
Signed-off-by: Pablo Tello <pablo.tello@arm.com>
Reviewed-on: https://review.mlplatform.org/708
Reviewed-by: VidhyaSudhan Loganathan <vidhyasudhan.loganathan@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h
index d09e227..366c145 100644
--- a/tests/AssetsLibrary.h
+++ b/tests/AssetsLibrary.h
@@ -207,6 +207,9 @@
     template <typename T, typename D>
     void fill(T &&tensor, D &&distribution, std::random_device::result_type seed_offset) const;
 
+    /** Fills @p tensor with boxes: each group of four consecutive elements holds (left, top, right, bottom), with the top-left corner drawn from @p distribution. */
+    template <typename T, typename D>
+    void fill_boxes(T &&tensor, D &&distribution, std::random_device::result_type seed_offset) const;
     /** Fills the specified @p raw tensor with random values drawn from @p
      * distribution.
      *
@@ -482,6 +485,40 @@
 }
 
 template <typename T, typename D>
+void AssetsLibrary::fill_boxes(T &&tensor, D &&distribution, std::random_device::result_type seed_offset) const
+{
+    // Writes the tensor four consecutive elements at a time, one box per group, laid out as
+    // (left, top, right, bottom). The top-left corner comes from @p distribution and the
+    // right/bottom values are obtained by adding offsets drawn from [0, 1) to it.
+    using ResultType = typename std::remove_reference<D>::type::result_type;
+    constexpr int values_per_box = 4;
+
+    std::mt19937      gen(_seed + seed_offset);
+    const TensorShape shape(tensor.shape());
+    // Integer division: elements after the last complete group of four are not written by the loop
+    const int num_boxes = tensor.num_elements() / values_per_box;
+
+    std::uniform_real_distribution<> size_dist(0.f, 1.f);
+    for(int first_idx = 0; first_idx < num_boxes * values_per_box; first_idx += values_per_box)
+    {
+        // The draw order (both offsets first, then the corner) defines the generated data
+        const ResultType width  = size_dist(gen);
+        const ResultType height = size_dist(gen);
+        const ResultType left   = distribution(gen);
+        const ResultType top    = distribution(gen);
+        const ResultType right  = left + width;
+        const ResultType bottom = top + height;
+        const ResultType box[values_per_box] = { left, top, right, bottom };
+        for(int k = 0; k < values_per_box; ++k)
+        {
+            ResultType *const target = reinterpret_cast<ResultType *>(tensor(index2coord(shape, first_idx + k)));
+            store_value_with_data_type(target, box[k], tensor.data_type());
+        }
+    }
+    fill_borders_with_garbage(tensor, distribution, seed_offset);
+}
+
+template <typename T, typename D>
 void AssetsLibrary::fill(T &&tensor, D &&distribution, std::random_device::result_type seed_offset) const
 {
     using ResultType = typename std::remove_reference<D>::type::result_type;
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index 480df3c..f461d7f 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -946,6 +946,37 @@
     {
     }
 };
+
+/** Data set containing small 2D shapes for the non-max suppression tests; dimension 0 is 4 in every shape. */
+class Small2DNonMaxSuppressionShapes final : public ShapeDataset
+{
+public:
+    Small2DNonMaxSuppressionShapes()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 4U, 7U },
+                     TensorShape{ 4U, 13U },
+                     TensorShape{ 4U, 64U }
+    })
+    {
+    }
+};
+
+/** Data set containing large 2D shapes for the non-max suppression tests; dimension 0 is 4 in every shape. */
+class Large2DNonMaxSuppressionShapes final : public ShapeDataset
+{
+public:
+    Large2DNonMaxSuppressionShapes()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 4U, 207U },
+                     TensorShape{ 4U, 113U },
+                     TensorShape{ 4U, 264U }
+    })
+    {
+    }
+};
+
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/CPP/NonMaximalSuppression.cpp b/tests/validation/CPP/NonMaximalSuppression.cpp
new file mode 100644
index 0000000..6cd7b52
--- /dev/null
+++ b/tests/validation/CPP/NonMaximalSuppression.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/NonMaxSuppressionFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+// Parameters swept by RunSmall/RunLarge; operator* presumably forms the cartesian product of the datasets (TODO confirm)
+const auto max_output_boxes_dataset = framework::dataset::make("MaxOutputBoxes", 1, 10); // range starting at 1; NOTE(review): confirm whether 10 is inclusive
+const auto score_threshold_dataset  = framework::dataset::make("ScoreThreshold", { 0.1f, 0.5f, 0.f, 1.f });
+const auto nms_threshold_dataset    = framework::dataset::make("NMSThreshold", { 0.1f, 0.5f, 0.f, 1.f });
+const auto NMSParametersSmall       = datasets::Small2DNonMaxSuppressionShapes() * max_output_boxes_dataset * score_threshold_dataset * nms_threshold_dataset;
+const auto NMSParametersBig         = datasets::Large2DNonMaxSuppressionShapes() * max_output_boxes_dataset * score_threshold_dataset * nms_threshold_dataset;
+} // namespace
+
+TEST_SUITE(CPP)
+TEST_SUITE(NMS)
+
+// *INDENT-OFF*
+// clang-format off
+// Checks the status of CPPNonMaximumSuppression::validate() against "Expected" for each zipped configuration
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+                                                framework::dataset::make("BoundingBox",{
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(1U, 4U, 2U), 1, DataType::F32),    // invalid shape
+                                                                                        TensorInfo(TensorShape(4U, 2U), 1, DataType::S32),    // invalid data type
+                                                                                        TensorInfo(TensorShape(4U, 3U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 66U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                    }),
+                                                framework::dataset::make("Scores", {
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(37U, 2U, 13U, 27U), 1, DataType::F32), // invalid shape
+                                                                                        TensorInfo(TensorShape(4U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U), 1, DataType::U8),  // invalid data type
+                                                                                        TensorInfo(TensorShape(66U), 1, DataType::F32),  // valid, this configuration is expected to pass
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                    })),
+                                                framework::dataset::make("Indices", {
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(4U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(3U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(200U), 1, DataType::S32), // indices bigger than max bbs, OK because max_output is 66
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32), // invalid data type
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                    })),
+                                                framework::dataset::make("max_output", {
+                                                                                        10U, 2U,4U, 3U,66U, 1U,
+                                                                                        0U, /* invalid, must be greater than 0 */
+                                                                                        10000U, /* OK, clamped to indices' size */
+                                                                                        100U,
+                                                                                        10U,
+                                                                                     })),
+                                                framework::dataset::make("score_threshold", {
+                                                                                        0.1f, 0.4f, 0.2f,0.8f,0.3f, 0.01f, 0.5f, 0.45f,
+                                                                                        -1.f, /* invalid value, must be in [0,1] */
+                                                                                        0.5f,
+                                                                                     })),
+                                                framework::dataset::make("nms_threshold", {
+                                                                                        0.3f, 0.7f, 0.1f,0.13f,0.2f, 0.97f, 0.76f, 0.87f, 0.1f,
+                                                                                        10.f, /* invalid value, must be in [0,1]*/
+                                                                                     })),
+                                                framework::dataset::make("Expected", {
+                                                                                        true, false, false, false, true, false, false,true, false, false
+                                                                                     })),
+
+                                            bbox_info, scores_info, indices_info, max_out, score_threshold, nms_threshold, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CPPNonMaximumSuppression::validate(&bbox_info.clone()->set_is_resizable(false),
+                                                               &scores_info.clone()->set_is_resizable(false),
+                                                               &indices_info.clone()->set_is_resizable(false),
+                                max_out,score_threshold,nms_threshold)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CPPNonMaxSuppressionFixture = NMSValidationFixture<Tensor, Accessor, CPPNonMaximumSuppression>; // fixture instantiated for the CPP function
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CPPNonMaxSuppressionFixture, framework::DatasetMode::PRECOMMIT, NMSParametersSmall)
+{
+    // Compare the indices produced by the function with those of the reference implementation
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CPPNonMaxSuppressionFixture, framework::DatasetMode::NIGHTLY, NMSParametersBig)
+{
+    // Same comparison on the large shapes, registered for the NIGHTLY dataset mode
+    validate(Accessor(_target), _reference);
+}
+
+TEST_SUITE_END() // NMS
+TEST_SUITE_END() // CPP
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/fixtures/NonMaxSuppressionFixture.h b/tests/validation/fixtures/NonMaxSuppressionFixture.h
new file mode 100644
index 0000000..9299ed6
--- /dev/null
+++ b/tests/validation/fixtures/NonMaxSuppressionFixture.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_FIXTURE
+#define ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_FIXTURE
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/NonMaxSuppression.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+/** Fixture running a non-max suppression function and the reference implementation on identically seeded inputs. */
+template <typename TensorType, typename AccessorType, typename FunctionType>
+class NMSValidationFixture : public framework::Fixture
+{
+public:
+    /** Computes _target and _reference; @p input_shape must be 2D, its dimension 1 sizes the scores and @p max_output_size (non-zero) sizes the indices. */
+    template <typename...>
+    void setup(TensorShape input_shape, unsigned int max_output_size, float score_threshold, float nms_threshold)
+    {
+        ARM_COMPUTE_ERROR_ON(max_output_size == 0);
+        ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() != 2);
+        const TensorShape output_shape(max_output_size);
+        const TensorShape scores_shape(input_shape[1]);
+        _target    = compute_target(input_shape, scores_shape, output_shape, max_output_size, score_threshold, nms_threshold);
+        _reference = compute_reference(input_shape, scores_shape, output_shape, max_output_size, score_threshold, nms_threshold);
+    }
+
+protected:
+    /** Fills @p tensor through AssetsLibrary::fill_boxes() with a uniform real distribution built from (@p lo, @p hi) and seed offset @p i; the bounds are floats so fractional limits are not truncated. */
+    template <typename U>
+    void fill(U &&tensor, int i, float lo, float hi)
+    {
+        std::uniform_real_distribution<> distribution(lo, hi);
+        library->fill_boxes(tensor, distribution, i);
+    }
+
+    /** Configures FunctionType, allocates and fills its tensors, runs it and returns the indices tensor. */
+    TensorType compute_target(const TensorShape input_shape, const TensorShape scores_shape, const TensorShape output_shape,
+                              unsigned int max_output_size, float score_threshold, float nms_threshold)
+    {
+        // Create tensors
+        TensorType bboxes  = create_tensor<TensorType>(input_shape, DataType::F32);
+        TensorType scores  = create_tensor<TensorType>(scores_shape, DataType::F32);
+        TensorType indices = create_tensor<TensorType>(output_shape, DataType::S32);
+
+        // Create and configure function; the tensors are expected to remain resizable until allocated
+        FunctionType nms_func;
+        nms_func.configure(&bboxes, &scores, &indices, max_output_size, score_threshold, nms_threshold);
+        ARM_COMPUTE_EXPECT(bboxes.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(indices.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(scores.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors; afterwards they are expected to be no longer resizable
+        bboxes.allocator()->allocate();
+        indices.allocator()->allocate();
+        scores.allocator()->allocate();
+        ARM_COMPUTE_EXPECT(!bboxes.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!indices.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!scores.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors. NOTE(review): the scores go through the box filler as well - confirm this is intended
+        fill(AccessorType(bboxes), 0, 0.f, 1.f);
+        fill(AccessorType(scores), 1, 0.f, 1.f);
+
+        // Compute function
+        nms_func.run();
+        return indices;
+    }
+
+    /** Builds the same inputs as compute_target() in SimpleTensor form and returns the result of reference::non_max_suppression(). */
+    SimpleTensor<int> compute_reference(const TensorShape input_shape, const TensorShape scores_shape, const TensorShape output_shape,
+                                        unsigned int max_output_size, float score_threshold, float nms_threshold)
+    {
+        // Create reference
+        SimpleTensor<float> bboxes{ input_shape, DataType::F32 };
+        SimpleTensor<float> scores{ scores_shape, DataType::F32 };
+        SimpleTensor<int>   indices{ output_shape, DataType::S32 };
+        // Fill reference with the same seed offsets and bounds as the target tensors
+        fill(bboxes, 0, 0.f, 1.f);
+        fill(scores, 1, 0.f, 1.f);
+        return reference::non_max_suppression(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
+    }
+
+    TensorType        _target{};
+    SimpleTensor<int> _reference{};
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_FIXTURE */
diff --git a/tests/validation/reference/NonMaxSuppression.cpp b/tests/validation/reference/NonMaxSuppression.cpp
new file mode 100644
index 0000000..7592908
--- /dev/null
+++ b/tests/validation/reference/NonMaxSuppression.cpp
@@ -0,0 +1,157 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "NonMaxSuppression.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/Helpers.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+using CandidateBox = std::pair<int /* index */, float /* score */>; // one entry per score that passed the score threshold
+using Box          = std::tuple<float, float, float, float>; // the four values read for one box, in coordinate order 0..3
+
+inline float get_elem_by_coordinate(const SimpleTensor<float> &tensor, Coordinates coord)
+{
+    return *static_cast<const float *>(tensor(coord)); // tensor(coord) is cast to a float pointer and the element is read by value
+}
+
+// Gathers the four values of box @p id, i.e. the elements at coordinates (0, id) .. (3, id) of @p boxes.
+// Dimension 0 selects the value within a box, dimension 1 selects the box.
+inline Box get_box(const SimpleTensor<float> &boxes, size_t id)
+{
+    // Small accessor so each of the four reads only has to name its position in the box
+    const auto value_at = [&boxes, id](int dim0) { return get_elem_by_coordinate(boxes, Coordinates(dim0, id)); };
+    return Box(value_at(0), value_at(1), value_at(2), value_at(3));
+}
+
+/** Smaller value of each coordinate pair of @p b: (min of elements 0 and 2, min of elements 1 and 3). */
+inline std::pair<float, float> get_min_yx(Box b)
+{
+    using std::get;
+    return { std::min(get<0>(b), get<2>(b)), std::min(get<1>(b), get<3>(b)) };
+}
+
+/** Larger value of each coordinate pair of @p b: (max of elements 0 and 2, max of elements 1 and 3). */
+inline std::pair<float, float> get_max_yx(Box b)
+{
+    using std::get;
+    return { std::max(get<0>(b), get<2>(b)), std::max(get<1>(b), get<3>(b)) };
+}
+
+inline float compute_size(const std::pair<float, float> &min, const std::pair<float, float> &max)
+{
+    return (max.first - min.first) * (max.second - min.second); // area: extent of the first components times extent of the second
+}
+
+/** Despite its name, returns intersection over union: overlap area / (@p b0_size + @p b1_size - overlap area). */
+inline float compute_intersection(const std::pair<float, float> &b0_min, const std::pair<float, float> &b0_max,
+                                  const std::pair<float, float> &b1_min, const std::pair<float, float> &b1_max, float b0_size, float b1_size)
+{
+    const float overlap_first  = std::max<float>(std::min<float>(b0_max.first, b1_max.first) - std::max<float>(b0_min.first, b1_min.first), 0.0);
+    const float overlap_second = std::max<float>(std::min<float>(b0_max.second, b1_max.second) - std::max<float>(b0_min.second, b1_min.second), 0.0);
+    const float inter          = overlap_first * overlap_second;
+    return inter / (b0_size + b1_size - inter);
+}
+
+/** Tells whether @p b0 overlaps @p b1 by more than @p threshold.
+ *
+ * @return true if the ratio given by compute_intersection() is above @p threshold; always false
+ *         when the area computed for either box is zero or negative.
+ */
+inline bool reject_box(Box b0, Box b1, float threshold)
+{
+    const auto  min0  = get_min_yx(b0);
+    const auto  max0  = get_max_yx(b0);
+    const auto  min1  = get_min_yx(b1);
+    const auto  max1  = get_max_yx(b1);
+    const float area0 = compute_size(min0, max0);
+    const float area1 = compute_size(min1, max1);
+    // The overlap ratio is only evaluated when neither box is empty
+    const bool has_area = !(area0 <= 0.f || area1 <= 0.f);
+    return has_area && compute_intersection(min0, max0, min1, max1, area0, area1) > threshold;
+}
+
+/** Collects (index, score) pairs for every score strictly above @p threshold, ordered by decreasing score.
+ *  Candidates with equal scores keep their ascending index order.
+ */
+inline std::vector<CandidateBox> get_candidates(const SimpleTensor<float> &scores, float threshold)
+{
+    std::vector<CandidateBox> candidates_vector;
+    for(int i = 0; i < scores.num_elements(); ++i)
+    {
+        if(scores[i] > threshold)
+        {
+            candidates_vector.emplace_back(i, scores[i]);
+        }
+    }
+    // The comparator must be a strict weak ordering: '>=' is true for equal scores, which is undefined behaviour when sorting
+    std::stable_sort(candidates_vector.begin(), candidates_vector.end(), [](const CandidateBox &bb0, const CandidateBox &bb1) { return bb0.second > bb1.second; });
+    return candidates_vector;
+}
+
+/** Checks candidate @p cb against every box already kept in @p selected_boxes; true means no kept box rejects it (see reject_box()). */
+inline bool is_box_selected(const CandidateBox &cb, const SimpleTensor<float> &bboxes, std::vector<int> &selected_boxes, float threshold)
+{
+    // Kept boxes are visited from the most recently added one backwards;
+    // the search stops at the first kept box that rejects the candidate
+    const auto rejects_candidate = [&](int kept_idx)
+    {
+        return reject_box(get_box(bboxes, cb.first), get_box(bboxes, kept_idx), threshold);
+    };
+    return std::none_of(selected_boxes.rbegin(), selected_boxes.rend(), rejects_candidate);
+}
+} // namespace
+
+/** Reference NMS: walks get_candidates() in order, keeps each candidate accepted by is_box_selected() until
+ *  min(@p max_output_size, number of boxes) are kept, writes the kept indices to the front of @p indices and returns it.
+ */
+SimpleTensor<int> non_max_suppression(const SimpleTensor<float> &bboxes, const SimpleTensor<float> &scores, SimpleTensor<int> &indices,
+                                      unsigned int max_output_size, float score_threshold, float nms_threshold)
+{
+    const size_t                    output_size = std::min(static_cast<size_t>(max_output_size), static_cast<size_t>(bboxes.shape().y()));
+    const std::vector<CandidateBox> candidates  = get_candidates(scores, score_threshold);
+    std::vector<int>                selected;
+    for(size_t p = 0; p < candidates.size() && selected.size() < output_size; ++p)
+    {
+        if(is_box_selected(candidates[p], bboxes, selected, nms_threshold))
+        {
+            selected.push_back(candidates[p].first);
+        }
+    }
+    std::copy_n(selected.begin(), selected.size(), indices.data());
+    return indices;
+}
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/NonMaxSuppression.h b/tests/validation/reference/NonMaxSuppression.h
new file mode 100644
index 0000000..0418412
--- /dev/null
+++ b/tests/validation/reference/NonMaxSuppression.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_H
+#define ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_H
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+/** Reference non-max suppression: stores the indices of the selected boxes at the start of @p indices and returns a copy of it. */
+SimpleTensor<int> non_max_suppression(const SimpleTensor<float> &bboxes, const SimpleTensor<float> &scores, SimpleTensor<int> &indices,
+                                      unsigned int max_output_size, float score_threshold, float nms_threshold);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_H */