COMPMID-1766: Implemented CPP Non Max Suppression

Change-Id: I1dcd5fb3d9ad6c6c750415bf8074698b800dfbc1
Reviewed-on: https://review.mlplatform.org/494
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giuseppe Rossini <giuseppe.rossini@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
index 7f80948..8c610f3 100644
--- a/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
+++ b/arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -39,6 +39,56 @@
 // LabelBBox used for map label and bounding box
 using LabelBBox = std::map<int, std::vector<NormalizedBBox>>;
 
+/** CPP Function to perform non maximum suppression on the bounding boxes and scores
+ * @note The tensors given to configure() are kept as pointers and only accessed when run() is called.
+ */
+class CPPNonMaximumSuppression : public IFunction
+{
+public:
+    /** Default constructor */
+    CPPNonMaximumSuppression();
+    /** Configure the function to perform non maximal suppression
+     *
+     * @param[in]  bboxes          The input bounding boxes. 2-D tensor of shape [4, num_boxes]. Data types supported: F32.
+     * @param[in]  scores          The corresponding input confidence. 1-D tensor of shape [num_boxes]. Data types supported: Same as @p bboxes.
+     * @param[out] indices         The kept indices of bboxes after nms. Non-empty 1-D tensor. Data types supported: S32.
+     * @param[in]  max_output_size The maximum number of boxes to be selected by non max suppression. Must be greater than 0.
+     * @param[in]  score_threshold The threshold used to filter detection results. Must be in [0, 1].
+     * @param[in]  nms_threshold   The threshold used in non maximum suppression. Must be in [0, 1].
+     *
+     */
+    void configure(const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size, const float score_threshold, const float nms_threshold);
+
+    /** Static function to check if given arguments will lead to a valid configuration of @ref CPPNonMaximumSuppression
+     *
+     * @param[in]  bboxes          The input bounding boxes info. 2-D tensor of shape [4, num_boxes]. Data types supported: F32.
+     * @param[in]  scores          The corresponding input confidence info. 1-D tensor of shape [num_boxes]. Data types supported: Same as @p bboxes.
+     * @param[in]  indices         The info of the kept indices of bboxes after nms. Non-empty 1-D tensor. Data types supported: S32.
+     * @param[in]  max_output_size The maximum number of boxes to be selected by non max suppression. Must be greater than 0.
+     * @param[in]  score_threshold The threshold used to filter detection results. Must be in [0, 1].
+     * @param[in]  nms_threshold   The threshold used in non maximum suppression. Must be in [0, 1].
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+                           const float score_threshold, const float nms_threshold);
+
+    // Inherited methods overridden:
+    void run() override;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CPPNonMaximumSuppression(const CPPNonMaximumSuppression &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CPPNonMaximumSuppression &operator=(const CPPNonMaximumSuppression &) = delete;
+
+private:
+    const ITensor *_bboxes; /**< Input bounding boxes, set by configure() */
+    const ITensor *_scores; /**< Input scores, set by configure() */
+    ITensor       *_indices; /**< Output indices, set by configure() and written by run() */
+    unsigned int   _max_output_size; /**< Requested maximum number of selected boxes */
+
+    float _score_threshold; /**< Score threshold forwarded to the suppression routine in run() */
+    float _nms_threshold; /**< Overlap threshold forwarded to the suppression routine in run() */
+};
+
 /** CPP Function to generate the detection output based on location and confidence
  * predictions by doing non maximum suppression.
  *
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 61005ab..34a7294 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -34,7 +34,7 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status detection_layer_validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
@@ -366,14 +366,103 @@
             indices.push_back(idx);
         }
         score_index_vec.erase(score_index_vec.begin());
-        if(keep && eta < 1 && adaptive_threshold > 0.5)
+        if(keep && eta < 1.f && adaptive_threshold > 0.5f)
         {
             adaptive_threshold *= eta;
         }
     }
 }
+/** Validate the arguments of CPPNonMaximumSuppression: null pointers, data types, tensor ranks, output size and threshold ranges. */
+Status non_max_suppression_validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+                                              const float score_threshold, const float nms_threshold)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, indices);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes]."); // NOTE(review): only the rank is checked; the [4, num_boxes] extents named in the message are not verified
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes]."); // NOTE(review): the scores length is not compared with the number of boxes
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M"); // NOTE(review): max_output_size is not compared with the indices length here
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "Scores must be a 1D float tensor"); // NOTE(review): repeats the scores rank check made three lines above
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(nms_threshold < 0.f || nms_threshold > 1.f, "Threshould must be in [0,1]"); // NOTE(review): the message text misspells Threshold (same on the next line)
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(score_threshold < 0.f || score_threshold > 1.f, "Threshould must be in [0,1]");
+
+    return Status{};
+}
 } // namespace
 
+CPPNonMaximumSuppression::CPPNonMaximumSuppression()
+    : _bboxes(nullptr), _scores(nullptr), _indices(nullptr), _max_output_size(0), _score_threshold(0.f), _nms_threshold(0.f)
+{ // Every pointer starts as nullptr and every parameter as zero; configure() assigns the real values
+}
+
+void CPPNonMaximumSuppression::configure(
+    const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
+    const float score_threshold, const float nms_threshold)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(bboxes, scores, indices);
+    ARM_COMPUTE_ERROR_THROW_ON(non_max_suppression_validate_arguments(bboxes->info(), scores->info(), indices->info(), max_output_size, score_threshold, nms_threshold));
+
+    // Keep the tensor pointers only; the tensor contents are read and written later, in run()
+    _bboxes  = bboxes;
+    _scores  = scores;
+    _indices = indices;
+
+    _nms_threshold   = nms_threshold;
+    _max_output_size = max_output_size;
+    _score_threshold = score_threshold;
+}
+
+Status CPPNonMaximumSuppression::validate(
+    const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
+    const float score_threshold, const float nms_threshold)
+{ // Public entry point that forwards every argument to non_max_suppression_validate_arguments()
+    ARM_COMPUTE_RETURN_ON_ERROR(non_max_suppression_validate_arguments(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold));
+    return Status{};
+}
+
+void extract_bounding_boxes_from_tensor(const ITensor *bboxes, std::vector<NormalizedBBox> &bboxes_vector) // NOTE(review): defined after the anonymous namespace is closed and without static
+{
+    Window input_win;
+    input_win.use_tensor_dimensions(bboxes->info()->tensor_shape());
+    input_win.set_dimension_step(0U, 4U); // Advance 4 elements at a time along dimension 0; presumably one whole box per step - TODO confirm against the [4, num_boxes] layout
+    input_win.set_dimension_step(1U, 1U);
+    Iterator input(bboxes, input_win);
+    auto     f = [&bboxes_vector, &input](const Coordinates &)
+    {
+        const auto input_ptr = reinterpret_cast<const float *>(input.ptr());
+        bboxes_vector.push_back(NormalizedBBox({ *input_ptr, *(input_ptr + 1), *(2 + input_ptr), *(3 + input_ptr) })); // Append the 4 consecutive floats at the current position as one box
+    };
+    execute_window_loop(input_win, f, input);
+}
+
+void extract_scores_from_tensor(const ITensor *scores, std::vector<float> &scores_vector) // NOTE(review): defined after the anonymous namespace is closed and without static
+{
+    Window window;
+    window.use_tensor_dimensions(scores->info()->tensor_shape()); // The window spans the full tensor shape
+    Iterator it(scores, window);
+    auto     f = [&it, &scores_vector](const Coordinates &)
+    {
+        const auto input_ptr = reinterpret_cast<const float *>(it.ptr());
+        scores_vector.push_back(*input_ptr); // Append the float at the current iterator position
+    };
+    execute_window_loop(window, f, it);
+}
+
+void CPPNonMaximumSuppression::run()
+{ // Copies boxes and scores into vectors, runs ApplyNMSFast() on them and writes the resulting indices to the output tensor
+    std::vector<NormalizedBBox> bboxes_vector;
+    std::vector<float>          scores_vector;
+    std::vector<int>            indices_vector;
+    extract_bounding_boxes_from_tensor(_bboxes, bboxes_vector);
+    extract_scores_from_tensor(_scores, scores_vector);
+    ApplyNMSFast(bboxes_vector, scores_vector, _score_threshold, _nms_threshold, 1, -1 /* disable top_k */, indices_vector);
+    std::copy_n(indices_vector.begin(), std::min(indices_vector.size(), _indices->info()->dimension(0)), reinterpret_cast<int *>(_indices->ptr_to_element(Coordinates(0)))); // NOTE(review): _max_output_size is not used here; the copy is capped by the indices length only and any remaining output elements are left unwritten
+}
+
 CPPDetectionOutputLayer::CPPDetectionOutputLayer()
     : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(),
       _all_prior_variances(), _all_decode_bboxes(), _all_indices()
@@ -391,7 +480,7 @@
     auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size)));
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(detection_layer_validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
 
     _input_loc      = input_loc;
     _input_conf     = input_conf;
@@ -429,7 +518,7 @@
 
 Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(detection_layer_validate_arguments(input_loc, input_conf, input_priorbox, output, info));
     return Status{};
 }
 
@@ -582,4 +671,4 @@
         }
     }
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/tests/AssetsLibrary.h b/tests/AssetsLibrary.h
index d09e227..366c145 100644
--- a/tests/AssetsLibrary.h
+++ b/tests/AssetsLibrary.h
@@ -207,6 +207,9 @@
     template <typename T, typename D>
     void fill(T &&tensor, D &&distribution, std::random_device::result_type seed_offset) const;
 
+    template <typename T, typename D>
+    void fill_boxes(T &&tensor, D &&distribution, std::random_device::result_type seed_offset) const;
+
     /** Fills the specified @p raw tensor with random values drawn from @p
      * distribution.
      *
@@ -482,6 +485,40 @@
 }
 
 template <typename T, typename D>
+void AssetsLibrary::fill_boxes(T &&tensor, D &&distribution, std::random_device::result_type seed_offset) const
+{
+    using ResultType = typename std::remove_reference<D>::type::result_type;
+    std::mt19937 gen(_seed + seed_offset);
+    TensorShape  shape(tensor.shape());
+    const int    num_boxes = tensor.num_elements() / 4; // Integer division: elements beyond the last full group of 4 are not written by the loop below
+    // Walk the tensor in groups of four consecutive elements, one box (left, top, right, bottom) per group
+    std::uniform_real_distribution<> size_dist(0.f, 1.f); // Source of the box width (delta) and height (epsilon)
+    for(int element_idx = 0; element_idx < num_boxes * 4; element_idx += 4)
+    {
+        const ResultType delta   = size_dist(gen);
+        const ResultType epsilon = size_dist(gen);
+        const ResultType left    = distribution(gen);
+        const ResultType top     = distribution(gen);
+        const ResultType right   = left + delta; // Non-negative delta keeps right >= left
+        const ResultType bottom  = top + epsilon; // Non-negative epsilon keeps bottom >= top
+        const std::tuple<ResultType, ResultType, ResultType, ResultType> box(left, top, right, bottom);
+        Coordinates x1              = index2coord(shape, element_idx);
+        Coordinates y1              = index2coord(shape, element_idx + 1);
+        Coordinates x2              = index2coord(shape, element_idx + 2);
+        Coordinates y2              = index2coord(shape, element_idx + 3);
+        ResultType &target_value_x1 = reinterpret_cast<ResultType *>(tensor(x1))[0];
+        ResultType &target_value_y1 = reinterpret_cast<ResultType *>(tensor(y1))[0];
+        ResultType &target_value_x2 = reinterpret_cast<ResultType *>(tensor(x2))[0];
+        ResultType &target_value_y2 = reinterpret_cast<ResultType *>(tensor(y2))[0];
+        store_value_with_data_type(&target_value_x1, std::get<0>(box), tensor.data_type()); // Stored in the order left, top, right, bottom
+        store_value_with_data_type(&target_value_y1, std::get<1>(box), tensor.data_type());
+        store_value_with_data_type(&target_value_x2, std::get<2>(box), tensor.data_type());
+        store_value_with_data_type(&target_value_y2, std::get<3>(box), tensor.data_type());
+    }
+    fill_borders_with_garbage(tensor, distribution, seed_offset);
+}
+
+template <typename T, typename D>
 void AssetsLibrary::fill(T &&tensor, D &&distribution, std::random_device::result_type seed_offset) const
 {
     using ResultType = typename std::remove_reference<D>::type::result_type;
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index d7ffc12..bd29fe6 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -949,6 +949,37 @@
     {
     }
 };
+
+/** Data set containing small 2D bounding-box shapes (4 x num_boxes) used by the non max suppression tests. */
+class Small2DNonMaxSuppressionShapes final : public ShapeDataset
+{
+public:
+    Small2DNonMaxSuppressionShapes()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 4U, 7U },
+                     TensorShape{ 4U, 13U },
+                     TensorShape{ 4U, 64U }
+    })
+    {
+    }
+};
+
+/** Data set containing large 2D bounding-box shapes (4 x num_boxes) used by the non max suppression tests. */
+class Large2DNonMaxSuppressionShapes final : public ShapeDataset
+{
+public:
+    Large2DNonMaxSuppressionShapes()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 4U, 207U },
+                     TensorShape{ 4U, 113U },
+                     TensorShape{ 4U, 264U }
+    })
+    {
+    }
+};
+
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/CPP/NonMaximalSuppression.cpp b/tests/validation/CPP/NonMaximalSuppression.cpp
new file mode 100644
index 0000000..6cd7b52
--- /dev/null
+++ b/tests/validation/CPP/NonMaximalSuppression.cpp
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CPP/functions/CPPDetectionOutputLayer.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+#include "tests/NEON/Accessor.h"
+#include "tests/PaddingCalculator.h"
+#include "tests/datasets/ShapeDatasets.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/NonMaxSuppressionFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+const auto max_output_boxes_dataset = framework::dataset::make("MaxOutputBoxes", 1, 10);
+const auto score_threshold_dataset  = framework::dataset::make("ScoreThreshold", { 0.1f, 0.5f, 0.f, 1.f });
+const auto nms_threshold_dataset    = framework::dataset::make("NMSThreshold", { 0.1f, 0.5f, 0.f, 1.f });
+const auto NMSParametersSmall       = datasets::Small2DNonMaxSuppressionShapes() * max_output_boxes_dataset * score_threshold_dataset * nms_threshold_dataset;
+const auto NMSParametersBig         = datasets::Large2DNonMaxSuppressionShapes() * max_output_boxes_dataset * score_threshold_dataset * nms_threshold_dataset;
+
+} // namespace
+
+TEST_SUITE(CPP)
+TEST_SUITE(NMS)
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip(
+                                                framework::dataset::make("BoundingBox",{
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(1U, 4U, 2U), 1, DataType::F32),    // invalid shape
+                                                                                        TensorInfo(TensorShape(4U, 2U), 1, DataType::S32),    // invalid data type
+                                                                                        TensorInfo(TensorShape(4U, 3U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 66U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(4U, 100U), 1, DataType::F32),
+                                                                                    }),
+                                                framework::dataset::make("Scores", {
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(37U, 2U, 13U, 27U), 1, DataType::F32), // invalid shape
+                                                                                        TensorInfo(TensorShape(4U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(3U), 1, DataType::U8),  // invalid data type
+                                                                                        TensorInfo(TensorShape(66U), 1, DataType::F32),  // valid: F32 scores for the 66 boxes of this case
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32),
+                                                                                    })),
+                                                framework::dataset::make("Indices", {
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(4U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(3U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(200U), 1, DataType::S32), // indices bigger than max bbs, OK because max_output is 66
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::F32), // invalid data type
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+                                                                                        TensorInfo(TensorShape(100U), 1, DataType::S32),
+
+                                                                                    })),
+                                                framework::dataset::make("max_output", {
+                                                                                        10U, 2U,4U, 3U,66U, 1U,
+                                                                                        0U, /* invalid, must be greater than 0 */
+                                                                                        10000U, /* OK, clamped to indices' size */
+                                                                                        100U,
+                                                                                        10U,
+                                                                                     })),
+                                                framework::dataset::make("score_threshold", {
+                                                                                        0.1f, 0.4f, 0.2f,0.8f,0.3f, 0.01f, 0.5f, 0.45f,
+                                                                                        -1.f, /* invalid value, must be in [0,1] */
+                                                                                        0.5f,
+                                                                                     })),
+                                                framework::dataset::make("nms_threshold", {
+                                                                                        0.3f, 0.7f, 0.1f,0.13f,0.2f, 0.97f, 0.76f, 0.87f, 0.1f,
+                                                                                        10.f, /* invalid value, must be in [0,1]*/
+                                                                                     })),
+                                                framework::dataset::make("Expected", { // validate() outcome expected for each zipped case, in order
+                                                                                        true, false, false, false, true, false, false,true, false, false
+                                                                                     })),
+
+                                            bbox_info, scores_info, indices_info, max_out, score_threshold, nms_threshold, expected)
+{
+    ARM_COMPUTE_EXPECT(bool(CPPNonMaximumSuppression::validate(&bbox_info.clone()->set_is_resizable(false),
+                                                               &scores_info.clone()->set_is_resizable(false),
+                                                               &indices_info.clone()->set_is_resizable(false),
+                                max_out,score_threshold,nms_threshold)) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+using CPPNonMaxSuppressionFixture = NMSValidationFixture<Tensor, Accessor, CPPNonMaximumSuppression>;
+
+FIXTURE_DATA_TEST_CASE(RunSmall, CPPNonMaxSuppressionFixture, framework::DatasetMode::PRECOMMIT, NMSParametersSmall)
+{
+    // Compare the indices written by the function with the indices from the reference implementation
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CPPNonMaxSuppressionFixture, framework::DatasetMode::NIGHTLY, NMSParametersBig)
+{
+    // Compare the indices written by the function with the indices from the reference implementation
+    validate(Accessor(_target), _reference);
+}
+
+TEST_SUITE_END() // NMS
+TEST_SUITE_END() // CPP
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/fixtures/NonMaxSuppressionFixture.h b/tests/validation/fixtures/NonMaxSuppressionFixture.h
new file mode 100644
index 0000000..9299ed6
--- /dev/null
+++ b/tests/validation/fixtures/NonMaxSuppressionFixture.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_FIXTURE
+#define ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_FIXTURE
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/NonMaxSuppression.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType>
+/** Fixture that runs a non max suppression function and the reference implementation with the same shapes, seeds and thresholds. */
+class NMSValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, unsigned int max_output_size, float score_threshold, float nms_threshold)
+    {
+        ARM_COMPUTE_ERROR_ON(max_output_size == 0);
+        ARM_COMPUTE_ERROR_ON(input_shape.num_dimensions() != 2);
+        const TensorShape output_shape(max_output_size); // The indices tensor has max_output_size elements
+        const TensorShape scores_shape(input_shape[1]); // One score per box: dimension 1 of the boxes shape
+        _target    = compute_target(input_shape, scores_shape, output_shape, max_output_size, score_threshold, nms_threshold);
+        _reference = compute_reference(input_shape, scores_shape, output_shape, max_output_size, score_threshold, nms_threshold);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor, int i, int lo, int hi) // NOTE(review): lo and hi are int although the callers below pass float literals
+    {
+        std::uniform_real_distribution<> distribution(lo, hi);
+        library->fill_boxes(tensor, distribution, i); // NOTE(review): also used for the scores tensor, although fill_boxes() writes box-shaped groups of four values
+    }
+
+    TensorType compute_target(const TensorShape input_shape, const TensorShape scores_shape, const TensorShape output_shape,
+                              unsigned int max_output_size, float score_threshold, float nms_threshold)
+    {
+        // Create the input and output tensors of the function under test
+        TensorType bboxes  = create_tensor<TensorType>(input_shape, DataType::F32);
+        TensorType scores  = create_tensor<TensorType>(scores_shape, DataType::F32);
+        TensorType indices = create_tensor<TensorType>(output_shape, DataType::S32);
+
+        // Create and configure function
+        FunctionType nms_func;
+        nms_func.configure(&bboxes, &scores, &indices, max_output_size, score_threshold, nms_threshold);
+
+        ARM_COMPUTE_EXPECT(bboxes.info()->is_resizable(), framework::LogLevel::ERRORS); // Still resizable before allocation
+        ARM_COMPUTE_EXPECT(indices.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(scores.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        bboxes.allocator()->allocate();
+        indices.allocator()->allocate();
+        scores.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!bboxes.info()->is_resizable(), framework::LogLevel::ERRORS); // No longer resizable once allocated
+        ARM_COMPUTE_EXPECT(!indices.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!scores.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill the inputs: seed offset 0 for the boxes, 1 for the scores
+        fill(AccessorType(bboxes), 0, 0.f, 1.f);
+        fill(AccessorType(scores), 1, 0.f, 1.f);
+
+        // Run the function and return the indices it produced
+        nms_func.run();
+        return indices;
+    }
+
+    SimpleTensor<int> compute_reference(const TensorShape input_shape, const TensorShape scores_shape, const TensorShape output_shape,
+                                        unsigned int max_output_size, float score_threshold, float nms_threshold)
+    {
+        // Create the reference tensors with the same shapes and data types as in compute_target()
+        SimpleTensor<float> bboxes{ input_shape, DataType::F32 };
+        SimpleTensor<float> scores{ scores_shape, DataType::F32 };
+        SimpleTensor<int>   indices{ output_shape, DataType::S32 };
+
+        // Fill with the same seed offsets and ranges as compute_target()
+        fill(bboxes, 0, 0.f, 1.f);
+        fill(scores, 1, 0.f, 1.f);
+
+        return reference::non_max_suppression(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
+    }
+
+    TensorType        _target{}; // Indices produced by the function under test
+    SimpleTensor<int> _reference{}; // Indices produced by the reference implementation
+};
+
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_FIXTURE */
diff --git a/tests/validation/reference/NonMaxSuppression.cpp b/tests/validation/reference/NonMaxSuppression.cpp
new file mode 100644
index 0000000..013a26f
--- /dev/null
+++ b/tests/validation/reference/NonMaxSuppression.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Permute.h"
+
+#include "arm_compute/core/Types.h"
+#include "tests/validation/Helpers.h"
+#include <queue>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+namespace
+{
+inline float get_elem_by_coordinate(const SimpleTensor<float> &tensor, Coordinates coord)
+{
+    return static_cast<const float *>(tensor(coord))[0]; // read the float stored at the address tensor(coord) returns
+}
+
+// Tell whether the IoU (intersection-over-union) of boxes i and j is strictly above iou_threshold.
+// Components 0/2 of a box are treated as y and 1/3 as x, in either order; a box of non-positive area never overlaps.
+inline bool iou_greater_than_threshold(const SimpleTensor<float> &boxes, size_t i, size_t j, float iou_threshold)
+{
+    const auto at = [&boxes](int c, size_t box) { return get_elem_by_coordinate(boxes, Coordinates(c, box)); };
+    const float y_lo_i = std::min<float>(at(0, i), at(2, i));
+    const float x_lo_i = std::min<float>(at(1, i), at(3, i));
+    const float y_hi_i = std::max<float>(at(0, i), at(2, i));
+    const float x_hi_i = std::max<float>(at(1, i), at(3, i));
+    const float y_lo_j = std::min<float>(at(0, j), at(2, j));
+    const float x_lo_j = std::min<float>(at(1, j), at(3, j));
+    const float y_hi_j = std::max<float>(at(0, j), at(2, j));
+    const float x_hi_j = std::max<float>(at(1, j), at(3, j));
+    const float area_i = (y_hi_i - y_lo_i) * (x_hi_i - x_lo_i);
+    const float area_j = (y_hi_j - y_lo_j) * (x_hi_j - x_lo_j);
+    if(area_i <= 0 || area_j <= 0)
+    {
+        return false;
+    }
+    const float overlap_y = std::max<float>(std::min<float>(y_hi_i, y_hi_j) - std::max<float>(y_lo_i, y_lo_j), 0.0);
+    const float overlap_x = std::max<float>(std::min<float>(x_hi_i, x_hi_j) - std::max<float>(x_lo_i, x_lo_j), 0.0);
+    const float overlap   = overlap_y * overlap_x;
+    const float iou       = overlap / (area_i + area_j - overlap);
+    return iou > iou_threshold;
+}
+
+} // namespace
+
+// Reference non-maximum suppression.
+// Boxes scoring above score_threshold are visited from highest to lowest score; a box is kept
+// unless its IoU with an already kept box exceeds nms_threshold. At most
+// min(max_output_size, number of boxes) boxes are kept, and their indices are written to the
+// start of indices in selection order. The (modified) indices tensor is returned by value.
+SimpleTensor<int> non_max_suppression(const SimpleTensor<float> &bboxes, const SimpleTensor<float> &scores, SimpleTensor<int> &indices,
+                                      unsigned int max_output_size, float score_threshold, float nms_threshold)
+{
+    const size_t num_boxes   = bboxes.shape().y();
+    const size_t output_size = std::min(static_cast<size_t>(max_output_size), num_boxes);
+    const float *scores_data = scores.data();
+
+    using CandidateBox = std::pair<int /* index */, float /* score */>;
+    // Max-heap ordering: the candidate with the highest score is on top
+    auto cmp = [](const CandidateBox &bb0, const CandidateBox &bb1)
+    {
+        return bb0.second < bb1.second;
+    };
+
+    // Queue every box that passes the score threshold
+    std::priority_queue<CandidateBox, std::deque<CandidateBox>, decltype(cmp)> candidate_priority_queue(cmp);
+    for(size_t i = 0; i < num_boxes; ++i)
+    {
+        if(scores_data[i] > score_threshold)
+        {
+            candidate_priority_queue.emplace(static_cast<int>(i), scores_data[i]);
+        }
+    }
+
+    std::vector<int> selected;
+    selected.reserve(output_size);
+    while(selected.size() < output_size && !candidate_priority_queue.empty())
+    {
+        const int candidate = candidate_priority_queue.top().first;
+        candidate_priority_queue.pop();
+
+        // Newest kept boxes are checked first; any overlap above the threshold suppresses the candidate
+        const bool suppressed = std::any_of(selected.rbegin(), selected.rend(), [&](int kept)
+        {
+            return iou_greater_than_threshold(bboxes, candidate, kept, nms_threshold);
+        });
+        if(!suppressed)
+        {
+            selected.push_back(candidate);
+        }
+    }
+    std::copy_n(selected.begin(), selected.size(), indices.data());
+    return indices;
+}
+
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/reference/NonMaxSuppression.h b/tests/validation/reference/NonMaxSuppression.h
new file mode 100644
index 0000000..0418412
--- /dev/null
+++ b/tests/validation/reference/NonMaxSuppression.h
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_H__
+#define __ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_H__
+
+#include "tests/SimpleTensor.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace reference
+{
+/** Reference NMS: writes the kept box indices to the start of @p indices (selection order) and returns that tensor. */
+SimpleTensor<int> non_max_suppression(const SimpleTensor<float> &bboxes, const SimpleTensor<float> &scores, SimpleTensor<int> &indices,
+                                      unsigned int max_output_size, float score_threshold, float nms_threshold);
+} // namespace reference
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_TEST_NON_MAX_SUPPRESSION_H__ */