COMPMID-1328 Add support for BoxWithNMSLimit operator in CPP

Change-Id: I5aae537372bf797fbb2a2bae81038f8963b041a9
diff --git a/arm_compute/core/CPP/CPPKernels.h b/arm_compute/core/CPP/CPPKernels.h
index bf24a94..39b77cd 100644
--- a/arm_compute/core/CPP/CPPKernels.h
+++ b/arm_compute/core/CPP/CPPKernels.h
@@ -25,6 +25,7 @@
 #define __ARM_COMPUTE_CPPKERNELS_H__
 
 /* Header regrouping all the CPP kernels */
+#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPDetectionWindowNonMaximaSuppressionKernel.h"
 #include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
diff --git a/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
new file mode 100644
index 0000000..a1adca3
--- /dev/null
+++ b/arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H__
+#define __ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/IHOG.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** CPP kernel to perform computation of BoxWithNonMaximaSuppressionLimit */
+class CPPBoxWithNonMaximaSuppressionLimitKernel : public ICPPKernel
+{
+public:
+    const char *name() const override
+    {
+        return "CPPBoxWithNonMaximaSuppressionLimitKernel";
+    }
+    /** Default constructor */
+    CPPBoxWithNonMaximaSuppressionLimitKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CPPBoxWithNonMaximaSuppressionLimitKernel(const CPPBoxWithNonMaximaSuppressionLimitKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CPPBoxWithNonMaximaSuppressionLimitKernel &operator=(const CPPBoxWithNonMaximaSuppressionLimitKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    CPPBoxWithNonMaximaSuppressionLimitKernel(CPPBoxWithNonMaximaSuppressionLimitKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    CPPBoxWithNonMaximaSuppressionLimitKernel &operator=(CPPBoxWithNonMaximaSuppressionLimitKernel &&) = default;
+    /** Initialise the kernel's input and output tensors.
+     *
+     * @param[in]  scores_in        The scores input tensor of size [num_classes, count]. Data types supported: F16/F32
+     * @param[in]  boxes_in         The boxes input tensor of size [num_classes * 4, count]. Data types supported: Same as @p scores_in
+     * @param[in]  batch_splits_in  The batch splits input tensor of size [batch_size]. Data types supported: Same as @p scores_in
+     *                              @note Can be a nullptr. If not a nullptr, @p scores_in and @p boxes_in have items from multiple images.
+     * @param[out] scores_out       The scores output tensor of size [N]. Data types supported: Same as @p scores_in
+     * @param[out] boxes_out        The boxes output tensor of size [4, N]. Data types supported: Same as @p scores_in
+     * @param[out] classes          The classes output tensor of size [N]. Data types supported: Same as @p scores_in
+     * @param[out] batch_splits_out (Optional) The batch splits output tensor [batch_size]. Data types supported: Same as @p scores_in
+     * @param[out] keeps            (Optional) The keeps output tensor of size [N]. Data types supported: Same as@p scores_in
+     * @param[out] keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in
+     * @param[in]  info             (Optional) BoxNMSLimitInfo information.
+     */
+    void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
+                   ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+    bool is_parallelisable() const override;
+
+    template <typename T>
+    void run_nmslimit();
+
+private:
+    const ITensor *_scores_in;
+    const ITensor *_boxes_in;
+    const ITensor *_batch_splits_in;
+    ITensor        *_scores_out;
+    ITensor        *_boxes_out;
+    ITensor        *_classes;
+    ITensor        *_batch_splits_out;
+    ITensor        *_keeps;
+    ITensor        *_keeps_size;
+    BoxNMSLimitInfo _info;
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMITKERNEL_H__ */
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 134b8e2..cb1a212 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -574,6 +574,82 @@
     L2   /**< L2 Pooling */
 };
 
+/** Available non maxima suppression types */
+enum class NMSType
+{
+    LINEAR,   /**< Linear NMS */
+    GAUSSIAN, /**< Gaussian NMS */
+    ORIGINAL  /**< Original NMS */
+};
+
+/** BoxWithNonMaximaSuppressionLimit Information class */
+class BoxNMSLimitInfo final
+{
+public:
+    /** Constructor
+     *
+     * @param[in] score_thresh             (Optional) Score threshold.
+     * @param[in] nms                      (Optional) NMS value
+     * @param[in] detections               (Optional) Number of detections
+     * @param[in] soft_nms_enabled         (Optional) Enable SoftNMS
+     * @param[in] soft_nms_method          (Optional) Soft NMS method
+     * @param[in] soft_nms_sigma           (Optional) Soft NMS sigma value
+     * @param[in] soft_nms_min_score_thres (Optional) Soft NMS minimum score threshold
+     */
+    BoxNMSLimitInfo(float score_thresh = 0.05f, float nms = 0.3f,
+                    int detections = 100, bool soft_nms_enabled = false,
+                    NMSType soft_nms_method = NMSType::LINEAR,
+                    float soft_nms_sigma = 0.5f, float soft_nms_min_score_thres = 0.001f)
+        : _score_thresh(score_thresh), _nms(nms), _detections_per_im(detections), _soft_nms_enabled(soft_nms_enabled), _soft_nms_method(soft_nms_method), _soft_nms_sigma(soft_nms_sigma),
+          _soft_nms_min_score_thres(soft_nms_min_score_thres)
+    {
+    }
+    /** Get the score threshold */
+    float score_thresh() const
+    {
+        return _score_thresh;
+    }
+    /** Get the NMS */
+    float nms() const
+    {
+        return _nms;
+    }
+    /** Get the number of detections */
+    int detections_per_im() const
+    {
+        return _detections_per_im;
+    }
+    /** Check if soft NMS is enabled */
+    bool soft_nms_enabled() const
+    {
+        return _soft_nms_enabled;
+    }
+    /** Get soft NMS method */
+    NMSType soft_nms_method() const
+    {
+        return _soft_nms_method;
+    }
+    /** Get soft NMS sigma */
+    float soft_nms_sigma() const
+    {
+        return _soft_nms_sigma;
+    }
+    /** Get soft nms min score threshold */
+    float soft_nms_min_score_thres() const
+    {
+        return _soft_nms_min_score_thres;
+    }
+
+private:
+    float   _score_thresh;
+    float   _nms;
+    int     _detections_per_im;
+    bool    _soft_nms_enabled;
+    NMSType _soft_nms_method;
+    float   _soft_nms_sigma;
+    float   _soft_nms_min_score_thres;
+};
+
 /** Padding and stride information class */
 class PadStrideInfo
 {
diff --git a/arm_compute/runtime/CPP/CPPFunctions.h b/arm_compute/runtime/CPP/CPPFunctions.h
index 4b7e1af..9c17d9e 100644
--- a/arm_compute/runtime/CPP/CPPFunctions.h
+++ b/arm_compute/runtime/CPP/CPPFunctions.h
@@ -25,6 +25,7 @@
 #define __ARM_COMPUTE_CPPFUNCTIONS_H__
 
 /* Header regrouping all the CPP functions */
+#include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
 #include "arm_compute/runtime/CPP/functions/CPPPermute.h"
 #include "arm_compute/runtime/CPP/functions/CPPUpsample.h"
 
diff --git a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
new file mode 100644
index 0000000..8a886a4
--- /dev/null
+++ b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMIT_H__
+#define __ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMIT_H__
+
+#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
+
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Basic function to run @ref CPPBoxWithNonMaximaSuppressionLimitKernel */
+class CPPBoxWithNonMaximaSuppressionLimit : public ICPPSimpleFunction
+{
+public:
+    /** Configure the BoxWithNonMaximaSuppressionLimit CPP kernel
+     *
+     * @param[in]  scores_in        The scores input tensor of size [count, num_classes]. Data types supported: F16/F32
+     * @param[in]  boxes_in         The boxes input tensor of size [count, num_classes * 4]. Data types supported: Same as @p scores_in
+     * @param[in]  batch_splits_in  The batch splits input tensor of size [batch_size]. Data types supported: Same as @p scores_in
+     *                              @note Can be a nullptr. If not a nullptr, @p scores_in and @p boxes_in have items from multiple images.
+     * @param[out] scores_out       The scores output tensor of size [N]. Data types supported: Same as @p scores_in
+     * @param[out] boxes_out        The boxes output tensor of size [N, 4]. Data types supported: Same as @p scores_in
+     * @param[out] classes          The classes output tensor of size [N]. Data types supported: Same as @p scores_in
+     * @param[out] batch_splits_out (Optional) The batch splits output tensor. Data types supported: Same as @p scores_in
+     * @param[out] keeps            (Optional) The keeps output tensor of size [N]. Data types supported: Same as @p scores_in
+     * @param[in]  keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in
+     * @param[in]  info             (Optional) BoxNMSLimitInfo information.
+     */
+    void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
+                   ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMIT_H__ */
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
new file mode 100644
index 0000000..89413fc
--- /dev/null
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -0,0 +1,409 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+
+#include <algorithm>
+#include <cmath>
+
+namespace arm_compute
+{
+namespace
+{
+template <typename T>
+std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id)
+{
+    std::vector<int> keep;
+    const int        proposals_width = proposals->info()->dimension(1);
+
+    std::vector<T> x1(proposals_width);
+    std::vector<T> y1(proposals_width);
+    std::vector<T> x2(proposals_width);
+    std::vector<T> y2(proposals_width);
+    std::vector<T> areas(proposals_width);
+
+    for(int i = 0; i < proposals_width; ++i)
+    {
+        x1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
+        y1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
+        x2[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 2, i)));
+        y2[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 3, i)));
+        areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
+    }
+
+    // Note: Soft NMS scores have already been initialize with input scores
+
+    while(!inds.empty())
+    {
+        // Find proposal with max score among remaining proposals
+        int max_pos = 0;
+        for(unsigned int i = 1; i < inds.size(); ++i)
+        {
+            if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
+            {
+                max_pos = i;
+            }
+        }
+        int element = inds.at(max_pos);
+        keep.push_back(element);
+        std::swap(inds.at(0), inds.at(max_pos));
+
+        // Remove first element and compute IoU of the remaining boxes with identified max box
+        inds.erase(inds.begin());
+
+        std::vector<int> sorted_indices_temp;
+        for(auto idx : inds)
+        {
+            const auto xx1 = std::max(x1[idx], x1[element]);
+            const auto yy1 = std::max(y1[idx], y1[element]);
+            const auto xx2 = std::min(x2[idx], x2[element]);
+            const auto yy2 = std::min(y2[idx], y2[element]);
+
+            const auto w     = std::max((xx2 - xx1 + 1.f), 0.f);
+            const auto h     = std::max((yy2 - yy1 + 1.f), 0.f);
+            const auto inter = w * h;
+            const auto ovr   = inter / (areas[element] + areas[idx] - inter);
+
+            // Update scores based on computed IoU, overlap threshold and NMS method
+            T weight;
+            switch(info.soft_nms_method())
+            {
+                case NMSType::LINEAR:
+                    weight = (ovr > info.nms()) ? (1.f - ovr) : 1.f;
+                    break;
+                case NMSType::GAUSSIAN: // Gaussian
+                    weight = std::exp(-1.f * ovr * ovr / info.soft_nms_sigma());
+                    break;
+                case NMSType::ORIGINAL: // Original NMS
+                    weight = (ovr > info.nms()) ? 0.f : 1.f;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+
+            // Discard boxes with new scores below min threshold and update pending indices
+            scores_in[class_id][idx] *= weight;
+            if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
+            {
+                sorted_indices_temp.push_back(idx);
+            }
+        }
+        inds = sorted_indices_temp;
+    }
+
+    return keep;
+}
+
+template <typename T>
+std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id)
+{
+    std::vector<int> keep;
+
+    const int proposals_width = proposals->info()->dimension(1);
+
+    std::vector<T> x1(proposals_width);
+    std::vector<T> y1(proposals_width);
+    std::vector<T> x2(proposals_width);
+    std::vector<T> y2(proposals_width);
+    std::vector<T> areas(proposals_width);
+
+    for(int i = 0; i < proposals_width; ++i)
+    {
+        x1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
+        y1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
+        x2[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 2, i)));
+        y2[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 3, i)));
+        areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
+    }
+
+    while(!sorted_indices.empty())
+    {
+        int i = sorted_indices.at(0);
+        keep.push_back(i);
+
+        std::vector<int> sorted_indices_temp = sorted_indices;
+        std::vector<int> new_indices;
+        sorted_indices_temp.erase(sorted_indices_temp.begin());
+
+        for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
+        {
+            const auto xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]);
+            const auto yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]);
+            const auto xx2 = std::min(x2[sorted_indices_temp.at(j)], x2[i]);
+            const auto yy2 = std::min(y2[sorted_indices_temp.at(j)], y2[i]);
+
+            const auto w     = std::max((xx2 - xx1 + 1.f), 0.f);
+            const auto h     = std::max((yy2 - yy1 + 1.f), 0.f);
+            const auto inter = w * h;
+            const auto ovr   = inter / (areas[i] + areas[sorted_indices_temp.at(j)] - inter);
+
+            if(ovr <= info.nms())
+            {
+                new_indices.push_back(j);
+            }
+        }
+
+        const unsigned int new_indices_size = new_indices.size();
+        std::vector<int>   new_sorted_indices(new_indices_size);
+        for(unsigned int i = 0; i < new_indices_size; ++i)
+        {
+            new_sorted_indices[i] = sorted_indices[new_indices[i] + 1];
+        }
+        sorted_indices = new_sorted_indices;
+    }
+
+    return keep;
+}
+} // namespace
+
+CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel()
+    : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr),
+      _info()
+{
+}
+
+bool CPPBoxWithNonMaximaSuppressionLimitKernel::is_parallelisable() const
+{
+    return false;
+}
+
+template <typename T>
+void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
+{
+    const int                     batch_size   = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
+    const int                     num_classes  = _scores_in->info()->dimension(0);
+    const int                     scores_count = _scores_in->info()->dimension(1);
+    std::vector<int>              total_keep_per_batch(batch_size);
+    std::vector<std::vector<int>> keeps(num_classes);
+    int                           total_keep_count = 0;
+
+    std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count));
+    for(int i = 0; i < scores_count; ++i)
+    {
+        for(int j = 0; j < num_classes; ++j)
+        {
+            in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i)));
+        }
+    }
+
+    int offset        = 0;
+    int cur_start_idx = 0;
+    for(int b = 0; b < batch_size; ++b)
+    {
+        const int num_boxes = _batch_splits_in == nullptr ? 1 : static_cast<int>(*reinterpret_cast<T *>(_batch_splits_in->ptr_to_element(Coordinates(b))));
+        // Skip first class
+        for(int j = 1; j < num_classes; ++j)
+        {
+            std::vector<T>   cur_scores(scores_count);
+            std::vector<int> inds;
+            for(int i = 0; i < scores_count; ++i)
+            {
+                const T score = in_scores[j][i];
+                cur_scores[i] = score;
+
+                if(score > _info.score_thresh())
+                {
+                    inds.push_back(i);
+                }
+            }
+            if(_info.soft_nms_enabled())
+            {
+                keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j);
+            }
+            else
+            {
+                std::sort(inds.data(), inds.data() + inds.size(),
+                          [&cur_scores](int lhs, int rhs)
+                {
+                    return cur_scores[lhs] > cur_scores[rhs];
+                });
+
+                keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j);
+            }
+            total_keep_count += keeps[j].size();
+        }
+
+        if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
+        {
+            // merge all scores (represented by indices) together and sort
+            auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]()
+            {
+                std::vector<T> ret(total_keep_count);
+
+                int ret_idx = 0;
+                for(unsigned int i = 1; i < keeps.size(); ++i)
+                {
+                    auto &cur_keep = keeps[i];
+                    for(auto &ckv : cur_keep)
+                    {
+                        ret[ret_idx++] = in_scores[i][ckv];
+                    }
+                }
+
+                std::sort(ret.data(), ret.data() + ret.size());
+
+                return ret;
+            };
+
+            auto    all_scores_sorted = get_all_scores_sorted();
+            const T image_thresh      = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()];
+            for(int j = 1; j < num_classes; ++j)
+            {
+                auto            &cur_keep = keeps[j];
+                std::vector<int> new_keeps_j;
+                for(auto &k : cur_keep)
+                {
+                    if(in_scores[j][k] >= image_thresh)
+                    {
+                        new_keeps_j.push_back(k);
+                    }
+                }
+                keeps[j] = new_keeps_j;
+            }
+            total_keep_count = _info.detections_per_im();
+        }
+
+        total_keep_per_batch[b] = total_keep_count;
+
+        // Write results
+        int cur_out_idx = 0;
+        for(int j = 1; j < num_classes; ++j)
+        {
+            auto     &cur_keep        = keeps[j];
+            auto      cur_out_scores  = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+            auto      cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+            const int box_column      = (cur_start_idx + cur_out_idx) * 4;
+
+            for(unsigned int k = 0; k < cur_keep.size(); ++k)
+            {
+                cur_out_scores[k]     = in_scores[j][cur_keep[k]];
+                cur_out_classes[k]    = static_cast<T>(j);
+                auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
+                auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
+                auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
+                auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
+                *cur_out_box_row0     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
+                *cur_out_box_row1     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
+                *cur_out_box_row2     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
+                *cur_out_box_row3     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
+            }
+
+            cur_out_idx += cur_keep.size();
+        }
+
+        if(_keeps != nullptr)
+        {
+            cur_out_idx = 0;
+            for(int j = 0; j < num_classes; ++j)
+            {
+                for(unsigned int i = 0; i < keeps[j].size(); ++i)
+                {
+                    *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i));
+                }
+                *reinterpret_cast<T *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = static_cast<T>(keeps[j].size());
+                cur_out_idx += keeps[j].size();
+            }
+        }
+
+        offset += num_boxes;
+        cur_start_idx += total_keep_count;
+    }
+
+    if(_batch_splits_out != nullptr)
+    {
+        for(int b = 0; b < batch_size; ++b)
+        {
+            *reinterpret_cast<float *>(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b];
+        }
+    }
+}
+
+void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
+                                                          ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32);
+    const unsigned int num_classes = scores_in->info()->dimension(0);
+
+    ARM_COMPUTE_UNUSED(num_classes);
+    ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes");
+    ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows");
+    ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1));
+    ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4);
+    if(keeps != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output");
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps_size);
+        ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0));
+        ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0));
+    }
+    if(batch_splits_in != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in);
+    }
+    if(batch_splits_out != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out);
+    }
+
+    _scores_in        = scores_in;
+    _boxes_in         = boxes_in;
+    _batch_splits_in  = batch_splits_in;
+    _scores_out       = scores_out;
+    _boxes_out        = boxes_out;
+    _classes          = classes;
+    _batch_splits_out = batch_splits_out;
+    _keeps            = keeps;
+    _keeps_size       = keeps_size;
+    _info             = info;
+
+    // Configure kernel window
+    Window win = calculate_max_window(*scores_in->info(), Steps(scores_in->info()->dimension(0)));
+
+    IKernel::configure(win);
+}
+
+void CPPBoxWithNonMaximaSuppressionLimitKernel::run(const Window &window, const ThreadInfo &info)
+{
+    ARM_COMPUTE_UNUSED(info);
+    ARM_COMPUTE_UNUSED(window);
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
+
+    switch(_scores_in->info()->data_type())
+    {
+        case DataType::F32:
+            run_nmslimit<float>();
+            break;
+        case DataType::F16:
+            run_nmslimit<half>();
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Not supported");
+    }
+}
+} // namespace arm_compute
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
new file mode 100644
index 0000000..2e10152
--- /dev/null
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
+
+#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
+#include "support/ToolchainSupport.h"
+
+using namespace arm_compute;
+
+void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
+                                                    ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+{
+    auto k = arm_compute::support::cpp14::make_unique<CPPBoxWithNonMaximaSuppressionLimitKernel>();
+    k->configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
+    _kernel = std::move(k);
+}
\ No newline at end of file