COMPMID-2635: Add support for QASYMM8 in CPPBoxWithNonMaximaSuppressionLimit

Change-Id: Ife35cf865e6573ff7f921eb0b39af89dbf0f5dda
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/1873
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Marquez <pablo.tello@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
index 8a886a4..4857f74 100644
--- a/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
+++ b/arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,26 +24,37 @@
 #ifndef __ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMIT_H__
 #define __ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMIT_H__
 
-#include "arm_compute/runtime/CPP/ICPPSimpleFunction.h"
-
+#include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
+#include "arm_compute/runtime/MemoryGroup.h"
+#include "arm_compute/runtime/Tensor.h"
 
 namespace arm_compute
 {
 class ITensor;
 
 /** Basic function to run @ref CPPBoxWithNonMaximaSuppressionLimitKernel */
-class CPPBoxWithNonMaximaSuppressionLimit : public ICPPSimpleFunction
+class CPPBoxWithNonMaximaSuppressionLimit : public IFunction
 {
 public:
+    /** Constructor */
+    CPPBoxWithNonMaximaSuppressionLimit(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CPPBoxWithNonMaximaSuppressionLimit(const CPPBoxWithNonMaximaSuppressionLimit &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CPPBoxWithNonMaximaSuppressionLimit &operator=(const CPPBoxWithNonMaximaSuppressionLimit &) = delete;
     /** Configure the BoxWithNonMaximaSuppressionLimit CPP kernel
      *
-     * @param[in]  scores_in        The scores input tensor of size [count, num_classes]. Data types supported: F16/F32
-     * @param[in]  boxes_in         The boxes input tensor of size [count, num_classes * 4]. Data types supported: Same as @p scores_in
+     * @param[in]  scores_in        The scores input tensor of size [count, num_classes]. Data types supported: QASYMM8/F16/F32
+     * @param[in]  boxes_in         The boxes input tensor of size [count, num_classes * 4].
+     *                              Data types supported: QASYMM16 with 0.125 scale and 0 offset if @p scores_in is QASYMM8, otherwise same as @p scores_in
      * @param[in]  batch_splits_in  The batch splits input tensor of size [batch_size]. Data types supported: Same as @p scores_in
      *                              @note Can be a nullptr. If not a nullptr, @p scores_in and @p boxes_in have items from multiple images.
      * @param[out] scores_out       The scores output tensor of size [N]. Data types supported: Same as @p scores_in
-     * @param[out] boxes_out        The boxes output tensor of size [N, 4]. Data types supported: Same as @p scores_in
+     * @param[out] boxes_out        The boxes output tensor of size [N, 4].
+     *                              Data types supported: QASYMM16 with 0.125 scale and 0 offset if @p scores_in is QASYMM8, otherwise same as @p scores_in
      * @param[out] classes          The classes output tensor of size [N]. Data types supported: Same as @p scores_in
      * @param[out] batch_splits_out (Optional) The batch splits output tensor. Data types supported: Same as @p scores_in
      * @param[out] keeps            (Optional) The keeps output tensor of size [N]. Data types supported: Same as @p scores_in
@@ -52,6 +63,56 @@
      */
     void configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
                    ITensor *batch_splits_out = nullptr, ITensor *keeps = nullptr, ITensor *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref CPPBoxWithNonMaximaSuppressionLimit
+     *
+     * @param[in] scores_in        The scores input tensor of size [count, num_classes]. Data types supported: QASYMM8/F16/F32
+     * @param[in] boxes_in         The boxes input tensor of size [count, num_classes * 4].
+     *                             Data types supported: QASYMM16 with 0.125 scale and 0 offset if @p scores_in is QASYMM8, otherwise same as @p scores_in
+     * @param[in] batch_splits_in  The batch splits input tensor of size [batch_size]. Data types supported: Same as @p scores_in
+     *                             @note Can be a nullptr. If not a nullptr, @p scores_in and @p boxes_in have items from multiple images.
+     * @param[in] scores_out       The scores output tensor of size [N]. Data types supported: Same as @p scores_in
+     * @param[in] boxes_out        The boxes output tensor of size [N, 4].
+     *                             Data types supported: QASYMM16 with 0.125 scale and 0 offset if @p scores_in is QASYMM8, otherwise same as @p scores_in
+     * @param[in] classes          The classes output tensor of size [N]. Data types supported: Same as @p scores_in
+     * @param[in] batch_splits_out (Optional) The batch splits output tensor. Data types supported: Same as @p scores_in
+     * @param[in] keeps            (Optional) The keeps output tensor of size [N]. Data types supported: Same as @p scores_in
+     * @param[in] keeps_size       (Optional) Number of filtered indices per class tensor of size [num_classes]. Data types supported: Same as @p scores_in
+     * @param[in] info             (Optional) BoxNMSLimitInfo information.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out,
+                           const ITensorInfo *classes,
+                           const ITensorInfo *batch_splits_out = nullptr, const ITensorInfo *keeps = nullptr, const ITensorInfo *keeps_size = nullptr, const BoxNMSLimitInfo info = BoxNMSLimitInfo());
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    MemoryGroup _memory_group;
+
+    CPPBoxWithNonMaximaSuppressionLimitKernel _box_with_nms_limit_kernel;
+
+    const ITensor *_scores_in;
+    const ITensor *_boxes_in;
+    const ITensor *_batch_splits_in;
+    ITensor       *_scores_out;
+    ITensor       *_boxes_out;
+    ITensor       *_classes;
+    ITensor       *_batch_splits_out;
+    ITensor       *_keeps;
+    ITensor       *_keeps_size;
+
+    Tensor _scores_in_f32;
+    Tensor _boxes_in_f32;
+    Tensor _batch_splits_in_f32;
+    Tensor _scores_out_f32;
+    Tensor _boxes_out_f32;
+    Tensor _classes_f32;
+    Tensor _batch_splits_out_f32;
+    Tensor _keeps_f32;
+    Tensor _keeps_size_f32;
+
+    bool _is_qasymm8;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CPPBOXWITHNONMAXIMASUPPRESSIONLIMIT_H__ */
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
index 02150ff..62568b4 100644
--- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -351,6 +351,7 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, boxes_in, scores_out);
     const unsigned int num_classes = scores_in->info()->dimension(0);
 
     ARM_COMPUTE_UNUSED(num_classes);
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
index 2e10152..158f45a 100644
--- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,14 +24,226 @@
 #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
 
 #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
-#include "support/ToolchainSupport.h"
+#include "arm_compute/runtime/Scheduler.h"
 
-using namespace arm_compute;
+namespace arm_compute
+{
+namespace
+{
+void dequantize_tensor(const ITensor *input, ITensor *output, DataType data_type)
+{
+    const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
+
+    Window window;
+    window.use_tensor_dimensions(input->info()->tensor_shape());
+    Iterator input_it(input, window);
+    Iterator output_it(output, window);
+
+    switch(data_type)
+    {
+        case DataType::QASYMM8:
+            execute_window_loop(window, [&](const Coordinates &)
+            {
+                *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
+            },
+            input_it, output_it);
+            break;
+        case DataType::QASYMM16:
+            execute_window_loop(window, [&](const Coordinates &)
+            {
+                *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
+            },
+            input_it, output_it);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type");
+    }
+}
+
+void quantize_tensor(const ITensor *input, ITensor *output, DataType data_type)
+{
+    const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
+
+    Window window;
+    window.use_tensor_dimensions(input->info()->tensor_shape());
+    Iterator input_it(input, window);
+    Iterator output_it(output, window);
+
+    switch(data_type)
+    {
+        case DataType::QASYMM8:
+            execute_window_loop(window, [&](const Coordinates &)
+            {
+                *reinterpret_cast<uint8_t *>(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+            },
+            input_it, output_it);
+            break;
+        case DataType::QASYMM16:
+            execute_window_loop(window, [&](const Coordinates &)
+            {
+                *reinterpret_cast<uint16_t *>(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+            },
+            input_it, output_it);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported data type");
+    }
+}
+} // namespace
+
+CPPBoxWithNonMaximaSuppressionLimit::CPPBoxWithNonMaximaSuppressionLimit(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)),
+      _box_with_nms_limit_kernel(),
+      _scores_in(),
+      _boxes_in(),
+      _batch_splits_in(),
+      _scores_out(),
+      _boxes_out(),
+      _classes(),
+      _batch_splits_out(),
+      _keeps(),
+      _keeps_size(),
+      _scores_in_f32(),
+      _boxes_in_f32(),
+      _batch_splits_in_f32(),
+      _scores_out_f32(),
+      _boxes_out_f32(),
+      _classes_f32(),
+      _batch_splits_out_f32(),
+      _keeps_f32(),
+      _keeps_size_f32(),
+      _is_qasymm8(false)
+{
+}
 
 void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
                                                     ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
 {
-    auto k = arm_compute::support::cpp14::make_unique<CPPBoxWithNonMaximaSuppressionLimitKernel>();
-    k->configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
-    _kernel = std::move(k);
-}
\ No newline at end of file
+    ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes);
+
+    _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8;
+
+    _scores_in        = scores_in;
+    _boxes_in         = boxes_in;
+    _batch_splits_in  = batch_splits_in;
+    _scores_out       = scores_out;
+    _boxes_out        = boxes_out;
+    _classes          = classes;
+    _batch_splits_out = batch_splits_out;
+    _keeps            = keeps;
+    _keeps_size       = keeps_size;
+
+    if(_is_qasymm8)
+    {
+        // Manage intermediate buffers
+        _memory_group.manage(&_scores_in_f32);
+        _memory_group.manage(&_boxes_in_f32);
+        _memory_group.manage(&_batch_splits_in_f32);
+        _memory_group.manage(&_scores_out_f32);
+        _memory_group.manage(&_boxes_out_f32);
+        _memory_group.manage(&_classes_f32);
+        _scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32));
+        _boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32));
+        _batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32));
+        _scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32));
+        _boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32));
+        _classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32));
+        if(batch_splits_out != nullptr)
+        {
+            _memory_group.manage(&_batch_splits_out_f32);
+            _batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32));
+        }
+        if(keeps != nullptr)
+        {
+            _memory_group.manage(&_keeps_f32);
+            _keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32));
+        }
+        if(keeps_size != nullptr)
+        {
+            _memory_group.manage(&_keeps_size_f32);
+            _keeps_size_f32.allocator()->init(keeps_size->info()->clone()->set_data_type(DataType::F32));
+        }
+
+        _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, &_batch_splits_in_f32, &_scores_out_f32, &_boxes_out_f32, &_classes_f32,
+                                             (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr,
+                                             (keeps_size != nullptr) ? &_keeps_size_f32 : nullptr, info);
+    }
+    else
+    {
+        _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
+    }
+
+    if(_is_qasymm8)
+    {
+        _scores_in_f32.allocator()->allocate();
+        _boxes_in_f32.allocator()->allocate();
+        _batch_splits_in_f32.allocator()->allocate();
+        _scores_out_f32.allocator()->allocate();
+        _boxes_out_f32.allocator()->allocate();
+        _classes_f32.allocator()->allocate();
+        if(batch_splits_out != nullptr)
+        {
+            _batch_splits_out_f32.allocator()->allocate();
+        }
+        if(keeps != nullptr)
+        {
+            _keeps_f32.allocator()->allocate();
+        }
+        if(keeps_size != nullptr)
+        {
+            _keeps_size_f32.allocator()->allocate();
+        }
+    }
+}
+
+Status CPPBoxWithNonMaximaSuppressionLimit::validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes,
+                const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+
+    const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8;
+    if(is_qasymm8)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(boxes_in, boxes_out);
+        const UniformQuantizationInfo boxes_qinfo = boxes_in->quantization_info().uniform();
+        ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
+        ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.offset != 0);
+    }
+
+    return Status{};
+}
+
+void CPPBoxWithNonMaximaSuppressionLimit::run()
+{
+    if(_is_qasymm8)
+    {
+        dequantize_tensor(_scores_in, &_scores_in_f32, _scores_in->info()->data_type());
+        dequantize_tensor(_boxes_in, &_boxes_in_f32, _boxes_in->info()->data_type());
+        dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32, _batch_splits_in->info()->data_type());
+    }
+
+    Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY);
+
+    if(_is_qasymm8)
+    {
+        quantize_tensor(&_scores_out_f32, _scores_out, _scores_out->info()->data_type());
+        quantize_tensor(&_boxes_out_f32, _boxes_out, _boxes_out->info()->data_type());
+        quantize_tensor(&_classes_f32, _classes, _classes->info()->data_type());
+        if(_batch_splits_out != nullptr)
+        {
+            quantize_tensor(&_batch_splits_out_f32, _batch_splits_out, _batch_splits_out->info()->data_type());
+        }
+        if(_keeps != nullptr)
+        {
+            quantize_tensor(&_keeps_f32, _keeps, _keeps->info()->data_type());
+        }
+        if(_keeps_size != nullptr)
+        {
+            quantize_tensor(&_keeps_size_f32, _keeps_size, _keeps_size->info()->data_type());
+        }
+    }
+}
+} // namespace arm_compute