IVGCVSW-4227 Add CpuAcc backend support for DetectionPostProcess

Change-Id: I318bf92b8d1db593d9c30b9b4412bfecbe65bc12
Signed-off-by: Derek Lamberti <derek.lamberti@arm.com>
diff --git a/src/backends/neon/NeonLayerSupport.cpp b/src/backends/neon/NeonLayerSupport.cpp
index 20b6550..3fc3233 100644
--- a/src/backends/neon/NeonLayerSupport.cpp
+++ b/src/backends/neon/NeonLayerSupport.cpp
@@ -28,6 +28,7 @@
 #include "workloads/NeonDepthToSpaceWorkload.hpp"
 #include "workloads/NeonDepthwiseConvolutionWorkload.hpp"
 #include "workloads/NeonDequantizeWorkload.hpp"
+#include "workloads/NeonDetectionPostProcessWorkload.hpp"
 #include "workloads/NeonGreaterWorkload.hpp"
 #include "workloads/NeonInstanceNormalizationWorkload.hpp"
 #include "workloads/NeonL2NormalizationFloatWorkload.hpp"
@@ -339,6 +340,28 @@
                                    output);
 }
 
+bool NeonLayerSupport::IsDetectionPostProcessSupported(const TensorInfo& boxEncodings,
+                                                       const TensorInfo& scores,
+                                                       const TensorInfo& anchors,
+                                                       const TensorInfo& detectionBoxes,
+                                                       const TensorInfo& detectionClasses,
+                                                       const TensorInfo& detectionScores,
+                                                       const TensorInfo& numDetections,
+                                                       const DetectionPostProcessDescriptor& descriptor,
+                                                       Optional<std::string&> reasonIfUnsupported) const
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(NeonDetectionPostProcessValidate,
+                                   reasonIfUnsupported,
+                                   boxEncodings,
+                                   scores,
+                                   anchors,
+                                   detectionBoxes,
+                                   detectionClasses,
+                                   detectionScores,
+                                   numDetections,
+                                   descriptor);
+}
+
 bool NeonLayerSupport::IsDilatedDepthwiseConvolutionSupported(const TensorInfo& input,
                                                               const TensorInfo& output,
                                                               const DepthwiseConvolution2dDescriptor& descriptor,
diff --git a/src/backends/neon/NeonLayerSupport.hpp b/src/backends/neon/NeonLayerSupport.hpp
index 5d4fbad..8e6cd6a 100644
--- a/src/backends/neon/NeonLayerSupport.hpp
+++ b/src/backends/neon/NeonLayerSupport.hpp
@@ -86,6 +86,16 @@
                                const TensorInfo& output,
                                Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsDetectionPostProcessSupported(const TensorInfo& boxEncodings,
+                                         const TensorInfo& scores,
+                                         const TensorInfo& anchors,
+                                         const TensorInfo& detectionBoxes,
+                                         const TensorInfo& detectionClasses,
+                                         const TensorInfo& detectionScores,
+                                         const TensorInfo& numDetections,
+                                         const DetectionPostProcessDescriptor& descriptor,
+                                         Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsDilatedDepthwiseConvolutionSupported(const TensorInfo& input,
                                                 const TensorInfo& output,
                                                 const DepthwiseConvolution2dDescriptor& descriptor,
diff --git a/src/backends/neon/NeonWorkloadFactory.cpp b/src/backends/neon/NeonWorkloadFactory.cpp
index dd11af4..8d798ec 100644
--- a/src/backends/neon/NeonWorkloadFactory.cpp
+++ b/src/backends/neon/NeonWorkloadFactory.cpp
@@ -205,7 +205,7 @@
 std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDetectionPostProcess(
     const armnn::DetectionPostProcessQueueDescriptor& descriptor, const armnn::WorkloadInfo& info) const
 {
-    return MakeWorkloadHelper<NullWorkload, NullWorkload>(descriptor, info);
+    return std::make_unique<NeonDetectionPostProcessWorkload>(descriptor, info);
 }
 
 std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateDivision(
diff --git a/src/backends/neon/backend.mk b/src/backends/neon/backend.mk
index 2328934..3ddc79a 100644
--- a/src/backends/neon/backend.mk
+++ b/src/backends/neon/backend.mk
@@ -34,6 +34,7 @@
         workloads/NeonDepthToSpaceWorkload.cpp \
         workloads/NeonDepthwiseConvolutionWorkload.cpp \
         workloads/NeonDequantizeWorkload.cpp \
+        workloads/NeonDetectionPostProcessWorkload.cpp \
         workloads/NeonFloorFloatWorkload.cpp \
         workloads/NeonFullyConnectedWorkload.cpp \
         workloads/NeonGreaterWorkload.cpp \
diff --git a/src/backends/neon/test/NeonEndToEndTests.cpp b/src/backends/neon/test/NeonEndToEndTests.cpp
index e841821..2f4c847 100644
--- a/src/backends/neon/test/NeonEndToEndTests.cpp
+++ b/src/backends/neon/test/NeonEndToEndTests.cpp
@@ -11,6 +11,7 @@
 #include <backendsCommon/test/ConcatEndToEndTestImpl.hpp>
 #include <backendsCommon/test/DepthToSpaceEndToEndTestImpl.hpp>
 #include <backendsCommon/test/DequantizeEndToEndTestImpl.hpp>
+#include <backendsCommon/test/DetectionPostProcessEndToEndTestImpl.hpp>
 #include <backendsCommon/test/InstanceNormalizationEndToEndTestImpl.hpp>
 #include <backendsCommon/test/PreluEndToEndTestImpl.hpp>
 #include <backendsCommon/test/QuantizedLstmEndToEndTestImpl.hpp>
@@ -504,4 +505,168 @@
     ArgMinAxis3EndToEnd<armnn::DataType::QuantisedAsymm8>(defaultBackends);
 }
 
+BOOST_AUTO_TEST_CASE(NeonDetectionPostProcessRegularNmsTest)
+{
+    std::vector<float> boxEncodings({
+                                        0.0f, 0.0f, 0.0f, 0.0f,
+                                        0.0f, 1.0f, 0.0f, 0.0f,
+                                        0.0f, -1.0f, 0.0f, 0.0f,
+                                        0.0f, 0.0f, 0.0f, 0.0f,
+                                        0.0f, 1.0f, 0.0f, 0.0f,
+                                        0.0f, 0.0f, 0.0f, 0.0f
+                                    });
+    std::vector<float> scores({
+                                  0.0f, 0.9f, 0.8f,
+                                  0.0f, 0.75f, 0.72f,
+                                  0.0f, 0.6f, 0.5f,
+                                  0.0f, 0.93f, 0.95f,
+                                  0.0f, 0.5f, 0.4f,
+                                  0.0f, 0.3f, 0.2f
+                              });
+    std::vector<float> anchors({
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 10.5f, 1.0f, 1.0f,
+                                   0.5f, 10.5f, 1.0f, 1.0f,
+                                   0.5f, 100.5f, 1.0f, 1.0f
+                               });
+    DetectionPostProcessRegularNmsEndToEnd<armnn::DataType::Float32>(defaultBackends, boxEncodings, scores, anchors);
+}
+
+inline void QuantizeData(uint8_t* quant, const float* dequant, const TensorInfo& info)
+{
+    for (size_t i = 0; i < info.GetNumElements(); i++)
+    {
+        quant[i] = armnn::Quantize<uint8_t>(dequant[i], info.GetQuantizationScale(), info.GetQuantizationOffset());
+    }
+}
+
+BOOST_AUTO_TEST_CASE(NeonDetectionPostProcessRegularNmsUint8Test)
+{
+    armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, armnn::DataType::Float32);
+    armnn::TensorInfo scoresInfo({ 1, 6, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo anchorsInfo({ 6, 4 }, armnn::DataType::Float32);
+
+    boxEncodingsInfo.SetQuantizationScale(1.0f);
+    boxEncodingsInfo.SetQuantizationOffset(1);
+    scoresInfo.SetQuantizationScale(0.01f);
+    scoresInfo.SetQuantizationOffset(0);
+    anchorsInfo.SetQuantizationScale(0.5f);
+    anchorsInfo.SetQuantizationOffset(0);
+
+    std::vector<float> boxEncodings({
+                                        0.0f, 0.0f, 0.0f, 0.0f,
+                                        0.0f, 1.0f, 0.0f, 0.0f,
+                                        0.0f, -1.0f, 0.0f, 0.0f,
+                                        0.0f, 0.0f, 0.0f, 0.0f,
+                                        0.0f, 1.0f, 0.0f, 0.0f,
+                                        0.0f, 0.0f, 0.0f, 0.0f
+                                    });
+    std::vector<float> scores({
+                                  0.0f, 0.9f, 0.8f,
+                                  0.0f, 0.75f, 0.72f,
+                                  0.0f, 0.6f, 0.5f,
+                                  0.0f, 0.93f, 0.95f,
+                                  0.0f, 0.5f, 0.4f,
+                                  0.0f, 0.3f, 0.2f
+                              });
+    std::vector<float> anchors({
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 10.5f, 1.0f, 1.0f,
+                                   0.5f, 10.5f, 1.0f, 1.0f,
+                                   0.5f, 100.5f, 1.0f, 1.0f
+                               });
+
+    std::vector<uint8_t> qBoxEncodings(boxEncodings.size(), 0);
+    std::vector<uint8_t> qScores(scores.size(), 0);
+    std::vector<uint8_t> qAnchors(anchors.size(), 0);
+    QuantizeData(qBoxEncodings.data(), boxEncodings.data(), boxEncodingsInfo);
+    QuantizeData(qScores.data(), scores.data(), scoresInfo);
+    QuantizeData(qAnchors.data(), anchors.data(), anchorsInfo);
+    DetectionPostProcessRegularNmsEndToEnd<armnn::DataType::QuantisedAsymm8>(defaultBackends, qBoxEncodings,
+                                                                             qScores, qAnchors,
+                                                                             1.0f, 1, 0.01f, 0, 0.5f, 0);
+}
+
+BOOST_AUTO_TEST_CASE(NeonDetectionPostProcessFastNmsTest)
+{
+    std::vector<float> boxEncodings({
+                                        0.0f, 0.0f, 0.0f, 0.0f,
+                                        0.0f, 1.0f, 0.0f, 0.0f,
+                                        0.0f, -1.0f, 0.0f, 0.0f,
+                                        0.0f, 0.0f, 0.0f, 0.0f,
+                                        0.0f, 1.0f, 0.0f, 0.0f,
+                                        0.0f, 0.0f, 0.0f, 0.0f
+                                    });
+    std::vector<float> scores({
+                                  0.0f, 0.9f, 0.8f,
+                                  0.0f, 0.75f, 0.72f,
+                                  0.0f, 0.6f, 0.5f,
+                                  0.0f, 0.93f, 0.95f,
+                                  0.0f, 0.5f, 0.4f,
+                                  0.0f, 0.3f, 0.2f
+                              });
+    std::vector<float> anchors({
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 10.5f, 1.0f, 1.0f,
+                                   0.5f, 10.5f, 1.0f, 1.0f,
+                                   0.5f, 100.5f, 1.0f, 1.0f
+                               });
+    DetectionPostProcessFastNmsEndToEnd<armnn::DataType::Float32>(defaultBackends, boxEncodings, scores, anchors);
+}
+
+BOOST_AUTO_TEST_CASE(NeonDetectionPostProcessFastNmsUint8Test)
+{
+    armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, armnn::DataType::Float32);
+    armnn::TensorInfo scoresInfo({ 1, 6, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo anchorsInfo({ 6, 4 }, armnn::DataType::Float32);
+
+    boxEncodingsInfo.SetQuantizationScale(1.0f);
+    boxEncodingsInfo.SetQuantizationOffset(1);
+    scoresInfo.SetQuantizationScale(0.01f);
+    scoresInfo.SetQuantizationOffset(0);
+    anchorsInfo.SetQuantizationScale(0.5f);
+    anchorsInfo.SetQuantizationOffset(0);
+
+    std::vector<float> boxEncodings({
+                                        0.0f, 0.0f, 0.0f, 0.0f,
+                                        0.0f, 1.0f, 0.0f, 0.0f,
+                                        0.0f, -1.0f, 0.0f, 0.0f,
+                                        0.0f, 0.0f, 0.0f, 0.0f,
+                                        0.0f, 1.0f, 0.0f, 0.0f,
+                                        0.0f, 0.0f, 0.0f, 0.0f
+                                    });
+    std::vector<float> scores({
+                                  0.0f, 0.9f, 0.8f,
+                                  0.0f, 0.75f, 0.72f,
+                                  0.0f, 0.6f, 0.5f,
+                                  0.0f, 0.93f, 0.95f,
+                                  0.0f, 0.5f, 0.4f,
+                                  0.0f, 0.3f, 0.2f
+                              });
+    std::vector<float> anchors({
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 0.5f, 1.0f, 1.0f,
+                                   0.5f, 10.5f, 1.0f, 1.0f,
+                                   0.5f, 10.5f, 1.0f, 1.0f,
+                                   0.5f, 100.5f, 1.0f, 1.0f
+                               });
+
+    std::vector<uint8_t> qBoxEncodings(boxEncodings.size(), 0);
+    std::vector<uint8_t> qScores(scores.size(), 0);
+    std::vector<uint8_t> qAnchors(anchors.size(), 0);
+    QuantizeData(qBoxEncodings.data(), boxEncodings.data(), boxEncodingsInfo);
+    QuantizeData(qScores.data(), scores.data(), scoresInfo);
+    QuantizeData(qAnchors.data(), anchors.data(), anchorsInfo);
+    DetectionPostProcessFastNmsEndToEnd<armnn::DataType::QuantisedAsymm8>(defaultBackends, qBoxEncodings,
+                                                                          qScores, qAnchors,
+                                                                          1.0f, 1, 0.01f, 0, 0.5f, 0);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/neon/workloads/CMakeLists.txt b/src/backends/neon/workloads/CMakeLists.txt
index cf3789e..44db6d2 100644
--- a/src/backends/neon/workloads/CMakeLists.txt
+++ b/src/backends/neon/workloads/CMakeLists.txt
@@ -30,6 +30,8 @@
     NeonDepthwiseConvolutionWorkload.hpp
     NeonDequantizeWorkload.cpp
     NeonDequantizeWorkload.hpp
+    NeonDetectionPostProcessWorkload.cpp
+    NeonDetectionPostProcessWorkload.hpp
     NeonFloorFloatWorkload.cpp
     NeonFloorFloatWorkload.hpp
     NeonFullyConnectedWorkload.cpp
diff --git a/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.cpp b/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.cpp
new file mode 100644
index 0000000..481e950
--- /dev/null
+++ b/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.cpp
@@ -0,0 +1,112 @@
+//
+// Copyright © 2019 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "NeonDetectionPostProcessWorkload.hpp"
+
+#include "NeonWorkloadUtils.hpp"
+
+#include <aclCommon/ArmComputeTensorHandle.hpp>
+#include <aclCommon/ArmComputeTensorUtils.hpp>
+
+#include <boost/cast.hpp>
+
+namespace armnn
+{
+
+static arm_compute::DetectionPostProcessLayerInfo MakeInfo(const DetectionPostProcessDescriptor& desc)
+{
+    return arm_compute::DetectionPostProcessLayerInfo(desc.m_MaxDetections,
+                                                      desc.m_MaxClassesPerDetection,
+                                                      desc.m_NmsScoreThreshold,
+                                                      desc.m_NmsIouThreshold,
+                                                      desc.m_NumClasses,
+                                                      { desc.m_ScaleX,
+                                                        desc.m_ScaleY,
+                                                        desc.m_ScaleW,
+                                                        desc.m_ScaleH },
+                                                      desc.m_UseRegularNms,
+                                                      desc.m_DetectionsPerClass);
+}
+
+arm_compute::Status NeonDetectionPostProcessValidate(const TensorInfo& boxEncodings,
+                                                     const TensorInfo& scores,
+                                                     const TensorInfo& anchors,
+                                                     const TensorInfo& detectionBoxes,
+                                                     const TensorInfo& detectionClasses,
+                                                     const TensorInfo& detectionScores,
+                                                     const TensorInfo& numDetections,
+                                                     const DetectionPostProcessDescriptor &desc)
+{
+    arm_compute::DetectionPostProcessLayerInfo info = MakeInfo(desc);
+
+    const arm_compute::TensorInfo aclBoxEncodings =
+        armcomputetensorutils::BuildArmComputeTensorInfo(boxEncodings);
+
+    const arm_compute::TensorInfo aclScores =
+        armcomputetensorutils::BuildArmComputeTensorInfo(scores);
+
+    const arm_compute::TensorInfo aclAnchors =
+        armcomputetensorutils::BuildArmComputeTensorInfo(anchors);
+
+    arm_compute::TensorInfo aclDetectionBoxes =
+        armcomputetensorutils::BuildArmComputeTensorInfo(detectionBoxes);
+
+    arm_compute::TensorInfo aclDetectionClasses =
+        armcomputetensorutils::BuildArmComputeTensorInfo(detectionClasses);
+
+    arm_compute::TensorInfo aclDetectionScores =
+        armcomputetensorutils::BuildArmComputeTensorInfo(detectionScores);
+
+    arm_compute::TensorInfo aclNumDetections =
+        armcomputetensorutils::BuildArmComputeTensorInfo(numDetections);
+
+    return arm_compute::CPPDetectionPostProcessLayer::validate(
+            &aclBoxEncodings,
+            &aclScores,
+            &aclAnchors,
+            &aclDetectionBoxes,
+            &aclDetectionClasses,
+            &aclDetectionScores,
+            &aclNumDetections,
+            info);
+}
+
+NeonDetectionPostProcessWorkload::NeonDetectionPostProcessWorkload(
+    const DetectionPostProcessQueueDescriptor& descriptor,
+    const WorkloadInfo& info)
+    : BaseWorkload<DetectionPostProcessQueueDescriptor>(descriptor, info)
+{
+    m_Anchors = std::make_unique<arm_compute::Tensor>();
+    BuildArmComputeTensor(*m_Anchors, descriptor.m_Anchors->GetTensorInfo());
+
+    arm_compute::DetectionPostProcessLayerInfo di = MakeInfo(m_Data.m_Parameters);
+
+    auto AclTensorRef = [](ITensorHandle* tensor) -> arm_compute::ITensor&
+        {
+            return boost::polymorphic_downcast<IAclTensorHandle*>(tensor)->GetTensor();
+        };
+
+    arm_compute::ITensor& boxEncodings  = AclTensorRef(m_Data.m_Inputs[0]);
+    arm_compute::ITensor& scores        = AclTensorRef(m_Data.m_Inputs[1]);
+
+    arm_compute::ITensor& detectionBoxes    = AclTensorRef(m_Data.m_Outputs[0]);
+    arm_compute::ITensor& detectionClasses  = AclTensorRef(m_Data.m_Outputs[1]);
+    arm_compute::ITensor& detectionScores   = AclTensorRef(m_Data.m_Outputs[2]);
+    arm_compute::ITensor& numDetections     = AclTensorRef(m_Data.m_Outputs[3]);
+
+    m_Func.configure(&boxEncodings, &scores, m_Anchors.get(),
+                     &detectionBoxes, &detectionClasses, &detectionScores, &numDetections,
+                     di);
+
+    InitializeArmComputeTensorData(*m_Anchors, m_Data.m_Anchors);
+}
+
+void NeonDetectionPostProcessWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDetectionPostProcessWorkload_Execute");
+    m_Func.run();
+}
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.hpp b/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.hpp
new file mode 100644
index 0000000..f7f0ebf
--- /dev/null
+++ b/src/backends/neon/workloads/NeonDetectionPostProcessWorkload.hpp
@@ -0,0 +1,43 @@
+//
+// Copyright © 2019 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+
+#include <arm_compute/core/Error.h>
+#include <arm_compute/runtime/CPP/functions/CPPDetectionPostProcessLayer.h>
+#include <arm_compute/runtime/Tensor.h>
+
+#include <memory>
+
+namespace armnn
+{
+
+arm_compute::Status NeonDetectionPostProcessValidate(const TensorInfo& boxEncodings,
+                                                     const TensorInfo& scores,
+                                                     const TensorInfo& anchors,
+                                                     const TensorInfo& detectionBoxes,
+                                                     const TensorInfo& detectionClasses,
+                                                     const TensorInfo& detectionScores,
+                                                     const TensorInfo& numDetections,
+                                                     const DetectionPostProcessDescriptor &desc);
+
+class NeonDetectionPostProcessWorkload : public BaseWorkload<DetectionPostProcessQueueDescriptor>
+{
+public:
+    NeonDetectionPostProcessWorkload(
+        const DetectionPostProcessQueueDescriptor& descriptor,
+        const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
+    mutable arm_compute::CPPDetectionPostProcessLayer m_Func;
+
+    std::unique_ptr<arm_compute::Tensor> m_Anchors;
+
+};
+
+} // namespace armnn
diff --git a/src/backends/neon/workloads/NeonWorkloads.hpp b/src/backends/neon/workloads/NeonWorkloads.hpp
index 77d819e..dc9bef3 100644
--- a/src/backends/neon/workloads/NeonWorkloads.hpp
+++ b/src/backends/neon/workloads/NeonWorkloads.hpp
@@ -16,6 +16,7 @@
 #include "NeonDepthToSpaceWorkload.hpp"
 #include "NeonDepthwiseConvolutionWorkload.hpp"
 #include "NeonDequantizeWorkload.hpp"
+#include "NeonDetectionPostProcessWorkload.hpp"
 #include "NeonFloorFloatWorkload.hpp"
 #include "NeonFullyConnectedWorkload.hpp"
 #include "NeonGreaterWorkload.hpp"