IVGCVSW-2557 Ref Workload Implementation for Detection PostProcess

 * implementation of DetectionPostProcessQueueDescriptor validate
 * add Uint8ToFloat32Workload
 * add implementation of Detection PostProcess functionalities
 * add ref workload implementation for float and uint8
 * add layer support for Detection PostProcess in ref
 * unit tests

Change-Id: I650461f49edbb3c533d68ef8700377af51bc3592
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index 78e44bd..4b32a89 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -203,6 +203,18 @@
                                      &TrueFunc<>);
 }
 
+bool RefLayerSupport::IsDetectionPostProcessSupported(const armnn::TensorInfo& input0,
+                                                      const armnn::TensorInfo& input1,
+                                                      const armnn::DetectionPostProcessDescriptor& descriptor,
+                                                      armnn::Optional<std::string&> reasonIfUnsupported) const
+{
+    ignore_unused(input1); // NOTE(review): descriptor is also unused in this body - confirm that is intended
+    return IsSupportedForDataTypeRef(reasonIfUnsupported,
+                                     input0.GetDataType(), // the decision is based on input0's data type only
+                                     &TrueFunc<>,
+                                     &TrueFunc<>);
+}
+
 bool RefLayerSupport::IsDivisionSupported(const TensorInfo& input0,
                                           const TensorInfo& input1,
                                           const TensorInfo& output,
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index 5fe111b..3b73f22 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -66,6 +66,11 @@
                                          const Optional<TensorInfo>& biases,
                                          Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsDetectionPostProcessSupported(const TensorInfo& input0,
+                                         const TensorInfo& input1,
+                                         const DetectionPostProcessDescriptor& descriptor,
+                                         Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsDivisionSupported(const TensorInfo& input0,
                              const TensorInfo& input1,
                              const TensorInfo& output,
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 103abdd..9c1ce1e 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -154,7 +154,7 @@
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateDetectionPostProcess(
     const armnn::DetectionPostProcessQueueDescriptor& descriptor, const armnn::WorkloadInfo& info) const
 {
-    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<RefDetectionPostProcessFloat32Workload, RefDetectionPostProcessUint8Workload>(descriptor, info);
 }
 
 std::unique_ptr<armnn::IWorkload> RefWorkloadFactory::CreateNormalization(
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 3ee0791..acaedc9 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -16,6 +16,7 @@
         workloads/Broadcast.cpp \
         workloads/ConvImpl.cpp \
         workloads/Debug.cpp \
+        workloads/DetectionPostProcess.cpp \
         workloads/ElementwiseFunction.cpp \
         workloads/FullyConnected.cpp \
         workloads/Gather.cpp \
@@ -37,6 +38,8 @@
         workloads/RefDebugWorkload.cpp \
         workloads/RefDepthwiseConvolution2dFloat32Workload.cpp \
         workloads/RefDepthwiseConvolution2dUint8Workload.cpp \
+        workloads/RefDetectionPostProcessFloat32Workload.cpp \
+        workloads/RefDetectionPostProcessUint8Workload.cpp \
         workloads/RefElementwiseWorkload.cpp \
         workloads/RefFakeQuantizationFloat32Workload.cpp \
         workloads/RefFloorFloat32Workload.cpp \
diff --git a/src/backends/reference/test/CMakeLists.txt b/src/backends/reference/test/CMakeLists.txt
index 8fa9b5c..9e5711e 100644
--- a/src/backends/reference/test/CMakeLists.txt
+++ b/src/backends/reference/test/CMakeLists.txt
@@ -5,6 +5,7 @@
 
 list(APPEND armnnRefBackendUnitTests_sources
     RefCreateWorkloadTests.cpp
+    RefDetectionPostProcessTests.cpp
     RefEndToEndTests.cpp
     RefJsonPrinterTests.cpp
     RefLayerSupportTests.cpp
diff --git a/src/backends/reference/test/RefDetectionPostProcessTests.cpp b/src/backends/reference/test/RefDetectionPostProcessTests.cpp
new file mode 100644
index 0000000..39403f0
--- /dev/null
+++ b/src/backends/reference/test/RefDetectionPostProcessTests.cpp
@@ -0,0 +1,172 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "reference/workloads/DetectionPostProcess.cpp"
+
+#include <armnn/Descriptors.hpp>
+#include <armnn/Types.hpp>
+
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_SUITE(RefDetectionPostProcess)
+
+
+BOOST_AUTO_TEST_CASE(TopKSortTest)
+{
+    unsigned int k = 3;
+    unsigned int indices[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+    float values[8] = { 0, 7, 6, 5, 4, 3, 2, 500 };
+    TopKSort(k, indices, values, 8);
+    BOOST_TEST(indices[0] == 7);
+    BOOST_TEST(indices[1] == 1);
+    BOOST_TEST(indices[2] == 2);
+}
+
+BOOST_AUTO_TEST_CASE(FullTopKSortTest)
+{
+    unsigned int k = 8;
+    unsigned int indices[8] = { 0, 1, 2, 3, 4, 5, 6, 7 };
+    float values[8] = { 0, 7, 6, 5, 4, 3, 2, 500 };
+    TopKSort(k, indices, values, 8);
+    BOOST_TEST(indices[0] == 7);
+    BOOST_TEST(indices[1] == 1);
+    BOOST_TEST(indices[2] == 2);
+    BOOST_TEST(indices[3] == 3);
+    BOOST_TEST(indices[4] == 4);
+    BOOST_TEST(indices[5] == 5);
+    BOOST_TEST(indices[6] == 6);
+    BOOST_TEST(indices[7] == 0);
+}
+
+BOOST_AUTO_TEST_CASE(IouTest)
+{
+    float boxI[4] = { 0.0f, 0.0f, 10.0f, 10.0f };
+    float boxJ[4] = { 1.0f, 1.0f, 11.0f, 11.0f };
+    float iou = IntersectionOverUnion(boxI, boxJ);
+    BOOST_TEST(iou == 0.68, boost::test_tools::tolerance(0.001));
+}
+
+BOOST_AUTO_TEST_CASE(NmsFunction)
+{
+    std::vector<float> boxCorners({
+        0.0f, 0.0f, 1.0f, 1.0f,
+        0.0f, 0.1f, 1.0f, 1.1f,
+        0.0f, -0.1f, 1.0f, 0.9f,
+        0.0f, 10.0f, 1.0f, 11.0f,
+        0.0f, 10.1f, 1.0f, 11.1f,
+        0.0f, 100.0f, 1.0f, 101.0f
+    });
+
+    std::vector<float> scores({ 0.9f, 0.75f, 0.6f, 0.93f, 0.5f, 0.3f });
+
+    std::vector<unsigned int> result = NonMaxSuppression(6, boxCorners, scores, 0.0, 3, 0.5);
+    BOOST_TEST(result.size() == 3);
+    BOOST_TEST(result[0] == 3);
+    BOOST_TEST(result[1] == 0);
+    BOOST_TEST(result[2] == 5);
+}
+
+void DetectionPostProcessTestImpl(bool useRegularNms, const std::vector<float>& expectedDetectionBoxes,
+                                  const std::vector<float>& expectedDetectionClasses,
+                                  const std::vector<float>& expectedDetectionScores,
+                                  const std::vector<float>& expectedNumDetections)
+{
+    armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, armnn::DataType::Float32);
+    armnn::TensorInfo scoresInfo({ 1, 6, 3 }, armnn::DataType::Float32); // 6 boxes x (2 classes + background)
+    armnn::TensorInfo anchorsInfo({ 1, 6, 4 }, armnn::DataType::Float32);
+
+    armnn::TensorInfo detectionBoxesInfo({ 1, 3, 4 }, armnn::DataType::Float32);
+    armnn::TensorInfo detectionScoresInfo({ 1, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo detectionClassesInfo({ 1, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo numDetectionInfo({ 1 }, armnn::DataType::Float32);
+
+    armnn::DetectionPostProcessDescriptor desc;
+    desc.m_UseRegularNms = useRegularNms; // true: per-class (regular) NMS, false: fast NMS
+    desc.m_MaxDetections = 3;
+    desc.m_MaxClassesPerDetection = 1;
+    desc.m_DetectionsPerClass = 1;
+    desc.m_NmsScoreThreshold = 0.0;
+    desc.m_NmsIouThreshold = 0.5;
+    desc.m_NumClasses = 2;
+    desc.m_ScaleY = 10.0;
+    desc.m_ScaleX = 10.0;
+    desc.m_ScaleH = 5.0;
+    desc.m_ScaleW = 5.0;
+
+    std::vector<float> boxEncodings({
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, -1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f
+    });
+    std::vector<float> scores({
+        0.0f, 0.9f, 0.8f,
+        0.0f, 0.75f, 0.72f,
+        0.0f, 0.6f, 0.5f,
+        0.0f, 0.93f, 0.95f,
+        0.0f, 0.5f, 0.4f,
+        0.0f, 0.3f, 0.2f
+    });
+    std::vector<float> anchors({
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 10.5f, 1.0f, 1.0f,
+        0.5f, 10.5f, 1.0f, 1.0f,
+        0.5f, 100.5f, 1.0f, 1.0f
+    });
+
+    std::vector<float> detectionBoxes(detectionBoxesInfo.GetNumElements()); // zero-initialised output buffers
+    std::vector<float> detectionScores(detectionScoresInfo.GetNumElements());
+    std::vector<float> detectionClasses(detectionClassesInfo.GetNumElements());
+    std::vector<float> numDetections(1);
+
+    armnn::DetectionPostProcess(boxEncodingsInfo, scoresInfo, anchorsInfo,
+                                detectionBoxesInfo, detectionClassesInfo,
+                                detectionScoresInfo, numDetectionInfo, desc,
+                                boxEncodings.data(), scores.data(), anchors.data(),
+                                detectionBoxes.data(), detectionClasses.data(),
+                                detectionScores.data(), numDetections.data());
+
+    BOOST_TEST(detectionBoxes == expectedDetectionBoxes);
+    BOOST_TEST(detectionScores == expectedDetectionScores);
+    BOOST_TEST(detectionClasses == expectedDetectionClasses);
+    BOOST_TEST(numDetections == expectedNumDetections);
+}
+
+BOOST_AUTO_TEST_CASE(RegularNmsDetectionPostProcess)
+{
+    std::vector<float> expectedDetectionBoxes({
+        0.0f, 10.0f, 1.0f, 11.0f,
+        0.0f, 10.0f, 1.0f, 11.0f,
+        0.0f, 0.0f, 0.0f, 0.0f
+    });
+
+    std::vector<float> expectedDetectionScores({ 0.95f, 0.93f, 0.0f });
+    std::vector<float> expectedDetectionClasses({ 1.0f, 0.0f, 0.0f });
+    std::vector<float> expectedNumDetections({ 2.0f });
+
+    DetectionPostProcessTestImpl(true, expectedDetectionBoxes, expectedDetectionClasses,
+                                 expectedDetectionScores, expectedNumDetections);
+}
+
+BOOST_AUTO_TEST_CASE(FastNmsDetectionPostProcess)
+{
+    std::vector<float> expectedDetectionBoxes({
+        0.0f, 10.0f, 1.0f, 11.0f,
+        0.0f, 0.0f, 1.0f, 1.0f,
+        0.0f, 100.0f, 1.0f, 101.0f
+    });
+    std::vector<float> expectedDetectionScores({ 0.95f, 0.9f, 0.3f });
+    std::vector<float> expectedDetectionClasses({ 1.0f, 0.0f, 0.0f });
+    std::vector<float> expectedNumDetections({ 3.0f });
+
+    DetectionPostProcessTestImpl(false, expectedDetectionBoxes, expectedDetectionClasses,
+                                 expectedDetectionScores, expectedNumDetections);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index 57e89fa..47e42f7 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -15,6 +15,8 @@
     ConvImpl.hpp
     Debug.cpp
     Debug.hpp
+    DetectionPostProcess.cpp
+    DetectionPostProcess.hpp
     ElementwiseFunction.cpp
     ElementwiseFunction.hpp
     FullyConnected.cpp
@@ -60,6 +62,10 @@
     RefDepthwiseConvolution2dFloat32Workload.hpp
     RefDepthwiseConvolution2dUint8Workload.cpp
     RefDepthwiseConvolution2dUint8Workload.hpp
+    RefDetectionPostProcessUint8Workload.cpp
+    RefDetectionPostProcessUint8Workload.hpp
+    RefDetectionPostProcessFloat32Workload.cpp
+    RefDetectionPostProcessFloat32Workload.hpp
     RefFakeQuantizationFloat32Workload.cpp
     RefFakeQuantizationFloat32Workload.hpp
     RefFloorFloat32Workload.cpp
diff --git a/src/backends/reference/workloads/DetectionPostProcess.cpp b/src/backends/reference/workloads/DetectionPostProcess.cpp
new file mode 100644
index 0000000..958de82
--- /dev/null
+++ b/src/backends/reference/workloads/DetectionPostProcess.cpp
@@ -0,0 +1,264 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "DetectionPostProcess.hpp"
+
+#include <armnn/ArmNN.hpp>
+
+#include <boost/numeric/conversion/cast.hpp>
+
+#include <algorithm>
+#include <numeric>
+
+namespace
+{
+
+std::vector<unsigned int> GenerateRangeK(unsigned int k)
+{
+    std::vector<unsigned int> ascending(k); // filled below with 0, 1, ..., k - 1
+    for (unsigned int pos = 0; pos < k; ++pos) { ascending[pos] = pos; }
+    return ascending;
+}
+
+void TopKSort(unsigned int k, unsigned int* indices, const float* values, unsigned int numElement)
+{
+    auto higherValueFirst = [values](unsigned int lhs, unsigned int rhs) { return values[lhs] > values[rhs]; };
+    std::partial_sort(indices, indices + k, indices + numElement, higherValueFirst); // first k: descending value
+}
+
+float IntersectionOverUnion(const float* boxI, const float* boxJ)
+{
+    // Boxes are 4 floats in box-corner format: ymin, xmin, ymax, xmax. Returns intersection area / union area.
+    const int yMin = 0;
+    const int xMin = 1;
+    const int yMax = 2;
+    const int xMax = 3;
+    float areaI = (boxI[yMax] - boxI[yMin]) * (boxI[xMax] - boxI[xMin]);
+    float areaJ = (boxJ[yMax] - boxJ[yMin]) * (boxJ[xMax] - boxJ[xMin]);
+    float yMinIntersection = std::max(boxI[yMin], boxJ[yMin]);
+    float xMinIntersection = std::max(boxI[xMin], boxJ[xMin]);
+    float yMaxIntersection = std::min(boxI[yMax], boxJ[yMax]);
+    float xMaxIntersection = std::min(boxI[xMax], boxJ[xMax]);
+    float areaIntersection = std::max(yMaxIntersection - yMinIntersection, 0.0f) *
+                                std::max(xMaxIntersection - xMinIntersection, 0.0f); // clamped: disjoint boxes -> 0
+    float areaUnion = areaI + areaJ - areaIntersection;
+    return areaUnion > 0.0f ? areaIntersection / areaUnion : 0.0f; // zero-area boxes: 0 instead of 0/0 = NaN
+}
+
+std::vector<unsigned int> NonMaxSuppression(unsigned int numBoxes, const std::vector<float>& boxCorners,
+                                            const std::vector<float>& scores, float nmsScoreThreshold,
+                                            unsigned int maxDetection, float nmsIouThreshold)
+{
+    // Keep only boxes scoring at least nmsScoreThreshold; indicesAboveThreshold maps back to the original box index.
+    std::vector<float> scoresAboveThreshold;
+    std::vector<unsigned int> indicesAboveThreshold;
+    for (unsigned int i = 0; i < numBoxes; ++i)
+    {
+        if (scores[i] >= nmsScoreThreshold)
+        {
+            scoresAboveThreshold.push_back(scores[i]);
+            indicesAboveThreshold.push_back(i);
+        }
+    }
+
+    // Sort the surviving candidates by descending score (sortedIndices index into the *AboveThreshold vectors).
+    unsigned int numAboveThreshold = boost::numeric_cast<unsigned int>(scoresAboveThreshold.size());
+    std::vector<unsigned int> sortedIndices = GenerateRangeK(numAboveThreshold);
+    TopKSort(numAboveThreshold,sortedIndices.data(), scoresAboveThreshold.data(), numAboveThreshold);
+
+    // Number of output cannot be more than max detections specified in the option.
+    unsigned int numOutput = std::min(maxDetection, numAboveThreshold);
+    std::vector<unsigned int> outputIndices;
+    std::vector<bool> visited(numAboveThreshold, false); // true once a candidate has been suppressed
+
+    // Walk candidates best-first: each kept box suppresses lower-scored boxes whose IoU with it exceeds the limit.
+    for (unsigned int i = 0; i < numAboveThreshold; ++i)
+    {
+        if (outputIndices.size() >= numOutput)
+        {
+            break;
+        }
+        if (!visited[sortedIndices[i]])
+        {
+            outputIndices.push_back(indicesAboveThreshold[sortedIndices[i]]);
+            for (unsigned int j = i + 1; j < numAboveThreshold; ++j) // only a kept box may suppress others
+            {
+                unsigned int iIndex = indicesAboveThreshold[sortedIndices[i]] * 4;
+                unsigned int jIndex = indicesAboveThreshold[sortedIndices[j]] * 4;
+                if (IntersectionOverUnion(&boxCorners[iIndex], &boxCorners[jIndex]) > nmsIouThreshold)
+                {
+                    visited[sortedIndices[j]] = true;
+                }
+            }
+        }
+    }
+    return outputIndices; // original box indices, ordered by descending score
+}
+
+void AllocateOutputData(unsigned int numOutput, unsigned int numSelected, const std::vector<float>& boxCorners,
+                        const std::vector<unsigned int>& outputIndices, const std::vector<unsigned int>& selectedBoxes,
+                        const std::vector<unsigned int>& selectedClasses, const std::vector<float>& selectedScores,
+                        float* detectionBoxes, float* detectionScores, float* detectionClasses, float* numDetections)
+{
+    for (unsigned int i = 0; i < numOutput; ++i) // fills output slot i; slots at or past numSelected are zeroed
+    {
+        unsigned int boxIndex = i * 4;
+        if (i < numSelected)
+        {
+            unsigned int boxCornerIndex = selectedBoxes[outputIndices[i]] * 4; // looked up only for a real selection
+            detectionScores[i] = selectedScores[outputIndices[i]];
+            detectionClasses[i] = boost::numeric_cast<float>(selectedClasses[outputIndices[i]]);
+            detectionBoxes[boxIndex] = boxCorners[boxCornerIndex];
+            detectionBoxes[boxIndex + 1] = boxCorners[boxCornerIndex + 1];
+            detectionBoxes[boxIndex + 2] = boxCorners[boxCornerIndex + 2];
+            detectionBoxes[boxIndex + 3] = boxCorners[boxCornerIndex + 3];
+        }
+        else
+        {
+            detectionScores[i] = 0.0f;
+            detectionClasses[i] = 0.0f;
+            detectionBoxes[boxIndex] = 0.0f;
+            detectionBoxes[boxIndex + 1] = 0.0f;
+            detectionBoxes[boxIndex + 2] = 0.0f;
+            detectionBoxes[boxIndex + 3] = 0.0f;
+        }
+    }
+    numDetections[0] = boost::numeric_cast<float>(numOutput); // the count is reported as a float
+}
+
+} // anonymous namespace
+
+namespace armnn
+{
+
+void DetectionPostProcess(const TensorInfo& boxEncodingsInfo,
+                          const TensorInfo& scoresInfo,
+                          const TensorInfo& anchorsInfo,
+                          const TensorInfo& detectionBoxesInfo,
+                          const TensorInfo& detectionClassesInfo,
+                          const TensorInfo& detectionScoresInfo,
+                          const TensorInfo& numDetectionsInfo,
+                          const DetectionPostProcessDescriptor& desc,
+                          const float* boxEncodings,
+                          const float* scores,
+                          const float* anchors,
+                          float* detectionBoxes,
+                          float* detectionClasses,
+                          float* detectionScores,
+                          float* numDetections)
+{
+    // Decode each box against its anchor: from centre-size format (ycenter, xcenter, height, width) to box-corner
+    // format (ymin, xmin, ymax, xmax). Only boxEncodingsInfo is read from the TensorInfo arguments.
+    std::vector<float> boxCorners(boxEncodingsInfo.GetNumElements());
+    unsigned int numBoxes = boxEncodingsInfo.GetShape()[1];
+    for (unsigned int i = 0; i < numBoxes; ++i)
+    {
+        unsigned int indexY = i * 4;
+        unsigned int indexX = indexY + 1;
+        unsigned int indexH = indexX + 1;
+        unsigned int indexW = indexH + 1;
+        float yCentre = boxEncodings[indexY] / desc.m_ScaleY * anchors[indexH] + anchors[indexY];
+        float xCentre = boxEncodings[indexX] / desc.m_ScaleX * anchors[indexW] + anchors[indexX];
+        float halfH = 0.5f * expf(boxEncodings[indexH] / desc.m_ScaleH) * anchors[indexH];
+        float halfW = 0.5f * expf(boxEncodings[indexW] / desc.m_ScaleW) * anchors[indexW];
+        // ymin
+        boxCorners[indexY] = yCentre - halfH;
+        // xmin
+        boxCorners[indexX] = xCentre - halfW;
+        // ymax
+        boxCorners[indexH] = yCentre + halfH;
+        // xmax
+        boxCorners[indexW] = xCentre + halfW;
+
+        BOOST_ASSERT(boxCorners[indexY] < boxCorners[indexH]);
+        BOOST_ASSERT(boxCorners[indexX] < boxCorners[indexW]);
+    }
+
+    unsigned int numClassesWithBg = desc.m_NumClasses + 1; // scores hold one extra leading (background) entry per box
+
+    // Perform Non Max Suppression.
+    if (desc.m_UseRegularNms)
+    {
+        // Perform Regular NMS.
+        // For each class, perform NMS and select max detection numbers of the highest score across all classes.
+        std::vector<float> classScores(numBoxes);
+        std::vector<unsigned int>selectedBoxesAfterNms;
+        std::vector<float> selectedScoresAfterNms;
+        std::vector<unsigned int> selectedClasses;
+
+        for (unsigned int c = 0; c < desc.m_NumClasses; ++c)
+        {
+            // For each boxes, get scores of the boxes for the class c.
+            for (unsigned int i = 0; i < numBoxes; ++i)
+            {
+                classScores[i] = scores[i * numClassesWithBg + c + 1];
+            }
+            std::vector<unsigned int> selectedIndices = NonMaxSuppression(numBoxes, boxCorners, classScores,
+                                                                          desc.m_NmsScoreThreshold,
+                                                                          desc.m_DetectionsPerClass, // per-class cap
+                                                                          desc.m_NmsIouThreshold);
+
+            for (unsigned int i = 0; i < selectedIndices.size(); ++i)
+            {
+                selectedBoxesAfterNms.push_back(selectedIndices[i]);
+                selectedScoresAfterNms.push_back(classScores[selectedIndices[i]]);
+                selectedClasses.push_back(c);
+            }
+        }
+
+        // Select max detection numbers of the highest score across all classes
+        unsigned int numSelected = boost::numeric_cast<unsigned int>(selectedBoxesAfterNms.size());
+        unsigned int numOutput = std::min(desc.m_MaxDetections,  numSelected);
+
+        // Sort the max scores among the selected indices.
+        std::vector<unsigned int> outputIndices = GenerateRangeK(numSelected);
+        TopKSort(numOutput, outputIndices.data(), selectedScoresAfterNms.data(), numSelected);
+
+        AllocateOutputData(numOutput, numSelected, boxCorners, outputIndices,
+                           selectedBoxesAfterNms, selectedClasses, selectedScoresAfterNms,
+                           detectionBoxes, detectionScores, detectionClasses, numDetections);
+    }
+    else
+    {
+        // Perform Fast NMS.
+        // Select max scores of boxes and perform NMS on max scores,
+        // select max detection numbers of the highest score
+        unsigned int numClassesPerBox = std::min(desc.m_MaxClassesPerDetection, desc.m_NumClasses);
+        std::vector<float> maxScores;
+        std::vector<unsigned int>boxIndices;
+        std::vector<unsigned int>maxScoreClasses;
+
+        for (unsigned int box = 0; box < numBoxes; ++box)
+        {
+            unsigned int scoreIndex = box * numClassesWithBg + 1; // skip the background entry
+
+            // Get the max scores of the box.
+            std::vector<unsigned int> maxScoreIndices = GenerateRangeK(desc.m_NumClasses);
+            TopKSort(numClassesPerBox, maxScoreIndices.data(), scores + scoreIndex, desc.m_NumClasses);
+
+            for (unsigned int i = 0; i < numClassesPerBox; ++i)
+            {
+                maxScores.push_back(scores[scoreIndex + maxScoreIndices[i]]);
+                maxScoreClasses.push_back(maxScoreIndices[i]);
+                boxIndices.push_back(box);
+            }
+        }
+
+        // Perform NMS on max scores
+        std::vector<unsigned int> selectedIndices = NonMaxSuppression(numBoxes, boxCorners, maxScores,
+                                                                      desc.m_NmsScoreThreshold,
+                                                                      desc.m_MaxDetections,
+                                                                      desc.m_NmsIouThreshold);
+
+        unsigned int numSelected = boost::numeric_cast<unsigned int>(selectedIndices.size());
+        unsigned int numOutput = std::min(desc.m_MaxDetections,  numSelected);
+
+        AllocateOutputData(numOutput, numSelected, boxCorners, selectedIndices,
+                           boxIndices, maxScoreClasses, maxScores,
+                           detectionBoxes, detectionScores, detectionClasses, numDetections);
+    }
+}
+
+} // namespace armnn
diff --git a/src/backends/reference/workloads/DetectionPostProcess.hpp b/src/backends/reference/workloads/DetectionPostProcess.hpp
new file mode 100644
index 0000000..06e9e15
--- /dev/null
+++ b/src/backends/reference/workloads/DetectionPostProcess.hpp
@@ -0,0 +1,29 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "armnn/Tensor.hpp"
+#include "armnn/Descriptors.hpp"
+
+namespace armnn
+{
+
+void DetectionPostProcess(const TensorInfo& boxEncodingsInfo,
+                          const TensorInfo& scoresInfo,
+                          const TensorInfo& anchorsInfo,
+                          const TensorInfo& detectionBoxesInfo,
+                          const TensorInfo& detectionClassesInfo,
+                          const TensorInfo& detectionScoresInfo,
+                          const TensorInfo& numDetectionsInfo,
+                          const DetectionPostProcessDescriptor& desc,
+                          const float* boxEncodings,
+                          const float* scores,
+                          const float* anchors,
+                          float* detectionBoxes,
+                          float* detectionClasses,
+                          float* detectionScores,
+                          float* numDetections);
+
+} // namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.cpp b/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.cpp
new file mode 100644
index 0000000..ddab046
--- /dev/null
+++ b/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.cpp
@@ -0,0 +1,48 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefDetectionPostProcessFloat32Workload.hpp"
+
+#include "DetectionPostProcess.hpp"
+#include "Profiling.hpp"
+#include "RefWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+RefDetectionPostProcessFloat32Workload::RefDetectionPostProcessFloat32Workload(
+        const DetectionPostProcessQueueDescriptor& descriptor, const WorkloadInfo& info)
+        : Float32Workload<DetectionPostProcessQueueDescriptor>(descriptor, info),
+          m_Anchors(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Anchors))) {}
+
+void RefDetectionPostProcessFloat32Workload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDetectionPostProcessFloat32Workload_Execute");
+
+    const TensorInfo& boxEncodingsInfo = GetTensorInfo(m_Data.m_Inputs[0]); // inputs: 0 = encodings, 1 = scores
+    const TensorInfo& scoresInfo = GetTensorInfo(m_Data.m_Inputs[1]);
+    const TensorInfo& anchorsInfo = GetTensorInfo(m_Anchors.get()); // anchors come from the workload's own copy
+    const TensorInfo& detectionBoxesInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& detectionClassesInfo = GetTensorInfo(m_Data.m_Outputs[1]);
+    const TensorInfo& detectionScoresInfo = GetTensorInfo(m_Data.m_Outputs[2]);
+    const TensorInfo& numDetectionsInfo = GetTensorInfo(m_Data.m_Outputs[3]);
+
+    const float* boxEncodings = GetInputTensorDataFloat(0, m_Data);
+    const float* scores = GetInputTensorDataFloat(1, m_Data);
+    const float* anchors = m_Anchors->GetConstTensor<float>();
+
+    float* detectionBoxes = GetOutputTensorData<float>(0, m_Data); // outputs: boxes, classes, scores, count
+    float* detectionClasses = GetOutputTensorData<float>(1, m_Data);
+    float* detectionScores = GetOutputTensorData<float>(2, m_Data);
+    float* numDetections = GetOutputTensorData<float>(3, m_Data);
+
+    DetectionPostProcess(boxEncodingsInfo, scoresInfo, anchorsInfo,
+                         detectionBoxesInfo, detectionClassesInfo,
+                         detectionScoresInfo, numDetectionsInfo, m_Data.m_Parameters,
+                         boxEncodings, scores, anchors, detectionBoxes,
+                         detectionClasses, detectionScores, numDetections);
+}
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.hpp b/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.hpp
new file mode 100644
index 0000000..9f2a697
--- /dev/null
+++ b/src/backends/reference/workloads/RefDetectionPostProcessFloat32Workload.hpp
@@ -0,0 +1,25 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+
+namespace armnn
+{
+
+class RefDetectionPostProcessFloat32Workload : public Float32Workload<DetectionPostProcessQueueDescriptor>
+{
+public:
+    explicit RefDetectionPostProcessFloat32Workload(const DetectionPostProcessQueueDescriptor& descriptor,
+                                                    const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
+    std::unique_ptr<ScopedCpuTensorHandle> m_Anchors;
+};
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.cpp b/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.cpp
new file mode 100644
index 0000000..ccdaf87
--- /dev/null
+++ b/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.cpp
@@ -0,0 +1,52 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefDetectionPostProcessUint8Workload.hpp"
+
+#include "DetectionPostProcess.hpp"
+#include "Profiling.hpp"
+#include "RefWorkloadUtils.hpp"
+
+namespace armnn
+{
+
+RefDetectionPostProcessUint8Workload::RefDetectionPostProcessUint8Workload(
+        const DetectionPostProcessQueueDescriptor& descriptor, const WorkloadInfo& info)
+        : Uint8ToFloat32Workload<DetectionPostProcessQueueDescriptor>(descriptor, info),
+          m_Anchors(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Anchors))) {}
+
+void RefDetectionPostProcessUint8Workload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDetectionPostProcessUint8Workload_Execute");
+
+    const TensorInfo& boxEncodingsInfo = GetTensorInfo(m_Data.m_Inputs[0]); // inputs: 0 = encodings, 1 = scores
+    const TensorInfo& scoresInfo = GetTensorInfo(m_Data.m_Inputs[1]);
+    const TensorInfo& anchorsInfo = GetTensorInfo(m_Anchors.get()); // anchors come from the workload's own copy
+    const TensorInfo& detectionBoxesInfo = GetTensorInfo(m_Data.m_Outputs[0]);
+    const TensorInfo& detectionClassesInfo = GetTensorInfo(m_Data.m_Outputs[1]);
+    const TensorInfo& detectionScoresInfo = GetTensorInfo(m_Data.m_Outputs[2]);
+    const TensorInfo& numDetectionsInfo = GetTensorInfo(m_Data.m_Outputs[3]);
+
+    const uint8_t* boxEncodingsData = GetInputTensorDataU8(0, m_Data);
+    const uint8_t* scoresData = GetInputTensorDataU8(1, m_Data);
+    const uint8_t* anchorsData = m_Anchors->GetConstTensor<uint8_t>();
+
+    auto boxEncodings = Dequantize(boxEncodingsData, boxEncodingsInfo); // uint8 inputs are dequantized up front
+    auto scores = Dequantize(scoresData, scoresInfo);
+    auto anchors = Dequantize(anchorsData, anchorsInfo);
+
+    float* detectionBoxes = GetOutputTensorData<float>(0, m_Data); // all four outputs are accessed as float
+    float* detectionClasses = GetOutputTensorData<float>(1, m_Data);
+    float* detectionScores = GetOutputTensorData<float>(2, m_Data);
+    float* numDetections = GetOutputTensorData<float>(3, m_Data);
+
+    DetectionPostProcess(boxEncodingsInfo, scoresInfo, anchorsInfo,
+                         detectionBoxesInfo, detectionClassesInfo,
+                         detectionScoresInfo, numDetectionsInfo, m_Data.m_Parameters,
+                         boxEncodings.data(), scores.data(), anchors.data(), // dequantized buffers
+                         detectionBoxes, detectionClasses, detectionScores, numDetections);
+}
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.hpp b/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.hpp
new file mode 100644
index 0000000..91590f5
--- /dev/null
+++ b/src/backends/reference/workloads/RefDetectionPostProcessUint8Workload.hpp
@@ -0,0 +1,25 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+#include <backendsCommon/WorkloadData.hpp>
+
+namespace armnn
+{
+
+class RefDetectionPostProcessUint8Workload : public Uint8ToFloat32Workload<DetectionPostProcessQueueDescriptor>
+{
+public:
+    explicit RefDetectionPostProcessUint8Workload(const DetectionPostProcessQueueDescriptor& descriptor,
+                                                  const WorkloadInfo& info);
+    virtual void Execute() const override;
+
+private:
+    std::unique_ptr<ScopedCpuTensorHandle> m_Anchors;
+};
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index d9f4dbb..2156388 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -38,6 +38,8 @@
 #include "ResizeBilinear.hpp"
 #include "RefNormalizationFloat32Workload.hpp"
 #include "RefDepthwiseConvolution2dFloat32Workload.hpp"
+#include "RefDetectionPostProcessFloat32Workload.hpp"
+#include "RefDetectionPostProcessUint8Workload.hpp"
 #include "RefPooling2dUint8Workload.hpp"
 #include "BatchNormImpl.hpp"
 #include "Activation.hpp"