IVGCVSW-2559 End-to-end tests for Detection PostProcess

* End-to-end tests for DetectionPostProcess, Float32 and Uint8
* Add anchors to AddDetectionPostProcessLayer (usage sketch below)
* Add anchors to VisitDetectionPostProcessLayer
* Refactor code
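
A minimal usage sketch of the new AddDetectionPostProcessLayer signature.
BuildDetectionPostProcessNetwork and the descriptor/anchor values are
illustrative placeholders modelled on the end-to-end test added in this patch:

    #include <armnn/Descriptors.hpp>
    #include <armnn/INetwork.hpp>
    #include <armnn/Tensor.hpp>

    #include <vector>

    armnn::INetworkPtr BuildDetectionPostProcessNetwork()
    {
        armnn::INetworkPtr net = armnn::INetwork::Create();

        armnn::DetectionPostProcessDescriptor desc;
        desc.m_MaxDetections = 3;
        desc.m_NumClasses    = 2;
        desc.m_UseRegularNms = true;

        // The anchors are now supplied at network construction time as a ConstTensor;
        // the layer keeps its own copy, so the buffer does not need to outlive this call.
        std::vector<float> anchors(6 * 4, 0.5f);   // placeholder anchor data
        armnn::TensorInfo anchorsInfo({ 6, 4 }, armnn::DataType::Float32);
        armnn::ConstTensor anchorsTensor(anchorsInfo, anchors.data());

        net->AddDetectionPostProcessLayer(desc, anchorsTensor, "DetectionPostProcess");
        return net;
    }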

Change-Id: I3c5a9a4a60b74c2246b4a27692bbf3c235163f90
Signed-off-by: Narumol Prangnawarat <narumol.prangnawarat@arm.com>
diff --git a/src/armnn/LayerVisitorBase.hpp b/src/armnn/LayerVisitorBase.hpp
index 3b6a2ff..641ca31 100644
--- a/src/armnn/LayerVisitorBase.hpp
+++ b/src/armnn/LayerVisitorBase.hpp
@@ -57,6 +57,7 @@
 
     virtual void VisitDetectionPostProcessLayer(const IConnectableLayer*,
                                                 const DetectionPostProcessDescriptor&,
+                                                const ConstTensor&,
                                                 const char*) { DefaultPolicy::Apply(); }
 
     virtual void VisitFullyConnectedLayer(const IConnectableLayer*,
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 7897a81..5c70003 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -648,9 +648,13 @@
 }
 
 IConnectableLayer* Network::AddDetectionPostProcessLayer(const armnn::DetectionPostProcessDescriptor& descriptor,
-                                                         const char* name)
+                                                         const ConstTensor& anchors, const char* name)
 {
-    return m_Graph->AddLayer<DetectionPostProcessLayer>(descriptor, name);
+    const auto layer = m_Graph->AddLayer<DetectionPostProcessLayer>(descriptor, name);
+
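+    // Keep a copy of the anchors on the layer; it is handed to the workload at creation time.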
+    layer->m_Anchors = std::make_unique<ScopedCpuTensorHandle>(anchors);
+
+    return layer;
 }
 
 IConnectableLayer* Network::AddPermuteLayer(const PermuteDescriptor& permuteDescriptor,
diff --git a/src/armnn/Network.hpp b/src/armnn/Network.hpp
index 4239ac5..66fb240 100644
--- a/src/armnn/Network.hpp
+++ b/src/armnn/Network.hpp
@@ -59,6 +59,7 @@
 
     IConnectableLayer* AddDetectionPostProcessLayer(
         const DetectionPostProcessDescriptor& descriptor,
+        const ConstTensor& anchors,
         const char* name = nullptr) override;
 
     IConnectableLayer* AddFullyConnectedLayer(const FullyConnectedDescriptor& fullyConnectedDescriptor,
diff --git a/src/armnn/layers/DetectionPostProcessLayer.cpp b/src/armnn/layers/DetectionPostProcessLayer.cpp
index 3eea198..289cee0 100644
--- a/src/armnn/layers/DetectionPostProcessLayer.cpp
+++ b/src/armnn/layers/DetectionPostProcessLayer.cpp
@@ -24,12 +24,15 @@
                                                                      const armnn::IWorkloadFactory& factory) const
 {
     DetectionPostProcessQueueDescriptor descriptor;
+    descriptor.m_Anchors = m_Anchors.get();
     return factory.CreateDetectionPostProcess(descriptor, PrepInfoAndDesc(descriptor, graph));
 }
 
 DetectionPostProcessLayer* DetectionPostProcessLayer::Clone(Graph& graph) const
 {
-    return CloneBase<DetectionPostProcessLayer>(graph, m_Param, GetName());
+    auto layer = CloneBase<DetectionPostProcessLayer>(graph, m_Param, GetName());
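+    // Deep-copy the anchors (when present) so the cloned layer owns its own tensor handle.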
+    layer->m_Anchors = m_Anchors ? std::make_unique<ScopedCpuTensorHandle>(*m_Anchors) : nullptr;
+    return std::move(layer);
 }
 
 void DetectionPostProcessLayer::ValidateTensorShapesFromInputs()
@@ -72,7 +75,8 @@
 
 void DetectionPostProcessLayer::Accept(ILayerVisitor& visitor) const
 {
-    visitor.VisitDetectionPostProcessLayer(this, GetParameters(), GetName());
+    ConstTensor anchorTensor(m_Anchors->GetTensorInfo(), m_Anchors->GetConstTensor<void>());
+    visitor.VisitDetectionPostProcessLayer(this, GetParameters(), anchorTensor, GetName());
 }
 
 } // namespace armnn
diff --git a/src/armnn/test/TestLayerVisitor.hpp b/src/armnn/test/TestLayerVisitor.hpp
index 5775df0..6b95032 100644
--- a/src/armnn/test/TestLayerVisitor.hpp
+++ b/src/armnn/test/TestLayerVisitor.hpp
@@ -61,6 +61,7 @@
 
     virtual void VisitDetectionPostProcessLayer(const IConnectableLayer* layer,
                                                 const DetectionPostProcessDescriptor& descriptor,
+                                                const ConstTensor& anchors,
                                                 const char* name = nullptr) {};
 
     virtual void VisitFullyConnectedLayer(const IConnectableLayer* layer,
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index b31d626..7474b9b 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -1101,8 +1101,8 @@
     const TensorInfo& scoresInfo =  workloadInfo.m_InputTensorInfos[1];
     const TensorInfo& anchorsInfo = m_Anchors->GetTensorInfo();
     const TensorInfo& detectionBoxesInfo = workloadInfo.m_OutputTensorInfos[0];
-    const TensorInfo& detectionScoresInfo = workloadInfo.m_OutputTensorInfos[1];
-    const TensorInfo& detectionClassesInfo = workloadInfo.m_OutputTensorInfos[2];
+    const TensorInfo& detectionClassesInfo = workloadInfo.m_OutputTensorInfos[1];
+    const TensorInfo& detectionScoresInfo = workloadInfo.m_OutputTensorInfos[2];
     const TensorInfo& numDetectionsInfo = workloadInfo.m_OutputTensorInfos[3];
 
     ValidateTensorNumDimensions(boxEncodingsInfo, "DetectionPostProcessQueueDescriptor", 3, "box encodings");
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index 80a9cfe..4a1d467 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -14,6 +14,7 @@
     ConvertFp16ToFp32TestImpl.hpp
     ConvertFp32ToFp16TestImpl.hpp
     DebugTestImpl.hpp
+    DetectionPostProcessTestImpl.hpp
     EndToEndTestImpl.hpp
     FullyConnectedTestImpl.hpp
     GatherTestImpl.hpp
diff --git a/src/backends/backendsCommon/test/DetectionPostProcessTestImpl.hpp b/src/backends/backendsCommon/test/DetectionPostProcessTestImpl.hpp
new file mode 100644
index 0000000..5f4d2a4
--- /dev/null
+++ b/src/backends/backendsCommon/test/DetectionPostProcessTestImpl.hpp
@@ -0,0 +1,162 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/INetwork.hpp>
+#include <backendsCommon/test/CommonTestUtils.hpp>
+#include <TypeUtils.hpp>
+
+namespace {
+
+template<typename T>
+armnn::INetworkPtr CreateDetectionPostProcessNetwork(const armnn::TensorInfo& boxEncodingsInfo,
+                                                     const armnn::TensorInfo& scoresInfo,
+                                                     const armnn::TensorInfo& anchorsInfo,
+                                                     const std::vector<T>& anchors,
+                                                     bool useRegularNms)
+{
+    armnn::TensorInfo detectionBoxesInfo({ 1, 3, 4 }, armnn::DataType::Float32);
+    armnn::TensorInfo detectionScoresInfo({ 1, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo detectionClassesInfo({ 1, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo numDetectionInfo({ 1 }, armnn::DataType::Float32);
+
+    armnn::DetectionPostProcessDescriptor desc;
+    desc.m_UseRegularNms = useRegularNms;
+    desc.m_MaxDetections = 3;
+    desc.m_MaxClassesPerDetection = 1;
+    desc.m_DetectionsPerClass = 1;
+    desc.m_NmsScoreThreshold = 0.0;
+    desc.m_NmsIouThreshold = 0.5;
+    desc.m_NumClasses = 2;
+    desc.m_ScaleY = 10.0;
+    desc.m_ScaleX = 10.0;
+    desc.m_ScaleH = 5.0;
+    desc.m_ScaleW = 5.0;
+
+    armnn::INetworkPtr net(armnn::INetwork::Create());
+
+    armnn::IConnectableLayer* boxesLayer = net->AddInputLayer(0);
+    armnn::IConnectableLayer* scoresLayer = net->AddInputLayer(1);
+    armnn::ConstTensor anchorsTensor(anchorsInfo, anchors.data());
+    armnn::IConnectableLayer* detectionLayer = net->AddDetectionPostProcessLayer(desc, anchorsTensor,
+                                                                                 "DetectionPostProcess");
+    armnn::IConnectableLayer* detectionBoxesLayer = net->AddOutputLayer(0, "detectionBoxes");
+    armnn::IConnectableLayer* detectionClassesLayer = net->AddOutputLayer(1, "detectionClasses");
+    armnn::IConnectableLayer* detectionScoresLayer = net->AddOutputLayer(2, "detectionScores");
+    armnn::IConnectableLayer* numDetectionLayer = net->AddOutputLayer(3, "numDetection");
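+
+    // Connect the two inputs (box encodings, scores) and the four outputs
+    // (boxes, classes, scores, number of detections) of the DetectionPostProcess layer.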
+    Connect(boxesLayer, detectionLayer, boxEncodingsInfo, 0, 0);
+    Connect(scoresLayer, detectionLayer, scoresInfo, 0, 1);
+    Connect(detectionLayer, detectionBoxesLayer, detectionBoxesInfo, 0, 0);
+    Connect(detectionLayer, detectionClassesLayer, detectionClassesInfo, 1, 0);
+    Connect(detectionLayer, detectionScoresLayer, detectionScoresInfo, 2, 0);
+    Connect(detectionLayer, numDetectionLayer, numDetectionInfo, 3, 0);
+
+    return net;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+void DetectionPostProcessEndToEnd(const std::vector<BackendId>& backends, bool useRegularNms,
+                                  const std::vector<T>& boxEncodings,
+                                  const std::vector<T>& scores,
+                                  const std::vector<T>& anchors,
+                                  const std::vector<float>& expectedDetectionBoxes,
+                                  const std::vector<float>& expectedDetectionClasses,
+                                  const std::vector<float>& expectedDetectionScores,
+                                  const std::vector<float>& expectedNumDetections,
+                                  float boxScale = 1.0f,
+                                  int32_t boxOffset = 0,
+                                  float scoreScale = 1.0f,
+                                  int32_t scoreOffset = 0,
+                                  float anchorScale = 1.0f,
+                                  int32_t anchorOffset = 0)
+{
+    armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, ArmnnType);
+    armnn::TensorInfo scoresInfo({ 1, 6, 3 }, ArmnnType);
+    armnn::TensorInfo anchorsInfo({ 6, 4 }, ArmnnType);
+
+    boxEncodingsInfo.SetQuantizationScale(boxScale);
+    boxEncodingsInfo.SetQuantizationOffset(boxOffset);
+    scoresInfo.SetQuantizationScale(scoreScale);
+    scoresInfo.SetQuantizationOffset(scoreOffset);
+    anchorsInfo.SetQuantizationScale(anchorScale);
+    anchorsInfo.SetQuantizationOffset(anchorOffset);
+
+    // Builds up the structure of the network
+    armnn::INetworkPtr net = CreateDetectionPostProcessNetwork<T>(boxEncodingsInfo, scoresInfo,
+                                                                  anchorsInfo, anchors, useRegularNms);
+
+    BOOST_TEST_CHECKPOINT("create a network");
+
+    std::map<int, std::vector<T>> inputTensorData = {{ 0, boxEncodings }, { 1, scores }};
+    std::map<int, std::vector<float>> expectedOutputData = {{ 0, expectedDetectionBoxes },
+                                                            { 1, expectedDetectionClasses },
+                                                            { 2, expectedDetectionScores },
+                                                            { 3, expectedNumDetections }};
+
+    EndToEndLayerTestImpl<ArmnnType, armnn::DataType::Float32>(
+        std::move(net), inputTensorData, expectedOutputData, backends);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+void DetectionPostProcessRegularNmsEndToEnd(const std::vector<BackendId>& backends,
+                                            const std::vector<T>& boxEncodings,
+                                            const std::vector<T>& scores,
+                                            const std::vector<T>& anchors,
+                                            float boxScale = 1.0f,
+                                            int32_t boxOffset = 0,
+                                            float scoreScale = 1.0f,
+                                            int32_t scoreOffset = 0,
+                                            float anchorScale = 1.0f,
+                                            int32_t anchorOffset = 0)
+{
+    std::vector<float> expectedDetectionBoxes({
+        0.0f, 10.0f, 1.0f, 11.0f,
+        0.0f, 10.0f, 1.0f, 11.0f,
+        0.0f, 0.0f, 0.0f, 0.0f
+    });
+    std::vector<float> expectedDetectionScores({ 0.95f, 0.93f, 0.0f });
+    std::vector<float> expectedDetectionClasses({ 1.0f, 0.0f, 0.0f });
+    std::vector<float> expectedNumDetections({ 2.0f });
+
+    DetectionPostProcessEndToEnd<ArmnnType>(backends, true, boxEncodings, scores, anchors,
+                                            expectedDetectionBoxes, expectedDetectionClasses,
+                                            expectedDetectionScores, expectedNumDetections,
+                                            boxScale, boxOffset, scoreScale, scoreOffset,
+                                            anchorScale, anchorOffset);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+void DetectionPostProcessFastNmsEndToEnd(const std::vector<BackendId>& backends,
+                                         const std::vector<T>& boxEncodings,
+                                         const std::vector<T>& scores,
+                                         const std::vector<T>& anchors,
+                                         float boxScale = 1.0f,
+                                         int32_t boxOffset = 0,
+                                         float scoreScale = 1.0f,
+                                         int32_t scoreOffset = 0,
+                                         float anchorScale = 1.0f,
+                                         int32_t anchorOffset = 0)
+{
+    std::vector<float> expectedDetectionBoxes({
+        0.0f, 10.0f, 1.0f, 11.0f,
+        0.0f, 0.0f, 1.0f, 1.0f,
+        0.0f, 100.0f, 1.0f, 101.0f
+    });
+    std::vector<float> expectedDetectionScores({ 0.95f, 0.9f, 0.3f });
+    std::vector<float> expectedDetectionClasses({ 1.0f, 0.0f, 0.0f });
+    std::vector<float> expectedNumDetections({ 3.0f });
+
+    DetectionPostProcessEndToEnd<ArmnnType>(backends, false, boxEncodings, scores, anchors,
+                                            expectedDetectionBoxes, expectedDetectionClasses,
+                                            expectedDetectionScores, expectedNumDetections,
+                                            boxScale, boxOffset, scoreScale, scoreOffset,
+                                            anchorScale, anchorOffset);
+}
+
+} // anonymous namespace
diff --git a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
index d17b61e..a04fdf7 100644
--- a/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
+++ b/src/backends/backendsCommon/test/EndToEndTestImpl.hpp
@@ -163,7 +163,10 @@
         }
         else
         {
-            BOOST_TEST(it.second == out);
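+            // Compare element-wise with a small tolerance to absorb floating-point rounding.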
+            for (unsigned int i = 0; i < out.size(); ++i)
+            {
+                BOOST_TEST(it.second[i] == out[i], boost::test_tools::tolerance(0.000001f));
+            }
         }
     }
 }
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 9c1ce1e..3bf83bd 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -154,7 +154,16 @@
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateDetectionPostProcess(
     const armnn::DetectionPostProcessQueueDescriptor& descriptor, const armnn::WorkloadInfo& info) const
 {
-    return MakeWorkload<RefDetectionPostProcessFloat32Workload, RefDetectionPostProcessUint8Workload>(descriptor, info);
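+    // DetectionPostProcess outputs are always Float32, so select the workload
+    // explicitly from the input data type.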
+    const DataType dataType = info.m_InputTensorInfos[0].GetDataType();
+    switch (dataType)
+    {
+        case DataType::Float32:
+            return std::make_unique<RefDetectionPostProcessFloat32Workload>(descriptor, info);
+        case DataType::QuantisedAsymm8:
+            return std::make_unique<RefDetectionPostProcessUint8Workload>(descriptor, info);
+        default:
+            return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+    }
 }
 
 std::unique_ptr<armnn::IWorkload> RefWorkloadFactory::CreateNormalization(
diff --git a/src/backends/reference/test/RefDetectionPostProcessTests.cpp b/src/backends/reference/test/RefDetectionPostProcessTests.cpp
index 39403f0..a9faff7 100644
--- a/src/backends/reference/test/RefDetectionPostProcessTests.cpp
+++ b/src/backends/reference/test/RefDetectionPostProcessTests.cpp
@@ -74,8 +74,8 @@
                                   const std::vector<float>& expectedNumDetections)
 {
     armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, armnn::DataType::Float32);
-    armnn::TensorInfo scoresInfo({ 1, 6, 4 }, armnn::DataType::Float32);
-    armnn::TensorInfo anchorsInfo({ 1, 6, 4 }, armnn::DataType::Float32);
+    armnn::TensorInfo scoresInfo({ 1, 6, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo anchorsInfo({ 6, 4 }, armnn::DataType::Float32);
 
     armnn::TensorInfo detectionBoxesInfo({ 1, 3, 4 }, armnn::DataType::Float32);
     armnn::TensorInfo detectionScoresInfo({ 1, 3 }, armnn::DataType::Float32);
diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp
index 802167a..c89e586 100644
--- a/src/backends/reference/test/RefEndToEndTests.cpp
+++ b/src/backends/reference/test/RefEndToEndTests.cpp
@@ -4,6 +4,8 @@
 //
 
 #include <backendsCommon/test/EndToEndTestImpl.hpp>
+
+#include <backendsCommon/test/DetectionPostProcessTestImpl.hpp>
 #include <backendsCommon/test/GatherEndToEndTestImpl.hpp>
 #include <backendsCommon/test/MergerTestImpl.hpp>
 #include <backendsCommon/test/ArithmeticTestImpl.hpp>
@@ -453,4 +455,168 @@
     GatherMultiDimEndToEnd<armnn::DataType::QuantisedAsymm8>(defaultBackends);
 }
 
+BOOST_AUTO_TEST_CASE(RefDetectionPostProcessRegularNmsTest)
+{
+    std::vector<float> boxEncodings({
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, -1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f
+    });
+    std::vector<float> scores({
+        0.0f, 0.9f, 0.8f,
+        0.0f, 0.75f, 0.72f,
+        0.0f, 0.6f, 0.5f,
+        0.0f, 0.93f, 0.95f,
+        0.0f, 0.5f, 0.4f,
+        0.0f, 0.3f, 0.2f
+    });
+    std::vector<float> anchors({
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 10.5f, 1.0f, 1.0f,
+        0.5f, 10.5f, 1.0f, 1.0f,
+        0.5f, 100.5f, 1.0f, 1.0f
+    });
+    DetectionPostProcessRegularNmsEndToEnd<armnn::DataType::Float32>(defaultBackends, boxEncodings, scores, anchors);
+}
+
+inline void QuantizeData(uint8_t* quant, const float* dequant, const TensorInfo& info)
+{
+    for (size_t i = 0; i < info.GetNumElements(); i++)
+    {
+        quant[i] = armnn::Quantize<uint8_t>(dequant[i], info.GetQuantizationScale(), info.GetQuantizationOffset());
+    }
+}
+
+BOOST_AUTO_TEST_CASE(RefDetectionPostProcessRegularNmsUint8Test)
+{
+    armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, armnn::DataType::Float32);
+    armnn::TensorInfo scoresInfo({ 1, 6, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo anchorsInfo({ 6, 4 }, armnn::DataType::Float32);
+
+    boxEncodingsInfo.SetQuantizationScale(1.0f);
+    boxEncodingsInfo.SetQuantizationOffset(1);
+    scoresInfo.SetQuantizationScale(0.01f);
+    scoresInfo.SetQuantizationOffset(0);
+    anchorsInfo.SetQuantizationScale(0.5f);
+    anchorsInfo.SetQuantizationOffset(0);
+
+    std::vector<float> boxEncodings({
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, -1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f
+    });
+    std::vector<float> scores({
+        0.0f, 0.9f, 0.8f,
+        0.0f, 0.75f, 0.72f,
+        0.0f, 0.6f, 0.5f,
+        0.0f, 0.93f, 0.95f,
+        0.0f, 0.5f, 0.4f,
+        0.0f, 0.3f, 0.2f
+    });
+    std::vector<float> anchors({
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 10.5f, 1.0f, 1.0f,
+        0.5f, 10.5f, 1.0f, 1.0f,
+        0.5f, 100.5f, 1.0f, 1.0f
+    });
+
+    std::vector<uint8_t> qBoxEncodings(boxEncodings.size(), 0);
+    std::vector<uint8_t> qScores(scores.size(), 0);
+    std::vector<uint8_t> qAnchors(anchors.size(), 0);
+    QuantizeData(qBoxEncodings.data(), boxEncodings.data(), boxEncodingsInfo);
+    QuantizeData(qScores.data(), scores.data(), scoresInfo);
+    QuantizeData(qAnchors.data(), anchors.data(), anchorsInfo);
+    DetectionPostProcessRegularNmsEndToEnd<armnn::DataType::QuantisedAsymm8>(defaultBackends, qBoxEncodings,
+                                                                             qScores, qAnchors,
+                                                                             1.0f, 1, 0.01f, 0, 0.5f, 0);
+}
+
+BOOST_AUTO_TEST_CASE(RefDetectionPostProcessFastNmsTest)
+{
+    std::vector<float> boxEncodings({
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, -1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f
+    });
+    std::vector<float> scores({
+        0.0f, 0.9f, 0.8f,
+        0.0f, 0.75f, 0.72f,
+        0.0f, 0.6f, 0.5f,
+        0.0f, 0.93f, 0.95f,
+        0.0f, 0.5f, 0.4f,
+        0.0f, 0.3f, 0.2f
+    });
+    std::vector<float> anchors({
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 10.5f, 1.0f, 1.0f,
+        0.5f, 10.5f, 1.0f, 1.0f,
+        0.5f, 100.5f, 1.0f, 1.0f
+    });
+    DetectionPostProcessFastNmsEndToEnd<armnn::DataType::Float32>(defaultBackends, boxEncodings, scores, anchors);
+}
+
+BOOST_AUTO_TEST_CASE(RefDetectionPostProcessFastNmsUint8Test)
+{
+    armnn::TensorInfo boxEncodingsInfo({ 1, 6, 4 }, armnn::DataType::Float32);
+    armnn::TensorInfo scoresInfo({ 1, 6, 3 }, armnn::DataType::Float32);
+    armnn::TensorInfo anchorsInfo({ 6, 4 }, armnn::DataType::Float32);
+
+    boxEncodingsInfo.SetQuantizationScale(1.0f);
+    boxEncodingsInfo.SetQuantizationOffset(1);
+    scoresInfo.SetQuantizationScale(0.01f);
+    scoresInfo.SetQuantizationOffset(0);
+    anchorsInfo.SetQuantizationScale(0.5f);
+    anchorsInfo.SetQuantizationOffset(0);
+
+    std::vector<float> boxEncodings({
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, -1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 1.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f
+    });
+    std::vector<float> scores({
+        0.0f, 0.9f, 0.8f,
+        0.0f, 0.75f, 0.72f,
+        0.0f, 0.6f, 0.5f,
+        0.0f, 0.93f, 0.95f,
+        0.0f, 0.5f, 0.4f,
+        0.0f, 0.3f, 0.2f
+    });
+    std::vector<float> anchors({
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 0.5f, 1.0f, 1.0f,
+        0.5f, 10.5f, 1.0f, 1.0f,
+        0.5f, 10.5f, 1.0f, 1.0f,
+        0.5f, 100.5f, 1.0f, 1.0f
+    });
+
+    std::vector<uint8_t> qBoxEncodings(boxEncodings.size(), 0);
+    std::vector<uint8_t> qScores(scores.size(), 0);
+    std::vector<uint8_t> qAnchors(anchors.size(), 0);
+    QuantizeData(qBoxEncodings.data(), boxEncodings.data(), boxEncodingsInfo);
+    QuantizeData(qScores.data(), scores.data(), scoresInfo);
+    QuantizeData(qAnchors.data(), anchors.data(), anchorsInfo);
+    DetectionPostProcessFastNmsEndToEnd<armnn::DataType::QuantisedAsymm8>(defaultBackends, qBoxEncodings,
+                                                                          qScores, qAnchors,
+                                                                          1.0f, 1, 0.01f, 0, 0.5f, 0);
+}
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/workloads/DetectionPostProcess.cpp b/src/backends/reference/workloads/DetectionPostProcess.cpp
index 958de82..2eb35f5 100644
--- a/src/backends/reference/workloads/DetectionPostProcess.cpp
+++ b/src/backends/reference/workloads/DetectionPostProcess.cpp
@@ -105,15 +105,15 @@
     for (unsigned int i = 0; i < numOutput; ++i)
         {
             unsigned int boxIndex = i * 4;
-            unsigned int boxConorIndex = selectedBoxes[outputIndices[i]] * 4;
             if (i < numSelected)
             {
+                unsigned int boxCornerIndex = selectedBoxes[outputIndices[i]] * 4;
                 detectionScores[i] = selectedScores[outputIndices[i]];
                 detectionClasses[i] = boost::numeric_cast<float>(selectedClasses[outputIndices[i]]);
-                detectionBoxes[boxIndex] = boxCorners[boxConorIndex];
-                detectionBoxes[boxIndex + 1] = boxCorners[boxConorIndex + 1];
-                detectionBoxes[boxIndex + 2] = boxCorners[boxConorIndex + 2];
-                detectionBoxes[boxIndex + 3] = boxCorners[boxConorIndex + 3];
+                detectionBoxes[boxIndex] = boxCorners[boxCornerIndex];
+                detectionBoxes[boxIndex + 1] = boxCorners[boxCornerIndex + 1];
+                detectionBoxes[boxIndex + 2] = boxCorners[boxCornerIndex + 2];
+                detectionBoxes[boxIndex + 3] = boxCorners[boxCornerIndex + 3];
             }
             else
             {
@@ -125,7 +125,7 @@
                 detectionBoxes[boxIndex + 3] = 0.0f;
             }
         }
-        numDetections[0] = boost::numeric_cast<float>(numOutput);
+        numDetections[0] = boost::numeric_cast<float>(numSelected);
 }
 
 } // anonymous namespace
@@ -216,7 +216,7 @@
         std::vector<unsigned int> outputIndices = GenerateRangeK(numSelected);
         TopKSort(numOutput, outputIndices.data(), selectedScoresAfterNms.data(), numSelected);
 
-        AllocateOutputData(numOutput, numSelected, boxCorners, outputIndices,
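+        // Fill every output slot (detectionBoxesInfo.GetShape()[1] == max detections);
+        // only the first numOutput entries hold real detections, the rest are zeroed.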
+        AllocateOutputData(detectionBoxesInfo.GetShape()[1], numOutput, boxCorners, outputIndices,
                            selectedBoxesAfterNms, selectedClasses, selectedScoresAfterNms,
                            detectionBoxes, detectionScores, detectionClasses, numDetections);
     }
@@ -255,7 +255,7 @@
         unsigned int numSelected = boost::numeric_cast<unsigned int>(selectedIndices.size());
         unsigned int numOutput = std::min(desc.m_MaxDetections,  numSelected);
 
-        AllocateOutputData(numOutput, numSelected, boxCorners, selectedIndices,
+        AllocateOutputData(detectionBoxesInfo.GetShape()[1], numOutput, boxCorners, selectedIndices,
                            boxIndices, maxScoreClasses, maxScores,
                            detectionBoxes, detectionScores, detectionClasses, numDetections);
     }