IVGCVSW-3307 Introduce RefTensorHandle

Use it for intermediate tensors on the reference backend. This lays the
groundwork for memory management in the reference backend.

Change-Id: I7d3ee132cac31bde70ae6e1b815f4f0b03d550a6
Signed-off-by: Matthew Bentham <Matthew.Bentham@arm.com>
diff --git a/src/armnnUtils/TensorIOUtils.hpp b/src/armnnUtils/TensorIOUtils.hpp
index bf5a37b..47e0a32 100644
--- a/src/armnnUtils/TensorIOUtils.hpp
+++ b/src/armnnUtils/TensorIOUtils.hpp
@@ -37,7 +37,11 @@
                              {
                                  if (value.size() != inputBinding.second.GetNumElements())
                                  {
-                                    throw armnn::Exception("Input tensor has incorrect size");
+                                    std::ostringstream msg;
+                                    msg << "Input tensor has incorrect size (expected "
+                                        << inputBinding.second.GetNumElements() << ", got "
+                                        << value.size() << ")";
+                                    throw armnn::Exception(msg.str());
                                  }
 
                                  armnn::ConstTensor inputTensor(inputBinding.second, value.data());
@@ -84,4 +88,4 @@
     return outputTensors;
 }
 
-} // namespace armnnUtils
\ No newline at end of file
+} // namespace armnnUtils
diff --git a/src/backends/aclCommon/test/CreateWorkloadClNeon.hpp b/src/backends/aclCommon/test/CreateWorkloadClNeon.hpp
index 03bcf32..53d4dc9 100644
--- a/src/backends/aclCommon/test/CreateWorkloadClNeon.hpp
+++ b/src/backends/aclCommon/test/CreateWorkloadClNeon.hpp
@@ -8,6 +8,7 @@
 
 #include <backendsCommon/MemCopyWorkload.hpp>
 #include <reference/RefWorkloadFactory.hpp>
+#include <reference/RefTensorHandle.hpp>
 
 #if defined(ARMCOMPUTECL_ENABLED)
 #include <cl/ClTensorHandle.hpp>
@@ -92,7 +93,7 @@
     MemCopyQueueDescriptor queueDescriptor1 = workload1->GetData();
     BOOST_TEST(queueDescriptor1.m_Inputs.size() == 1);
     BOOST_TEST(queueDescriptor1.m_Outputs.size() == 1);
-    auto inputHandle1  = boost::polymorphic_downcast<ConstCpuTensorHandle*>(queueDescriptor1.m_Inputs[0]);
+    auto inputHandle1  = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor1.m_Inputs[0]);
     auto outputHandle1 = boost::polymorphic_downcast<IComputeTensorHandle*>(queueDescriptor1.m_Outputs[0]);
     BOOST_TEST((inputHandle1->GetTensorInfo() == TensorInfo({2, 3}, DataType::Float32)));
     BOOST_TEST(CompareTensorHandleShape<IComputeTensorHandle>(outputHandle1, {2, 3}));
@@ -102,7 +103,7 @@
     BOOST_TEST(queueDescriptor2.m_Inputs.size() == 1);
     BOOST_TEST(queueDescriptor2.m_Outputs.size() == 1);
     auto inputHandle2  = boost::polymorphic_downcast<IComputeTensorHandle*>(queueDescriptor2.m_Inputs[0]);
-    auto outputHandle2 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor2.m_Outputs[0]);
+    auto outputHandle2 = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor2.m_Outputs[0]);
     BOOST_TEST(CompareTensorHandleShape<IComputeTensorHandle>(inputHandle2, {2, 3}));
     BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({2, 3}, DataType::Float32)));
 }
diff --git a/src/backends/backendsCommon/CpuTensorHandle.cpp b/src/backends/backendsCommon/CpuTensorHandle.cpp
index 9dcd3f3..de83048 100644
--- a/src/backends/backendsCommon/CpuTensorHandle.cpp
+++ b/src/backends/backendsCommon/CpuTensorHandle.cpp
@@ -11,6 +11,22 @@
 namespace armnn
 {
 
+TensorShape GetUnpaddedTensorStrides(const TensorInfo& tensorInfo)
+{
+    TensorShape shape(tensorInfo.GetShape());
+    auto size = GetDataTypeSize(tensorInfo.GetDataType());
+    auto runningSize = size;
+    std::vector<unsigned int> strides(shape.GetNumDimensions());
+    auto lastIdx = shape.GetNumDimensions()-1;
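+    // Walk from the innermost dimension outwards: each stride is the element
+    // size multiplied by the number of elements in all inner dimensions.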
+    for (unsigned int i=0; i < lastIdx ; i++)
+    {
+        strides[lastIdx-i] = runningSize;
+        runningSize *= shape[lastIdx-i];
+    }
+    strides[0] = runningSize;
+    return TensorShape(shape.GetNumDimensions(), strides.data());
+}
+
 ConstCpuTensorHandle::ConstCpuTensorHandle(const TensorInfo& tensorInfo)
 : m_TensorInfo(tensorInfo)
 , m_Memory(nullptr)
diff --git a/src/backends/backendsCommon/CpuTensorHandle.hpp b/src/backends/backendsCommon/CpuTensorHandle.hpp
index dd6413f..5fefc12 100644
--- a/src/backends/backendsCommon/CpuTensorHandle.hpp
+++ b/src/backends/backendsCommon/CpuTensorHandle.hpp
@@ -16,6 +16,10 @@
 namespace armnn
 {
 
+// Get a TensorShape representing the strides (in bytes) for each dimension
+// of a tensor, assuming fully packed data with no padding
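+// (e.g. a Float32 tensor of shape {2,3,4} has strides {48,16,4})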
+TensorShape GetUnpaddedTensorStrides(const TensorInfo& tensorInfo);
+
 // Abstract tensor handles wrapping a CPU-readable region of memory, interpreting it as tensor data.
 class ConstCpuTensorHandle : public ITensorHandle
 {
@@ -41,18 +45,7 @@
 
     TensorShape GetStrides() const override
     {
-        TensorShape shape(m_TensorInfo.GetShape());
-        auto size = GetDataTypeSize(m_TensorInfo.GetDataType());
-        auto runningSize = size;
-        std::vector<unsigned int> strides(shape.GetNumDimensions());
-        auto lastIdx = shape.GetNumDimensions()-1;
-        for (unsigned int i=0; i < lastIdx ; i++)
-        {
-            strides[lastIdx-i] = runningSize;
-            runningSize *= shape[lastIdx-i];
-        }
-        strides[0] = runningSize;
-        return TensorShape(shape.GetNumDimensions(), strides.data());
+        return GetUnpaddedTensorStrides(m_TensorInfo);
     }
     TensorShape GetShape() const override { return m_TensorInfo.GetShape(); }
 
@@ -63,8 +56,8 @@
 
 private:
     // Only used for testing
-    void CopyOutTo(void *) const override {}
-    void CopyInFrom(const void*) override {}
+    void CopyOutTo(void*) const override { BOOST_ASSERT_MSG(false, "Unimplemented"); }
+    void CopyInFrom(const void*) override { BOOST_ASSERT_MSG(false, "Unimplemented"); }
 
     ConstCpuTensorHandle(const ConstCpuTensorHandle& other) = delete;
     ConstCpuTensorHandle& operator=(const ConstCpuTensorHandle& other) = delete;
diff --git a/src/backends/reference/CMakeLists.txt b/src/backends/reference/CMakeLists.txt
index ff16f18..fabffea 100644
--- a/src/backends/reference/CMakeLists.txt
+++ b/src/backends/reference/CMakeLists.txt
@@ -7,6 +7,8 @@
     RefBackend.cpp
     RefBackend.hpp
     RefBackendId.hpp
     RefLayerSupport.cpp
     RefLayerSupport.hpp
+    RefTensorHandle.cpp
+    RefTensorHandle.hpp
     RefWorkloadFactory.cpp
diff --git a/src/backends/reference/RefTensorHandle.cpp b/src/backends/reference/RefTensorHandle.cpp
new file mode 100644
index 0000000..b7670f6
--- /dev/null
+++ b/src/backends/reference/RefTensorHandle.cpp
@@ -0,0 +1,45 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "RefTensorHandle.hpp"
+
+namespace armnn
+{
+
+RefTensorHandle::RefTensorHandle(const TensorInfo &tensorInfo):
+    m_TensorInfo(tensorInfo),
+    m_Memory(nullptr)
+{
+
+}
+
+RefTensorHandle::~RefTensorHandle()
+{
+    ::operator delete(m_Memory);
+}
+
+void RefTensorHandle::Allocate()
+{
+    if (m_Memory == nullptr)
+    {
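+        // Raw, uninitialised storage; elements are written through Map() or CopyInFrom().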
+        m_Memory = ::operator new(m_TensorInfo.GetNumBytes());
+    }
+    else
+    {
+        throw InvalidArgumentException("RefTensorHandle::Allocate Trying to allocate a RefTensorHandle "
+                                       "that already has allocated memory.");
+    }
+}
+
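+// Testing-only helpers (see "Only used for testing" in RefTensorHandle.hpp);
+// both assume Allocate() has already been called.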
+void RefTensorHandle::CopyOutTo(void* memory) const
+{
+    memcpy(memory, m_Memory, m_TensorInfo.GetNumBytes());
+}
+
+void RefTensorHandle::CopyInFrom(const void* memory)
+{
+    memcpy(m_Memory, memory, m_TensorInfo.GetNumBytes());
+}
+
+} // namespace armnn
diff --git a/src/backends/reference/RefTensorHandle.hpp b/src/backends/reference/RefTensorHandle.hpp
new file mode 100644
index 0000000..66d840a
--- /dev/null
+++ b/src/backends/reference/RefTensorHandle.hpp
@@ -0,0 +1,66 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+
+namespace armnn
+{
+
+// An ITensorHandle implementation that allocates and owns its own memory.
+// Used for intermediate tensors on the reference backend, laying the
+// groundwork for memory management there.
+class RefTensorHandle : public ITensorHandle
+{
+public:
+    RefTensorHandle(const TensorInfo& tensorInfo);
+
+    ~RefTensorHandle();
+
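+    // No-op for now: a hook for the memory management this change prepares for.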
+    virtual void Manage() override
+    {}
+
+    virtual ITensorHandle* GetParent() const override
+    {
+        return nullptr;
+    }
+
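+    // Returns the backing memory; nullptr until Allocate() has been called.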
+    virtual const void* Map(bool /* blocking = true */) const override
+    {
+        return m_Memory;
+    }
+
+    virtual void Unmap() const override
+    {}
+
+    virtual void Allocate() override;
+
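+    // Strides assume fully packed, unpadded data (see GetUnpaddedTensorStrides).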
+    TensorShape GetStrides() const override
+    {
+        return GetUnpaddedTensorStrides(m_TensorInfo);
+    }
+
+    TensorShape GetShape() const override
+    {
+        return m_TensorInfo.GetShape();
+    }
+
+    const TensorInfo& GetTensorInfo() const
+    {
+        return m_TensorInfo;
+    }
+
+private:
+    // Only used for testing
+    void CopyOutTo(void*) const override;
+    void CopyInFrom(const void*) override;
+
+    RefTensorHandle(const RefTensorHandle& other) = delete;
+
+    RefTensorHandle& operator=(const RefTensorHandle& other) = delete;
+
+    TensorInfo m_TensorInfo;
+    void* m_Memory;
+};
+
+} // namespace armnn
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 95a4419..8d2a2b1 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -2,13 +2,14 @@
 // Copyright © 2017 Arm Ltd. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
+#include <Layer.hpp>
 #include <backendsCommon/CpuTensorHandle.hpp>
 #include <backendsCommon/MemCopyWorkload.hpp>
 #include <backendsCommon/MakeWorkloadHelper.hpp>
 #include "RefWorkloadFactory.hpp"
 #include "RefBackendId.hpp"
 #include "workloads/RefWorkloads.hpp"
-#include "Layer.hpp"
+#include "RefTensorHandle.hpp"
 
 #include <boost/log/trivial.hpp>
 
@@ -72,13 +73,13 @@
 
 std::unique_ptr<ITensorHandle> RefWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const
 {
-    return std::make_unique<ScopedCpuTensorHandle>(tensorInfo);
+    return std::make_unique<RefTensorHandle>(tensorInfo);
 }
 
 std::unique_ptr<ITensorHandle> RefWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo,
                                                                       DataLayout dataLayout) const
 {
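+    // dataLayout does not affect allocation here; RefTensorHandle always
+    // stores fully packed data.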
-    return std::make_unique<ScopedCpuTensorHandle>(tensorInfo);
+    return std::make_unique<RefTensorHandle>(tensorInfo);
 }
 
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor,
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 7995654..12e5774 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -10,6 +10,7 @@
 BACKEND_SOURCES := \
         RefBackend.cpp \
         RefLayerSupport.cpp \
+        RefTensorHandle.cpp \
         RefWorkloadFactory.cpp \
         workloads/Activation.cpp \
         workloads/BatchNormImpl.cpp \
@@ -78,6 +79,7 @@
 
 BACKEND_TEST_SOURCES := \
         test/RefCreateWorkloadTests.cpp \
+        test/RefDetectionPostProcessTests.cpp \
         test/RefEndToEndTests.cpp \
         test/RefJsonPrinterTests.cpp \
         test/RefLayerSupportTests.cpp \
diff --git a/src/backends/reference/test/RefCreateWorkloadTests.cpp b/src/backends/reference/test/RefCreateWorkloadTests.cpp
index 68df349..9071679 100644
--- a/src/backends/reference/test/RefCreateWorkloadTests.cpp
+++ b/src/backends/reference/test/RefCreateWorkloadTests.cpp
@@ -5,7 +5,7 @@
 
 #include <test/CreateWorkload.hpp>
 
-#include <backendsCommon/CpuTensorHandle.hpp>
+#include <reference/RefTensorHandle.hpp>
 #include <reference/RefWorkloadFactory.hpp>
 #include <reference/workloads/RefWorkloads.hpp>
 
@@ -16,8 +16,8 @@
 void CheckInputOutput(std::unique_ptr<Workload> workload, const TensorInfo& inputInfo, const TensorInfo& outputInfo)
 {
     auto queueDescriptor = workload->GetData();
-    auto inputHandle  = boost::polymorphic_downcast<ConstCpuTensorHandle*>(queueDescriptor.m_Inputs[0]);
-    auto outputHandle = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[0]);
+    auto inputHandle  = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto outputHandle = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Outputs[0]);
     BOOST_TEST((inputHandle->GetTensorInfo() == inputInfo));
     BOOST_TEST((outputHandle->GetTensorInfo() == outputInfo));
 }
@@ -29,9 +29,9 @@
                        const TensorInfo&         outputInfo)
 {
     auto queueDescriptor = workload->GetData();
-    auto inputHandle0     = boost::polymorphic_downcast<ConstCpuTensorHandle*>(queueDescriptor.m_Inputs[0]);
-    auto inputHandle1     = boost::polymorphic_downcast<ConstCpuTensorHandle*>(queueDescriptor.m_Inputs[1]);
-    auto outputHandle    = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[0]);
+    auto inputHandle0 = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto inputHandle1 = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Inputs[1]);
+    auto outputHandle = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Outputs[0]);
     BOOST_TEST((inputHandle0->GetTensorInfo() == inputInfo0));
     BOOST_TEST((inputHandle1->GetTensorInfo() == inputInfo1));
     BOOST_TEST((outputHandle->GetTensorInfo() == outputInfo));
@@ -497,16 +497,16 @@
 
     // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest).
     SplitterQueueDescriptor queueDescriptor = workload->GetData();
-    auto inputHandle = boost::polymorphic_downcast<ConstCpuTensorHandle*>(queueDescriptor.m_Inputs[0]);
+    auto inputHandle = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Inputs[0]);
     BOOST_TEST((inputHandle->GetTensorInfo() == TensorInfo({ 5, 7, 7 }, DataType)));
 
-    auto outputHandle0 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[0]);
+    auto outputHandle0 = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Outputs[0]);
     BOOST_TEST((outputHandle0->GetTensorInfo() == TensorInfo({ 1, 7, 7 }, DataType)));
 
-    auto outputHandle1 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[1]);
+    auto outputHandle1 = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Outputs[1]);
     BOOST_TEST((outputHandle1->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, DataType)));
 
-    auto outputHandle2 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[2]);
+    auto outputHandle2 = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Outputs[2]);
     BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, DataType)));
 }
 
@@ -538,10 +538,10 @@
     auto wlConcat = std::move(workloads.second);
 
     //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction.
-    armnn::CpuTensorHandle* sOut0 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[0]);
-    armnn::CpuTensorHandle* sOut1 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[1]);
-    armnn::CpuTensorHandle* mIn0 = dynamic_cast<armnn::CpuTensorHandle*>(wlConcat->GetData().m_Inputs[0]);
-    armnn::CpuTensorHandle* mIn1 = dynamic_cast<armnn::CpuTensorHandle*>(wlConcat->GetData().m_Inputs[1]);
+    armnn::RefTensorHandle* sOut0 = dynamic_cast<armnn::RefTensorHandle*>(wlSplitter->GetData().m_Outputs[0]);
+    armnn::RefTensorHandle* sOut1 = dynamic_cast<armnn::RefTensorHandle*>(wlSplitter->GetData().m_Outputs[1]);
+    armnn::RefTensorHandle* mIn0 = dynamic_cast<armnn::RefTensorHandle*>(wlConcat->GetData().m_Inputs[0]);
+    armnn::RefTensorHandle* mIn1 = dynamic_cast<armnn::RefTensorHandle*>(wlConcat->GetData().m_Inputs[1]);
 
     BOOST_TEST(sOut0);
     BOOST_TEST(sOut1);
@@ -580,12 +580,12 @@
     CreateSplitterMultipleInputsOneOutputWorkloadTest<SplitterWorkloadType,
         ActivationWorkloadType, DataType>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1);
 
-    armnn::CpuTensorHandle* sOut0 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[0]);
-    armnn::CpuTensorHandle* sOut1 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[1]);
-    armnn::CpuTensorHandle* activ0_0Im = dynamic_cast<armnn::CpuTensorHandle*>(wlActiv0_0->GetData().m_Inputs[0]);
-    armnn::CpuTensorHandle* activ0_1Im = dynamic_cast<armnn::CpuTensorHandle*>(wlActiv0_1->GetData().m_Inputs[0]);
-    armnn::CpuTensorHandle* activ1_0Im = dynamic_cast<armnn::CpuTensorHandle*>(wlActiv1_0->GetData().m_Inputs[0]);
-    armnn::CpuTensorHandle* activ1_1Im = dynamic_cast<armnn::CpuTensorHandle*>(wlActiv1_1->GetData().m_Inputs[0]);
+    armnn::RefTensorHandle* sOut0 = dynamic_cast<armnn::RefTensorHandle*>(wlSplitter->GetData().m_Outputs[0]);
+    armnn::RefTensorHandle* sOut1 = dynamic_cast<armnn::RefTensorHandle*>(wlSplitter->GetData().m_Outputs[1]);
+    armnn::RefTensorHandle* activ0_0Im = dynamic_cast<armnn::RefTensorHandle*>(wlActiv0_0->GetData().m_Inputs[0]);
+    armnn::RefTensorHandle* activ0_1Im = dynamic_cast<armnn::RefTensorHandle*>(wlActiv0_1->GetData().m_Inputs[0]);
+    armnn::RefTensorHandle* activ1_0Im = dynamic_cast<armnn::RefTensorHandle*>(wlActiv1_0->GetData().m_Inputs[0]);
+    armnn::RefTensorHandle* activ1_1Im = dynamic_cast<armnn::RefTensorHandle*>(wlActiv1_1->GetData().m_Inputs[0]);
 
 
     BOOST_TEST(sOut0);
@@ -874,7 +874,7 @@
 
     // Check output is as expected
     auto queueDescriptor = workload->GetData();
-    auto outputHandle = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[0]);
+    auto outputHandle = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Outputs[0]);
     BOOST_TEST((outputHandle->GetTensorInfo() == TensorInfo(outputShape, DataType)));
 }
 
@@ -914,7 +914,7 @@
 
     // Check output is as expected
     auto queueDescriptor = workload->GetData();
-    auto outputHandle = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[0]);
+    auto outputHandle = boost::polymorphic_downcast<RefTensorHandle*>(queueDescriptor.m_Outputs[0]);
     BOOST_TEST((outputHandle->GetTensorInfo() == TensorInfo(outputShape, dataType)));
 }
 
diff --git a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp
index c293066..c21ef76 100644
--- a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp
+++ b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp
@@ -26,4 +26,4 @@
 }
 
 
-} //namespace armnn
\ No newline at end of file
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
index 0824d5c..a660d2e 100644
--- a/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefConvolution2dWorkload.cpp
@@ -17,15 +17,16 @@
         : BaseWorkload<Convolution2dQueueDescriptor>(descriptor, info)
 {
     m_Weight = std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight));
-    const TensorInfo& rFilterInfo = GetTensorInfo(m_Weight.get());
+    const TensorInfo& rFilterInfo = m_Weight->GetTensorInfo();
+
     m_FilterShape = rFilterInfo.GetShape();
     m_FilterDecoder = MakeDecoder<float>(rFilterInfo, m_Weight.get()->Map(true));
 
     if (descriptor.m_Parameters.m_BiasEnabled)
     {
         m_Bias = std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias));
-        const TensorInfo& biasInfo = GetTensorInfo(m_Bias.get());
-        m_BiasDecoder = MakeDecoder<float>(biasInfo, m_Bias.get()->Map(true));
+        const TensorInfo& biasInfo = m_Bias->GetTensorInfo();
+        m_BiasDecoder = MakeDecoder<float>(biasInfo, m_Bias->Map(true));
     }
 }
 
diff --git a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
index c7dc4af..48a20cf 100644
--- a/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefDepthwiseConvolution2dWorkload.cpp
@@ -20,15 +20,15 @@
         : BaseWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info)
 {
     m_Weight = std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight));
-    const TensorInfo& rFilterInfo = GetTensorInfo(m_Weight.get());
+    const TensorInfo& rFilterInfo = m_Weight->GetTensorInfo();
     m_FilterShape = rFilterInfo.GetShape();
-    m_FilterDecoder = MakeDecoder<float>(rFilterInfo, m_Weight.get()->Map(true));
+    m_FilterDecoder = MakeDecoder<float>(rFilterInfo, m_Weight->Map(true));
 
     if (descriptor.m_Parameters.m_BiasEnabled)
     {
         m_Bias = std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias));
-        const TensorInfo& biasInfo = GetTensorInfo(m_Bias.get());
-        m_BiasDecoder = MakeDecoder<float>(biasInfo, m_Bias.get()->Map(true));
+        const TensorInfo& biasInfo = m_Bias->GetTensorInfo();
+        m_BiasDecoder = MakeDecoder<float>(biasInfo, m_Bias->Map(true));
     }
 }
 
diff --git a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
index db24cc5..b9817ba 100644
--- a/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
+++ b/src/backends/reference/workloads/RefDetectionPostProcessWorkload.cpp
@@ -24,7 +24,7 @@
 
     const TensorInfo& boxEncodingsInfo = GetTensorInfo(m_Data.m_Inputs[0]);
     const TensorInfo& scoresInfo       = GetTensorInfo(m_Data.m_Inputs[1]);
-    const TensorInfo& anchorsInfo      = GetTensorInfo(m_Anchors.get());
+    const TensorInfo& anchorsInfo      = m_Anchors->GetTensorInfo();
 
     const TensorInfo& detectionBoxesInfo   = GetTensorInfo(m_Data.m_Outputs[0]);
     const TensorInfo& detectionClassesInfo = GetTensorInfo(m_Data.m_Outputs[1]);
diff --git a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
index dc7030e..c7a3d90 100644
--- a/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
+++ b/src/backends/reference/workloads/RefFullyConnectedWorkload.cpp
@@ -17,14 +17,14 @@
         : BaseWorkload<FullyConnectedQueueDescriptor>(descriptor, info),
           m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight)))
 {
-    const TensorInfo& rWeightInfo = GetTensorInfo(m_Weight.get());
+    const TensorInfo& rWeightInfo = m_Weight->GetTensorInfo();
     m_WeightShape = rWeightInfo.GetShape();
     m_WeightDecoder = MakeDecoder<float>(rWeightInfo, m_Weight->Map(true));
 
     if (descriptor.m_Parameters.m_BiasEnabled)
     {
         m_Bias = std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias));
-        const TensorInfo& biasInfo = GetTensorInfo(m_Bias.get());
+        const TensorInfo& biasInfo = m_Bias->GetTensorInfo();
         m_BiasDecoder = MakeDecoder<float>(biasInfo, m_Bias->Map(true));
     }
 }
diff --git a/src/backends/reference/workloads/RefPermuteWorkload.cpp b/src/backends/reference/workloads/RefPermuteWorkload.cpp
index 9e44d16..c943eb8 100644
--- a/src/backends/reference/workloads/RefPermuteWorkload.cpp
+++ b/src/backends/reference/workloads/RefPermuteWorkload.cpp
@@ -20,11 +20,11 @@
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, GetName() + "_Execute");
 
     const ITensorHandle*     src      = m_Data.m_Inputs[0];
-    const ITensorHandle*     dst      = m_Data.m_Outputs[0];
+    ITensorHandle*           dst      = m_Data.m_Outputs[0];
     const PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings;
 
     armnnUtils::Permute(GetTensorInfo(dst).GetShape(), mappings,
-                        GetConstCpuData<void>(src), GetCpuData<void>(dst), sizeof(T));
+                        src->Map(), dst->Map(), sizeof(T));
 }
 
 template class RefPermuteWorkload<DataType::Float16>;
diff --git a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
index 50dafca..ec60030 100644
--- a/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
+++ b/src/backends/reference/workloads/RefTransposeConvolution2dWorkload.cpp
@@ -19,7 +19,7 @@
 {
     // set up weights decoder
     m_Weights = std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight));
-    const TensorInfo& weightsInfo = GetTensorInfo(m_Weights.get());
+    const TensorInfo& weightsInfo = m_Weights->GetTensorInfo();
 
     m_WeightsDecoder = MakeDecoder<float>(weightsInfo, m_Weights.get()->Map(true));
     m_WeightsShape   = weightsInfo.GetShape();
@@ -28,7 +28,7 @@
     if (descriptor.m_Parameters.m_BiasEnabled)
     {
         m_Biases = std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias));
-        const TensorInfo& biasesInfo = GetTensorInfo(m_Biases.get());
+        const TensorInfo& biasesInfo = m_Biases->GetTensorInfo();
         m_BiasesDecoder = MakeDecoder<float>(biasesInfo, m_Biases.get()->Map(true));
     }
 }
diff --git a/src/backends/reference/workloads/RefWorkloadUtils.hpp b/src/backends/reference/workloads/RefWorkloadUtils.hpp
index ce79616..c3260c8 100644
--- a/src/backends/reference/workloads/RefWorkloadUtils.hpp
+++ b/src/backends/reference/workloads/RefWorkloadUtils.hpp
@@ -9,8 +9,10 @@
 
 #include <armnn/Tensor.hpp>
 #include <armnn/Types.hpp>
-#include <Half.hpp>
 
+#include <reference/RefTensorHandle.hpp>
+
+#include <Half.hpp>
 #include <boost/polymorphic_cast.hpp>
 
 namespace armnn
@@ -22,41 +24,24 @@
 
 inline const TensorInfo& GetTensorInfo(const ITensorHandle* tensorHandle)
 {
-    // We know that reference workloads use CpuTensorHandles only, so this cast is legitimate.
-    const ConstCpuTensorHandle* cpuTensorHandle =
-        boost::polymorphic_downcast<const ConstCpuTensorHandle*>(tensorHandle);
-    return cpuTensorHandle->GetTensorInfo();
+    // We know that reference workloads use RefTensorHandles for inputs and outputs,
+    // so this cast is legitimate.
+    const RefTensorHandle* refTensorHandle =
+        boost::polymorphic_downcast<const RefTensorHandle*>(tensorHandle);
+    return refTensorHandle->GetTensorInfo();
 }
 
-template <typename DataType>
-inline const DataType* GetConstCpuData(const ITensorHandle* tensorHandle)
-{
-    // We know that reference workloads use (Const)CpuTensorHandles only, so this cast is legitimate.
-    const ConstCpuTensorHandle* cpuTensorHandle =
-        boost::polymorphic_downcast<const ConstCpuTensorHandle*>(tensorHandle);
-    return cpuTensorHandle->GetConstTensor<DataType>();
-}
-
-template <typename DataType>
-inline DataType* GetCpuData(const ITensorHandle* tensorHandle)
-{
-    // We know that reference workloads use CpuTensorHandles only, so this cast is legitimate.
-    const CpuTensorHandle* cpuTensorHandle = boost::polymorphic_downcast<const CpuTensorHandle*>(tensorHandle);
-    return cpuTensorHandle->GetTensor<DataType>();
-};
-
 template <typename DataType, typename PayloadType>
 const DataType* GetInputTensorData(unsigned int idx, const PayloadType& data)
 {
     const ITensorHandle* tensorHandle = data.m_Inputs[idx];
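+    // Map() (blocking by default) exposes the handle's memory directly.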
-    return GetConstCpuData<DataType>(tensorHandle);
+    return reinterpret_cast<const DataType*>(tensorHandle->Map());
 }
 
 template <typename DataType, typename PayloadType>
 DataType* GetOutputTensorData(unsigned int idx, const PayloadType& data)
 {
-    const ITensorHandle* tensorHandle = data.m_Outputs[idx];
-    return GetCpuData<DataType>(tensorHandle);
+    ITensorHandle* tensorHandle = data.m_Outputs[idx];
+    return reinterpret_cast<DataType*>(tensorHandle->Map());
 }
 
 template <typename PayloadType>
@@ -87,35 +72,6 @@
 /// u8 helpers
 ////////////////////////////////////////////
 
-inline const uint8_t* GetConstCpuU8Data(const ITensorHandle* tensorHandle)
-{
-    // We know that reference workloads use (Const)CpuTensorHandles only, so this cast is legitimate.
-    const ConstCpuTensorHandle* cpuTensorHandle =
-        boost::polymorphic_downcast<const ConstCpuTensorHandle*>(tensorHandle);
-    return cpuTensorHandle->GetConstTensor<uint8_t>();
-};
-
-inline uint8_t* GetCpuU8Data(const ITensorHandle* tensorHandle)
-{
-    // We know that reference workloads use CpuTensorHandles only, so this cast is legitimate.
-    const CpuTensorHandle* cpuTensorHandle = boost::polymorphic_downcast<const CpuTensorHandle*>(tensorHandle);
-    return cpuTensorHandle->GetTensor<uint8_t>();
-};
-
-template <typename PayloadType>
-const uint8_t* GetInputTensorDataU8(unsigned int idx, const PayloadType& data)
-{
-    const ITensorHandle* tensorHandle = data.m_Inputs[idx];
-    return GetConstCpuU8Data(tensorHandle);
-}
-
-template <typename PayloadType>
-uint8_t* GetOutputTensorDataU8(unsigned int idx, const PayloadType& data)
-{
-    const ITensorHandle* tensorHandle = data.m_Outputs[idx];
-    return GetCpuU8Data(tensorHandle);
-}
-
 template<typename T>
 std::vector<float> Dequantize(const T* quant, const TensorInfo& info)
 {