IVGCVSW-2316 Add reference implementation and unit tests for Debug

Change-Id: Ib2e5de2a057da57ef77a9f5c4367d699d4773294
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index f1a4289..8107176 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -13,6 +13,7 @@
     Conv2dTestImpl.hpp
     ConvertFp16ToFp32TestImpl.hpp
     ConvertFp32ToFp16TestImpl.hpp
+    DebugTestImpl.hpp
     EndToEndTestImpl.hpp
     FullyConnectedTestImpl.hpp
     IsLayerSupportedTestImpl.hpp
diff --git a/src/backends/backendsCommon/test/DebugTestImpl.hpp b/src/backends/backendsCommon/test/DebugTestImpl.hpp
new file mode 100644
index 0000000..e0f8a35
--- /dev/null
+++ b/src/backends/backendsCommon/test/DebugTestImpl.hpp
@@ -0,0 +1,272 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "WorkloadTestUtils.hpp"
+
+#include <armnn/ArmNN.hpp>
+#include <armnn/Tensor.hpp>
+#include <armnn/TypesUtils.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+#include <backendsCommon/IBackendInternal.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+namespace
+{
+
+// Creates and runs a Debug workload on inputData and returns the actual and expected outputs.
+// std::cout is captured while the workload executes and the captured text is compared with
+// expectedStringOutput via BOOST_TEST. For quantized T, qScale and qOffset are set on both
+// tensor infos and passed to QuantizedVector for the input and expected data.
+template<typename T, std::size_t Dim>
+LayerTestResult<T, Dim> DebugTestImpl(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+    armnn::TensorInfo& inputTensorInfo,
+    armnn::TensorInfo& outputTensorInfo,
+    std::vector<float>& inputData,
+    std::vector<float>& outputExpectedData,
+    armnn::DebugQueueDescriptor descriptor,
+    const std::string& expectedStringOutput,
+    const float qScale = 1.0f,
+    const int32_t qOffset = 0)
+{
+    if(armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    boost::multi_array<T, Dim> input =
+        MakeTensor<T, Dim>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData));
+
+    LayerTestResult<T, Dim> ret(outputTensorInfo);
+    ret.outputExpected =
+        MakeTensor<T, Dim>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, outputExpectedData));
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateDebug(descriptor, info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+    CopyDataToITensorHandle(inputHandle.get(), input.data());
+
+    std::ostringstream oss;
+    {
+        // Puts std::cout's own buffer back on scope exit, even when ExecuteWorkload throws.
+        struct CoutRedirect
+        {
+            explicit CoutRedirect(std::streambuf* target) : m_Saved(std::cout.rdbuf(target)) {}
+            ~CoutRedirect() { std::cout.rdbuf(m_Saved); }
+            std::streambuf* m_Saved;
+        } redirect(oss.rdbuf());
+        ExecuteWorkload(*workload, memoryManager);
+    }
+
+    BOOST_TEST(oss.str() == expectedStringOutput);
+    CopyDataFromITensorHandle(ret.output.data(), outputHandle.get());
+    return ret;
+}
+
+// Debug layer test for a 4D tensor of shape {1, 2, 2, 3} holding the values 1 to 12.
+// The descriptor names the layer "TestOutput" and selects slot index 1. The text captured by
+// DebugTestImpl while the workload runs must match the record built below, and the expected
+// output tensor equals the input tensor.
+// T selects the tensor data type through armnn::GetDataType<T>().
+template <typename T>
+LayerTestResult<T, 4> Debug4DTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    // Input and output tensors share the same shape and data type.
+    unsigned int inputDims[]  = {1, 2, 2, 3};
+    unsigned int outputDims[] = {1, 2, 2, 3};
+    armnn::TensorInfo inputTensorInfo(4, inputDims, armnn::GetDataType<T>());
+    armnn::TensorInfo outputTensorInfo(4, outputDims, armnn::GetDataType<T>());
+
+    // Layer name and slot index that appear in the expected record below.
+    armnn::DebugQueueDescriptor desc;
+    desc.m_Parameters.m_LayerName = "TestOutput";
+    desc.m_Parameters.m_SlotIndex = 1;
+
+    std::vector<float> input =
+    {
+        1.0f,   2.0f,  3.0f,
+        4.0f,   5.0f,  6.0f,
+        7.0f,   8.0f,  9.0f,
+        10.0f, 11.0f, 12.0f,
+    };
+
+    // The expected output holds the same values as the input.
+    std::vector<float> outputExpected = input;
+
+    // Text the workload is expected to write to std::cout, including the trailing newline.
+    const std::string expectedStringOutput =
+        "{ \"layer\": \"TestOutput\","
+        " \"outputSlot\": 1,"
+        " \"shape\": [1, 2, 2, 3],"
+        " \"min\": 1, \"max\": 12,"
+        " \"data\": [[[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]]] }\n";
+
+    // DebugTestImpl runs the workload, checks the captured text and returns both tensors.
+    return DebugTestImpl<T, 4>(workloadFactory,
+                               memoryManager,
+                               inputTensorInfo,
+                               outputTensorInfo,
+                               input,
+                               outputExpected,
+                               desc,
+                               expectedStringOutput);
+}
+
+// Debug layer test for a 3D tensor of shape {3, 3, 1} holding the values 1 to 9.
+// The descriptor names the layer "TestOutput"; the text captured by DebugTestImpl must match
+// the record built below, and the expected output tensor equals the input tensor.
+template <typename T>
+LayerTestResult<T, 3> Debug3DTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    // Input and output tensors share the same shape and data type.
+    unsigned int inputDims[]  = {3, 3, 1};
+    unsigned int outputDims[] = {3, 3, 1};
+    armnn::TensorInfo inputTensorInfo(3, inputDims, armnn::GetDataType<T>());
+    armnn::TensorInfo outputTensorInfo(3, outputDims, armnn::GetDataType<T>());
+
+    // Only the layer name is set here; m_SlotIndex is left untouched and the expected record
+    // below reports output slot 0.
+    armnn::DebugQueueDescriptor desc;
+    desc.m_Parameters.m_LayerName = "TestOutput";
+
+    std::vector<float> input =
+    {
+        1.0f, 2.0f, 3.0f,
+        4.0f, 5.0f, 6.0f,
+        7.0f, 8.0f, 9.0f,
+    };
+
+    // The expected output holds the same values as the input.
+    std::vector<float> outputExpected = input;
+
+    // Text the workload is expected to write to std::cout, including the trailing newline.
+    const std::string expectedStringOutput =
+        "{ \"layer\": \"TestOutput\","
+        " \"outputSlot\": 0,"
+        " \"shape\": [3, 3, 1],"
+        " \"min\": 1, \"max\": 9,"
+        " \"data\": [[[1], [2], [3]], [[4], [5], [6]], [[7], [8], [9]]] }\n";
+
+    // DebugTestImpl runs the workload, checks the captured text and returns both tensors.
+    return DebugTestImpl<T, 3>(workloadFactory,
+                               memoryManager,
+                               inputTensorInfo,
+                               outputTensorInfo,
+                               input,
+                               outputExpected,
+                               desc,
+                               expectedStringOutput);
+}
+
+// Debug layer test for a 2D tensor of shape {2, 2} holding the values 1 to 4.
+// The descriptor names the layer "TestOutput"; the text captured by DebugTestImpl must match
+// the record built below, and the expected output tensor equals the input tensor.
+template <typename T>
+LayerTestResult<T, 2> Debug2DTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    // Input and output tensors share the same shape and data type.
+    unsigned int inputDims[]  = {2, 2};
+    unsigned int outputDims[] = {2, 2};
+    armnn::TensorInfo inputTensorInfo(2, inputDims, armnn::GetDataType<T>());
+    armnn::TensorInfo outputTensorInfo(2, outputDims, armnn::GetDataType<T>());
+
+    // m_SlotIndex is not set here; the expected record below reports output slot 0.
+    armnn::DebugQueueDescriptor desc;
+    desc.m_Parameters.m_LayerName = "TestOutput";
+
+    std::vector<float> input =
+    {
+        1.0f, 2.0f,
+        3.0f, 4.0f,
+    };
+
+    // The expected output holds the same values as the input.
+    std::vector<float> outputExpected = input;
+
+    // Text the workload is expected to write to std::cout, including the trailing newline.
+    const std::string expectedStringOutput =
+        "{ \"layer\": \"TestOutput\","
+        " \"outputSlot\": 0,"
+        " \"shape\": [2, 2],"
+        " \"min\": 1, \"max\": 4,"
+        " \"data\": [[1, 2], [3, 4]] }\n";
+
+    // DebugTestImpl runs the workload, checks the captured text and returns both tensors.
+    return DebugTestImpl<T, 2>(workloadFactory,
+                               memoryManager,
+                               inputTensorInfo,
+                               outputTensorInfo,
+                               input,
+                               outputExpected,
+                               desc,
+                               expectedStringOutput);
+}
+
+// Debug layer test for a 1D tensor of shape {4} holding the values 1 to 4.
+// The descriptor names the layer "TestOutput"; the text captured by DebugTestImpl must match
+// the record built below, and the expected output tensor equals the input tensor.
+template <typename T>
+LayerTestResult<T, 1> Debug1DTest(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    // Input and output tensors share the same shape and data type.
+    unsigned int inputDims[]  = {4};
+    unsigned int outputDims[] = {4};
+    armnn::TensorInfo inputTensorInfo(1, inputDims, armnn::GetDataType<T>());
+    armnn::TensorInfo outputTensorInfo(1, outputDims, armnn::GetDataType<T>());
+
+    // m_SlotIndex is not set here; the expected record below reports output slot 0.
+    armnn::DebugQueueDescriptor desc;
+    desc.m_Parameters.m_LayerName = "TestOutput";
+
+    std::vector<float> input =
+    {
+        1.0f, 2.0f, 3.0f, 4.0f,
+    };
+
+    // The expected output holds the same values as the input.
+    std::vector<float> outputExpected = input;
+
+    // Text the workload is expected to write to std::cout, including the trailing newline.
+    const std::string expectedStringOutput =
+        "{ \"layer\": \"TestOutput\","
+        " \"outputSlot\": 0,"
+        " \"shape\": [4],"
+        " \"min\": 1, \"max\": 4,"
+        " \"data\": [1, 2, 3, 4] }\n";
+
+    return DebugTestImpl<T, 1>(workloadFactory,
+                               memoryManager,
+                               inputTensorInfo,
+                               outputTensorInfo,
+                               input,
+                               outputExpected,
+                               desc,
+                               expectedStringOutput);
+}
+
+} // anonymous namespace
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index 131b84c..b44c835 100755
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -40,6 +40,7 @@
 #include "LstmTestImpl.hpp"
 #include "ConvertFp16ToFp32TestImpl.hpp"
 #include "ConvertFp32ToFp16TestImpl.hpp"
+#include "DebugTestImpl.hpp"
 
 // 3-channel 16x8 image used as common input data for a number of Conv2d tests.
 static std::vector<float> ConvInput3x8x16({
@@ -7930,4 +7931,60 @@
     return BatchToSpaceNdHelper<uint8_t, 4, 4>(workloadFactory, memoryManager,
                                                armnn::DataLayout::NCHW, inputShape, input, blockShape,
                                                crops, outputShape, expectedOutput);
-}
\ No newline at end of file
+}
+
+LayerTestResult<float, 4> Debug4DFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug4DTest<float>(workloadFactory, memoryManager); // Shared 4D Debug test, T = float.
+}
+
+LayerTestResult<float, 3> Debug3DFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug3DTest<float>(workloadFactory, memoryManager); // Shared 3D Debug test, T = float.
+}
+
+LayerTestResult<float, 2> Debug2DFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug2DTest<float>(workloadFactory, memoryManager); // Shared 2D Debug test, T = float.
+}
+
+LayerTestResult<float, 1> Debug1DFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug1DTest<float>(workloadFactory, memoryManager); // Shared 1D Debug test, T = float.
+}
+
+LayerTestResult<uint8_t, 4> Debug4DUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug4DTest<uint8_t>(workloadFactory, memoryManager); // Shared 4D Debug test, T = uint8_t.
+}
+
+LayerTestResult<uint8_t, 3> Debug3DUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug3DTest<uint8_t>(workloadFactory, memoryManager); // Shared 3D Debug test, T = uint8_t.
+}
+
+LayerTestResult<uint8_t, 2> Debug2DUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug2DTest<uint8_t>(workloadFactory, memoryManager); // Shared 2D Debug test, T = uint8_t.
+}
+
+LayerTestResult<uint8_t, 1> Debug1DUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager)
+{
+    return Debug1DTest<uint8_t>(workloadFactory, memoryManager); // Shared 1D Debug test, T = uint8_t.
+}
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index 1797f9f..1f38675 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -1139,7 +1139,7 @@
 LayerTestResult<float, 4> StridedSlice4DFloat32Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
-    
+
 LayerTestResult<float, 4> StridedSlice4DReverseFloat32Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
@@ -1207,3 +1207,35 @@
 LayerTestResult<uint8_t, 2> StridedSlice2DReverseUint8Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager);
+
+LayerTestResult<float, 4> Debug4DFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager); // Debug, 4D, float
+
+LayerTestResult<float, 3> Debug3DFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager); // Debug, 3D, float
+
+LayerTestResult<float, 2> Debug2DFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager); // Debug, 2D, float
+
+LayerTestResult<float, 1> Debug1DFloat32Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager); // Debug, 1D, float
+
+LayerTestResult<uint8_t, 4> Debug4DUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager); // Debug, 4D, uint8_t
+
+LayerTestResult<uint8_t, 3> Debug3DUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager); // Debug, 3D, uint8_t
+
+LayerTestResult<uint8_t, 2> Debug2DUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager); // Debug, 2D, uint8_t
+
+LayerTestResult<uint8_t, 1> Debug1DUint8Test(
+    armnn::IWorkloadFactory& workloadFactory,
+    const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager); // Debug, 1D, uint8_t
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index 3b49fa0..2c8f9cb 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -160,6 +160,19 @@
                                      &TrueFunc<>);
 }
 
+// Reports whether the reference backend supports a Debug layer. Only the input's data type is
+// passed on to IsSupportedForDataTypeRef; TrueFunc<> is supplied for both predicate arguments.
+bool RefLayerSupport::IsDebugSupported(const TensorInfo& input,
+                                       const TensorInfo& output,
+                                       const DebugDescriptor& descriptor,
+                                       Optional<std::string&> reasonIfUnsupported) const
+{
+    ignore_unused(output);
+    ignore_unused(descriptor);
+    const auto inputType = input.GetDataType();
+    return IsSupportedForDataTypeRef(reasonIfUnsupported, inputType, &TrueFunc<>, &TrueFunc<>);
+}
+
 bool RefLayerSupport::IsDepthwiseConvolutionSupported(const TensorInfo& input,
                                                       const TensorInfo& output,
                                                       const DepthwiseConvolution2dDescriptor& descriptor,
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index 0d34c08..9dc64cb 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -54,6 +54,11 @@
                                   const Optional<TensorInfo>& biases,
                                   Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsDebugSupported(const TensorInfo& input, // Support query for the Debug layer.
+                          const TensorInfo& output,
+                          const DebugDescriptor& descriptor,
+                          Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsDepthwiseConvolutionSupported(const TensorInfo& input,
                                          const TensorInfo& output,
                                          const DepthwiseConvolution2dDescriptor& descriptor,
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 43651cf..ac837d3 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -297,7 +297,7 @@
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateDebug(const DebugQueueDescriptor& descriptor,
                                                            const WorkloadInfo& info) const
 {
-    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<RefDebugFloat32Workload, RefDebugUint8Workload>(descriptor, info);
 }
 
 } // namespace armnn
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 66675bd..d868069 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -15,6 +15,7 @@
         workloads/BatchToSpaceNd.cpp \
         workloads/Broadcast.cpp \
         workloads/ConvImpl.cpp \
+        workloads/Debug.cpp \
         workloads/ElementwiseFunction.cpp \
         workloads/FullyConnected.cpp \
         workloads/Mean.cpp \
@@ -33,6 +34,7 @@
         workloads/RefConvertFp32ToFp16Workload.cpp \
         workloads/RefConvolution2dFloat32Workload.cpp \
         workloads/RefConvolution2dUint8Workload.cpp \
+        workloads/RefDebugWorkload.cpp \
         workloads/RefDepthwiseConvolution2dFloat32Workload.cpp \
         workloads/RefDepthwiseConvolution2dUint8Workload.cpp \
         workloads/RefElementwiseWorkload.cpp \
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index fa4af96..d3c2231 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -454,4 +454,15 @@
 ARMNN_AUTO_TEST_CASE(StridedSlice2DUint8, StridedSlice2DUint8Test)
 ARMNN_AUTO_TEST_CASE(StridedSlice2DReverseUint8, StridedSlice2DReverseUint8Test)
 
+// Debug
+ARMNN_AUTO_TEST_CASE(Debug4DFloat32, Debug4DFloat32Test)
+ARMNN_AUTO_TEST_CASE(Debug3DFloat32, Debug3DFloat32Test)
+ARMNN_AUTO_TEST_CASE(Debug2DFloat32, Debug2DFloat32Test)
+ARMNN_AUTO_TEST_CASE(Debug1DFloat32, Debug1DFloat32Test)
+
+ARMNN_AUTO_TEST_CASE(Debug4DUint8, Debug4DUint8Test)
+ARMNN_AUTO_TEST_CASE(Debug3DUint8, Debug3DUint8Test)
+ARMNN_AUTO_TEST_CASE(Debug2DUint8, Debug2DUint8Test)
+ARMNN_AUTO_TEST_CASE(Debug1DUint8, Debug1DUint8Test)
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index 7028f18..14d6ca9 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -13,6 +13,8 @@
     Broadcast.hpp
     ConvImpl.cpp
     ConvImpl.hpp
+    Debug.cpp
+    Debug.hpp
     ElementwiseFunction.cpp
     ElementwiseFunction.hpp
     FullyConnected.cpp
@@ -52,6 +54,8 @@
     RefConvolution2dUint8Workload.hpp
     RefElementwiseWorkload.cpp
     RefElementwiseWorkload.hpp
+    RefDebugWorkload.cpp
+    RefDebugWorkload.hpp
     RefDepthwiseConvolution2dFloat32Workload.cpp
     RefDepthwiseConvolution2dFloat32Workload.hpp
     RefDepthwiseConvolution2dUint8Workload.cpp
diff --git a/src/backends/reference/workloads/Debug.cpp b/src/backends/reference/workloads/Debug.cpp
new file mode 100644
index 0000000..dfcbbd8
--- /dev/null
+++ b/src/backends/reference/workloads/Debug.cpp
@@ -0,0 +1,101 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "Debug.hpp"
+
+#include <boost/numeric/conversion/cast.hpp>
+
+#include <cstring>
+#include <algorithm>
+#include <iostream>
+
+namespace armnn
+{
+
+/// Prints a one-line JSON-style record describing the input tensor to std::cout, then copies
+/// the input tensor to the output buffer.
+/// The record holds the layer name and slot index taken from the descriptor, the tensor shape,
+/// the smallest and largest element, and every element nested in brackets following the shape.
+/// A tensor without elements is reported with null min/max and an empty data list.
+/// @param inputInfo   Provides the dimensions and element count of the tensor.
+/// @param outputInfo  Not read by this function.
+/// @param descriptor  Provides m_LayerName and m_SlotIndex for the record.
+/// @param inputData   Elements to print and copy; inputInfo.GetNumElements() values are read.
+/// @param outputData  Receives a byte-wise copy of inputData.
+template <typename T>
+void Debug(const TensorInfo& inputInfo,
+           const TensorInfo& outputInfo,
+           const DebugDescriptor& descriptor,
+           const T* inputData,
+           T* outputData)
+{
+    static_cast<void>(outputInfo); // Deliberately unused; avoids an unused-parameter warning.
+
+    const unsigned int numDims = inputInfo.GetNumDimensions();
+    const unsigned int numElements = inputInfo.GetNumElements();
+    const TensorShape& inputShape = inputInfo.GetShape();
+
+    // Prints one bracket for every nesting level that has a boundary at the given flat position.
+    // Level j spans shape[j] * ... * shape[numDims - 1] elements, so its boundaries fall on the
+    // multiples of that product. Accumulating the product on the fly avoids a variable-length
+    // array (not standard C++) and any indexing when the tensor has no dimensions.
+    const auto printBrackets = [&inputShape, numDims](unsigned int position, const char* bracket)
+    {
+        unsigned int span = 1;
+        for (unsigned int j = numDims; j-- > 0;)
+        {
+            span *= inputShape[j];
+            if (position % span == 0)
+            {
+                std::cout << bracket;
+            }
+        }
+    };
+
+    std::cout << "{ ";
+    std::cout << "\"layer\": \"" << descriptor.m_LayerName << "\", ";
+    std::cout << "\"outputSlot\": " << descriptor.m_SlotIndex << ", ";
+    std::cout << "\"shape\": [";
+    for (unsigned int i = 0; i < numDims; i++)
+    {
+        std::cout << (i == 0 ? "" : ", ") << inputShape[i];
+    }
+    std::cout << "], ";
+
+    if (numElements == 0)
+    {
+        // min_element/max_element return the end pointer for an empty range; do not dereference it.
+        std::cout << "\"min\": null, \"max\": null, \"data\": [] }" << std::endl;
+        return;
+    }
+
+    std::cout << "\"min\": "
+        << boost::numeric_cast<float>(*std::min_element(inputData, inputData + numElements)) << ", ";
+    std::cout << "\"max\": "
+        << boost::numeric_cast<float>(*std::max_element(inputData, inputData + numElements)) << ", ";
+    std::cout << "\"data\": ";
+    for (unsigned int i = 0; i < numElements; i++)
+    {
+        printBrackets(i, "[");
+        std::cout << boost::numeric_cast<float>(inputData[i]);
+        printBrackets(i + 1, "]");
+        std::cout << (i + 1 != numElements ? ", " : "");
+    }
+    std::cout << " }" << std::endl;
+
+    std::memcpy(outputData, inputData, numElements * sizeof(T));
+}
+
+template void Debug<float>(const TensorInfo& inputInfo,
+                           const TensorInfo& outputInfo,
+                           const DebugDescriptor& descriptor,
+                           const float* inputData,
+                           float* outputData);
+
+template void Debug<uint8_t>(const TensorInfo& inputInfo,
+                             const TensorInfo& outputInfo,
+                             const DebugDescriptor& descriptor,
+                             const uint8_t* inputData,
+                             uint8_t* outputData);
+} //namespace armnn
diff --git a/src/backends/reference/workloads/Debug.hpp b/src/backends/reference/workloads/Debug.hpp
new file mode 100644
index 0000000..682f0bd
--- /dev/null
+++ b/src/backends/reference/workloads/Debug.hpp
@@ -0,0 +1,20 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/Descriptors.hpp>
+#include <armnn/Tensor.hpp>
+
+namespace armnn
+{
+/// Writes a description of the input tensor to std::cout and copies inputData to outputData.
+template <typename T>
+void Debug(const TensorInfo& inputInfo,
+           const TensorInfo& outputInfo,
+           const DebugDescriptor& descriptor,
+           const T* inputData,
+           T* outputData);
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefDebugWorkload.cpp b/src/backends/reference/workloads/RefDebugWorkload.cpp
new file mode 100644
index 0000000..17eb8fc
--- /dev/null
+++ b/src/backends/reference/workloads/RefDebugWorkload.cpp
@@ -0,0 +1,33 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#include "RefDebugWorkload.hpp"
+#include "Debug.hpp"
+
+#include "RefWorkloadUtils.hpp"
+#include "TypeUtils.hpp"
+
+namespace armnn
+{
+
+// Runs the reference Debug function on input 0 of the workload and writes to output 0.
+// T is the element type that ResolveType associates with the workload's DataType.
+template<armnn::DataType DataType>
+void RefDebugWorkload<DataType>::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, GetName() + "_Execute");
+
+    using T = ResolveType<DataType>;
+    Debug(GetTensorInfo(m_Data.m_Inputs[0]),
+          GetTensorInfo(m_Data.m_Outputs[0]),
+          m_Data.m_Parameters,
+          GetInputTensorData<T>(0, m_Data),
+          GetOutputTensorData<T>(0, m_Data));
+}
+
+// Explicit instantiations for the Float32 and QuantisedAsymm8 data types.
+template class RefDebugWorkload<DataType::Float32>;
+template class RefDebugWorkload<DataType::QuantisedAsymm8>;
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefDebugWorkload.hpp b/src/backends/reference/workloads/RefDebugWorkload.hpp
new file mode 100644
index 0000000..a1231f9
--- /dev/null
+++ b/src/backends/reference/workloads/RefDebugWorkload.hpp
@@ -0,0 +1,33 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <backendsCommon/Workload.hpp>
+
+#include <armnn/TypesUtils.hpp>
+
+namespace armnn
+{
+// Reference-backend workload for the Debug layer, parameterised on the tensor data type.
+template <armnn::DataType DataType>
+class RefDebugWorkload : public TypedWorkload<DebugQueueDescriptor, DataType>
+{
+public:
+    using TypedWorkload<DebugQueueDescriptor, DataType>::TypedWorkload;
+    using TypedWorkload<DebugQueueDescriptor, DataType>::m_Data;
+
+    void Execute() const override;
+
+    static const std::string& GetName()
+    {
+        static const std::string name = std::string("RefDebug").append(GetDataTypeName(DataType)).append("Workload");
+        return name;
+    }
+};
+
+using RefDebugFloat32Workload = RefDebugWorkload<DataType::Float32>;
+using RefDebugUint8Workload = RefDebugWorkload<DataType::QuantisedAsymm8>;
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index 86d8624..ddce68e 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -58,3 +58,4 @@
 #include "RefPadWorkload.hpp"
 #include "RefBatchToSpaceNdUint8Workload.hpp"
 #include "RefBatchToSpaceNdFloat32Workload.hpp"
+#include "RefDebugWorkload.hpp"