IVGCVSW-2095 Add reference implementation and unit tests for SpaceToBatchNd

Change-Id: I27ffebdece6e68460931a44c15b9b029f9fce638
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 62cbd05..7c02947 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -749,21 +749,14 @@
     ValidateTensorNumDimensions(workloadInfo.m_InputTensorInfos[0], "SpaceToBatchNdQueueDescriptor", 4, "input");
     ValidateTensorNumDimensions(workloadInfo.m_OutputTensorInfos[0], "SpaceToBatchNdQueueDescriptor", 4, "output");
 
-    if (workloadInfo.m_InputTensorInfos[0].GetNumElements() != workloadInfo.m_OutputTensorInfos[0].GetNumElements())
-    {
-        throw InvalidArgumentException("SpaceToBatchNdQueueDescriptor: Input tensor has " +
-            to_string(workloadInfo.m_InputTensorInfos[0].GetNumElements()) + " but output tensor has " +
-            to_string(workloadInfo.m_OutputTensorInfos[0].GetNumElements()) + " elements.");
-    }
-
     if (m_Parameters.m_BlockShape.size() != 2)
     {
-        throw InvalidArgumentException("Block Shape must contains 2 spatial dimensions");
+        throw InvalidArgumentException("Block Shape must contain 2 spatial dimensions");
     }
 
     if (m_Parameters.m_BlockShape.size() != m_Parameters.m_PadList.size())
     {
-        throw InvalidArgumentException("Pad List must contains the same number of dimensions as Block Shape.");
+        throw InvalidArgumentException("Pad List must contain the same number of dimensions as Block Shape.");
     }
 
     const TensorShape inputShape = workloadInfo.m_InputTensorInfos[0].GetShape();
@@ -771,10 +764,23 @@
     std::pair<unsigned int, unsigned int> heightPad = m_Parameters.m_PadList[0];
     std::pair<unsigned int, unsigned int> widthPad = m_Parameters.m_PadList[1];
 
-    if ((inputShape[m_Parameters.m_DataLayout.GetHeightIndex()] + heightPad.first + heightPad.second)
-            % m_Parameters.m_BlockShape[0] != 0 ||
-        (inputShape[m_Parameters.m_DataLayout.GetWidthIndex()] + widthPad.first + widthPad.second)
-            % m_Parameters.m_BlockShape[1] != 0)
+    unsigned int inputHeight = inputShape[m_Parameters.m_DataLayout.GetHeightIndex()]
+                               + heightPad.first + heightPad.second;
+
+    unsigned int inputWidth = inputShape[m_Parameters.m_DataLayout.GetWidthIndex()]
+                              + widthPad.first + widthPad.second;
+
+    unsigned int numInputElements = inputShape[0] * inputHeight * inputWidth
+                                    * inputShape[m_Parameters.m_DataLayout.GetChannelsIndex()];
+
+    if (workloadInfo.m_OutputTensorInfos[0].GetNumElements() != numInputElements)
+    {
+        throw InvalidArgumentException("SpaceToBatchNdQueueDescriptor: Input tensor has " +
+            to_string(numInputElements) + " after padding but output tensor has " +
+            to_string(workloadInfo.m_OutputTensorInfos[0].GetNumElements()) + " elements.");
+    }
+
+    if (inputHeight % m_Parameters.m_BlockShape[0] != 0 || inputWidth % m_Parameters.m_BlockShape[1] != 0)
     {
         throw InvalidArgumentException(
             "Input shape after padding must be divisible by Block Shape in all spatial dimensions");
diff --git a/src/backends/backendsCommon/test/LayerTests.cpp b/src/backends/backendsCommon/test/LayerTests.cpp
index 6b5fa72..cdc989f 100755
--- a/src/backends/backendsCommon/test/LayerTests.cpp
+++ b/src/backends/backendsCommon/test/LayerTests.cpp
@@ -26,6 +26,7 @@
 #include "Pooling2dTestImpl.hpp"
 #include "ReshapeTestImpl.hpp"
 #include "FullyConnectedTestImpl.hpp"
+#include "SpaceToBatchNdTestImpl.hpp"
 #include "SplitterTestImpl.hpp"
 #include "SoftmaxTestImpl.hpp"
 #include "NormTestImpl.hpp"
@@ -6088,3 +6089,83 @@
 
     return addRet;
 }
+
+LayerTestResult<float, 4> SpaceToBatchNdSimpleFloat32Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdSimpleTest<float>(workloadFactory);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiChannelsFloat32Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdMultiChannelsTest<float>(workloadFactory);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiBlockFloat32Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdMultiBlockTest<float>(workloadFactory);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdPaddingFloat32Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdPaddingTest<float>(workloadFactory);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdSimpleUint8Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdSimpleTest<uint8_t>(workloadFactory);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiChannelsUint8Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdMultiChannelsTest<uint8_t>(workloadFactory);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiBlockUint8Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdMultiBlockTest<uint8_t>(workloadFactory);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdPaddingUint8Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdPaddingTest<uint8_t>(workloadFactory);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdSimpleNHWCFloat32Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdSimpleNHWCTest<float>(workloadFactory);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiChannelsNHWCFloat32Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdMultiChannelsNHWCTest<float>(workloadFactory);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdMultiBlockNHWCFloat32Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdMultiBlockNHWCTest<float>(workloadFactory);
+}
+
+LayerTestResult<float, 4> SpaceToBatchNdPaddingNHWCFloat32Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdPaddingNHWCTest<float>(workloadFactory);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdSimpleNHWCUint8Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdSimpleNHWCTest<uint8_t>(workloadFactory);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiChannelsNHWCUint8Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdMultiChannelsNHWCTest<uint8_t>(workloadFactory);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiBlockNHWCUint8Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdMultiBlockNHWCTest<uint8_t>(workloadFactory);
+}
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdPaddingNHWCUint8Test(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdPaddingNHWCTest<uint8_t>(workloadFactory);
+}
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index 57383d3..66032c8 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -414,3 +414,23 @@
 LayerTestResult<float, 3> MeanVtsFloat3Test(armnn::IWorkloadFactory& workloadFactory);
 
 LayerTestResult<float, 4> AdditionAfterMaxPoolTest(armnn::IWorkloadFactory& workloadFactory);
+
+LayerTestResult<float, 4> SpaceToBatchNdSimpleFloat32Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<float, 4> SpaceToBatchNdMultiChannelsFloat32Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<float, 4> SpaceToBatchNdMultiBlockFloat32Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<float, 4> SpaceToBatchNdPaddingFloat32Test(armnn::IWorkloadFactory& workloadFactory);
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdSimpleUint8Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiChannelsUint8Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiBlockUint8Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<uint8_t, 4> SpaceToBatchNdPaddingUint8Test(armnn::IWorkloadFactory& workloadFactory);
+
+LayerTestResult<float, 4> SpaceToBatchNdSimpleNHWCFloat32Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<float, 4> SpaceToBatchNdMultiChannelsNHWCFloat32Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<float, 4> SpaceToBatchNdMultiBlockNHWCFloat32Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<float, 4> SpaceToBatchNdPaddingNHWCFloat32Test(armnn::IWorkloadFactory& workloadFactory);
+
+LayerTestResult<uint8_t, 4> SpaceToBatchNdSimpleNHWCUint8Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiChannelsNHWCUint8Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<uint8_t, 4> SpaceToBatchNdMultiBlockNHWCUint8Test(armnn::IWorkloadFactory& workloadFactory);
+LayerTestResult<uint8_t, 4> SpaceToBatchNdPaddingNHWCUint8Test(armnn::IWorkloadFactory& workloadFactory);
diff --git a/src/backends/backendsCommon/test/SpaceToBatchNdTestImpl.hpp b/src/backends/backendsCommon/test/SpaceToBatchNdTestImpl.hpp
new file mode 100644
index 0000000..5dd21bf
--- /dev/null
+++ b/src/backends/backendsCommon/test/SpaceToBatchNdTestImpl.hpp
@@ -0,0 +1,243 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include <armnn/ArmNN.hpp>
+#include <armnn/Tensor.hpp>
+#include <armnn/TypesUtils.hpp>
+
+#include <backendsCommon/CpuTensorHandle.hpp>
+#include <backendsCommon/WorkloadFactory.hpp>
+
+#include <test/TensorHelpers.hpp>
+
+// Builds and executes one SpaceToBatchNd workload from workloadFactory and returns the produced
+// output next to the expected one. When the descriptor selects NHWC, both tensor infos and both
+// data vectors are permuted with the NCHWToNHWC mapping, so the caller's objects are modified.
+template<typename T>
+LayerTestResult<T, 4> SpaceToBatchNdTestImpl(
+    const armnn::IWorkloadFactory& workloadFactory,
+    armnn::TensorInfo& inputTensorInfo,
+    armnn::TensorInfo& outputTensorInfo,
+    std::vector<float>& inputData,
+    std::vector<float>& outputExpectedData,
+    armnn::SpaceToBatchNdQueueDescriptor descriptor,
+    const float qScale = 1.0f,
+    const int32_t qOffset = 0)
+{
+    if (descriptor.m_Parameters.m_DataLayout == armnn::DataLayout::NHWC)
+    {
+        const armnn::PermutationVector NCHWToNHWC = {0, 3, 1, 2};
+        // The infos are permuted first: Permute() below is handed the already-permuted shape.
+        inputTensorInfo = armnnUtils::Permuted(inputTensorInfo, NCHWToNHWC);
+        outputTensorInfo = armnnUtils::Permuted(outputTensorInfo, NCHWToNHWC);
+
+        std::vector<float> nhwcInput(inputData.size());
+        armnnUtils::Permute(inputTensorInfo.GetShape(), NCHWToNHWC, inputData.data(), nhwcInput.data());
+        inputData.swap(nhwcInput);
+
+        std::vector<float> nhwcExpected(outputExpectedData.size());
+        armnnUtils::Permute(outputTensorInfo.GetShape(), NCHWToNHWC, outputExpectedData.data(), nhwcExpected.data());
+        outputExpectedData.swap(nhwcExpected);
+    }
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData));
+
+    LayerTestResult<T, 4> result(outputTensorInfo);
+    result.outputExpected =
+        MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, outputExpectedData));
+
+    std::unique_ptr<armnn::ITensorHandle> inHandle = workloadFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outHandle = workloadFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo workloadInfo;
+    AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inHandle.get());
+    AddOutputToWorkload(descriptor, workloadInfo, outputTensorInfo, outHandle.get());
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateSpaceToBatchNd(descriptor, workloadInfo);
+
+    inHandle->Allocate();
+    outHandle->Allocate();
+    CopyDataToITensorHandle(inHandle.get(), &input[0][0][0][0]);
+    workload->Execute();
+    CopyDataFromITensorHandle(&result.output[0][0][0][0], outHandle.get());
+    return result;
+}
+
+// Input shape {1, 1, 2, 2} with a 2x2 block and no padding: each of the four values ends up
+// alone in its own output batch (output shape {4, 1, 1, 1}).
+template <typename T>
+LayerTestResult<T, 4> SpaceToBatchNdSimpleTest(armnn::IWorkloadFactory& workloadFactory,
+                                               armnn::DataLayout dataLayout = armnn::DataLayout::NCHW)
+{
+    armnn::SpaceToBatchNdQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters.m_DataLayout = dataLayout;
+    queueDescriptor.m_Parameters.m_BlockShape = {2, 2};
+    queueDescriptor.m_Parameters.m_PadList = {{0, 0}, {0, 0}};
+
+    unsigned int inputDims[] = {1, 1, 2, 2};
+    unsigned int outputDims[] = {4, 1, 1, 1};
+
+    armnn::TensorInfo inputInfo(4, inputDims, armnn::GetDataType<T>());
+    armnn::TensorInfo outputInfo(4, outputDims, armnn::GetDataType<T>());
+
+    std::vector<float> inputValues =
+    {
+        1.0f, 2.0f, 3.0f, 4.0f
+    };
+
+    std::vector<float> expectedValues =
+    {
+        1.0f, 2.0f, 3.0f, 4.0f
+    };
+
+    return SpaceToBatchNdTestImpl<T>(
+        workloadFactory, inputInfo, outputInfo, inputValues, expectedValues, queueDescriptor);
+}
+
+// Input shape {1, 3, 2, 2} with a 2x2 block and no padding: every spatial position becomes one
+// output batch holding its three channel values (output shape {4, 3, 1, 1}).
+template <typename T>
+LayerTestResult<T, 4> SpaceToBatchNdMultiChannelsTest(armnn::IWorkloadFactory& workloadFactory,
+                                                      armnn::DataLayout dataLayout = armnn::DataLayout::NCHW)
+{
+    armnn::SpaceToBatchNdQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters.m_DataLayout = dataLayout;
+    queueDescriptor.m_Parameters.m_BlockShape = {2, 2};
+    queueDescriptor.m_Parameters.m_PadList = {{0, 0}, {0, 0}};
+
+    unsigned int inputDims[] = {1, 3, 2, 2};
+    unsigned int outputDims[] = {4, 3, 1, 1};
+
+    armnn::TensorInfo inputInfo(4, inputDims, armnn::GetDataType<T>());
+    armnn::TensorInfo outputInfo(4, outputDims, armnn::GetDataType<T>());
+
+    std::vector<float> inputValues =
+    {
+        1.0f, 4.0f, 7.0f, 10.0f,
+        2.0f, 5.0f, 8.0f, 11.0f,
+        3.0f, 6.0f, 9.0f, 12.0f
+    };
+
+    std::vector<float> expectedValues =
+    {
+        1.0f, 2.0f, 3.0f,
+        4.0f, 5.0f, 6.0f,
+        7.0f, 8.0f, 9.0f,
+        10.0f, 11.0f, 12.0f
+    };
+
+    return SpaceToBatchNdTestImpl<T>(
+        workloadFactory, inputInfo, outputInfo, inputValues, expectedValues, queueDescriptor);
+}
+
+// Input shape {1, 1, 4, 4} with a 2x2 block and no padding: the four positions inside each
+// block are split across four 2x2 output batches (output shape {4, 1, 2, 2}).
+template <typename T>
+LayerTestResult<T, 4> SpaceToBatchNdMultiBlockTest(armnn::IWorkloadFactory& workloadFactory,
+                                                   armnn::DataLayout dataLayout = armnn::DataLayout::NCHW)
+{
+    armnn::SpaceToBatchNdQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters.m_DataLayout = dataLayout;
+    queueDescriptor.m_Parameters.m_BlockShape = {2, 2};
+    queueDescriptor.m_Parameters.m_PadList = {{0, 0}, {0, 0}};
+
+    unsigned int inputDims[] = {1, 1, 4, 4};
+    unsigned int outputDims[] = {4, 1, 2, 2};
+
+    armnn::TensorInfo inputInfo(4, inputDims, armnn::GetDataType<T>());
+    armnn::TensorInfo outputInfo(4, outputDims, armnn::GetDataType<T>());
+
+    std::vector<float> inputValues =
+    {
+        1.0f, 2.0f, 3.0f, 4.0f,
+        5.0f, 6.0f, 7.0f, 8.0f,
+        9.0f, 10.0f, 11.0f, 12.0f,
+        13.0f, 14.0f, 15.0f, 16.0f
+    };
+
+    std::vector<float> expectedValues =
+    {
+        1.0f, 3.0f, 9.0f, 11.0f,
+        2.0f, 4.0f, 10.0f, 12.0f,
+        5.0f, 7.0f, 13.0f, 15.0f,
+        6.0f, 8.0f, 14.0f, 16.0f
+    };
+
+    return SpaceToBatchNdTestImpl<T>(
+        workloadFactory, inputInfo, outputInfo, inputValues, expectedValues, queueDescriptor);
+}
+
+// Input shape {2, 1, 2, 4} with a 2x2 block and a leading width pad of two: the padded columns
+// show up as the leading 0 of every expected output row (output shape {8, 1, 1, 3}).
+template <typename T>
+LayerTestResult<T, 4> SpaceToBatchNdPaddingTest(armnn::IWorkloadFactory& workloadFactory,
+                                                armnn::DataLayout dataLayout = armnn::DataLayout::NCHW)
+{
+    armnn::SpaceToBatchNdQueueDescriptor queueDescriptor;
+    queueDescriptor.m_Parameters.m_DataLayout = dataLayout;
+    queueDescriptor.m_Parameters.m_BlockShape = {2, 2};
+    queueDescriptor.m_Parameters.m_PadList = {{0, 0}, {2, 0}};
+
+    unsigned int inputDims[] = {2, 1, 2, 4};
+    unsigned int outputDims[] = {8, 1, 1, 3};
+
+    armnn::TensorInfo inputInfo(4, inputDims, armnn::GetDataType<T>());
+    armnn::TensorInfo outputInfo(4, outputDims, armnn::GetDataType<T>());
+
+    std::vector<float> inputValues =
+    {
+        1.0f, 2.0f, 3.0f, 4.0f,
+        5.0f, 6.0f, 7.0f, 8.0f,
+        9.0f, 10.0f, 11.0f, 12.0f,
+        13.0f, 14.0f, 15.0f, 16.0f
+    };
+
+    std::vector<float> expectedValues =
+    {
+        0.0f, 1.0f, 3.0f,
+        0.0f, 9.0f, 11.0f,
+        0.0f, 2.0f, 4.0f,
+        0.0f, 10.0f, 12.0f,
+        0.0f, 5.0f, 7.0f,
+        0.0f, 13.0f, 15.0f,
+        0.0f, 6.0f, 8.0f,
+        0.0f, 14.0f, 16.0f
+    };
+
+    return SpaceToBatchNdTestImpl<T>(
+        workloadFactory, inputInfo, outputInfo, inputValues, expectedValues, queueDescriptor);
+}
+
+template <typename T>
+LayerTestResult<T, 4> SpaceToBatchNdSimpleNHWCTest(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdSimpleTest<T>(workloadFactory, armnn::DataLayout::NHWC);
+}
+
+template <typename T>
+LayerTestResult<T, 4> SpaceToBatchNdMultiChannelsNHWCTest(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdMultiChannelsTest<T>(workloadFactory, armnn::DataLayout::NHWC);
+}
+
+template <typename T>
+LayerTestResult<T, 4> SpaceToBatchNdMultiBlockNHWCTest(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdMultiBlockTest<T>(workloadFactory, armnn::DataLayout::NHWC);
+}
+
+template <typename T>
+LayerTestResult<T, 4> SpaceToBatchNdPaddingNHWCTest(armnn::IWorkloadFactory& workloadFactory)
+{
+    return SpaceToBatchNdPaddingTest<T>(workloadFactory, armnn::DataLayout::NHWC);
+}
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index 629903e..0902b0f 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -434,6 +434,19 @@
                                      &TrueFunc<>);
 }
 
+// SpaceToBatchNd support query: delegates to IsSupportedForDataTypeRef with the input data type
+// and TrueFunc<> for both callbacks; output and descriptor are deliberately not examined.
+bool RefLayerSupport::IsSpaceToBatchNdSupported(const TensorInfo& input,
+                                                const TensorInfo& output,
+                                                const SpaceToBatchNdDescriptor& descriptor,
+                                                Optional<std::string&> reasonIfUnsupported) const
+{
+    ignore_unused(descriptor);
+    ignore_unused(output);
+
+    return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), &TrueFunc<>, &TrueFunc<>);
+}
+
 bool RefLayerSupport::IsSplitterSupported(const TensorInfo& input,
                                           const ViewsDescriptor& descriptor,
                                           Optional<std::string&> reasonIfUnsupported) const
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index 0da5998..b161f5c 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -159,6 +159,11 @@
                             const SoftmaxDescriptor& descriptor,
                             Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsSpaceToBatchNdSupported(const TensorInfo& input,
+                                   const TensorInfo& output,
+                                   const SpaceToBatchNdDescriptor& descriptor,
+                                   Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
+
     bool IsSplitterSupported(const TensorInfo& input,
                              const ViewsDescriptor& descriptor,
                              Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 177c5ed..a238d5f 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -217,7 +217,7 @@
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateSpaceToBatchNd(const SpaceToBatchNdQueueDescriptor& descriptor,
     const WorkloadInfo& info) const
 {
-    return MakeWorkload<NullWorkload, NullWorkload>(descriptor, info);
+    return MakeWorkload<RefSpaceToBatchNdFloat32Workload, RefSpaceToBatchNdUint8Workload>(descriptor, info);
 }
 
 std::unique_ptr<IWorkload> RefWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor,
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 007efce..cc8c24f 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -54,9 +54,11 @@
         workloads/RefResizeBilinearUint8Workload.cpp \
         workloads/RefSoftmaxFloat32Workload.cpp \
         workloads/RefSoftmaxUint8Workload.cpp \
+        workloads/RefSpaceToBatchNdWorkload.cpp \
         workloads/RefSplitterFloat32Workload.cpp \
         workloads/RefSplitterUint8Workload.cpp \
         workloads/ResizeBilinear.cpp \
+        workloads/SpaceToBatchNd.cpp \
         workloads/Softmax.cpp
 
 # BACKEND_TEST_SOURCES contains the list of files to be included
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index e79953f..df0e378 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -355,4 +355,25 @@
 
 ARMNN_AUTO_TEST_CASE(AdditionAfterMaxPool, AdditionAfterMaxPoolTest)
 
+// Space To Batch Nd
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdSimpleFloat32, SpaceToBatchNdSimpleFloat32Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdMultiChannelsFloat32, SpaceToBatchNdMultiChannelsFloat32Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdMultiBlockFloat32, SpaceToBatchNdMultiBlockFloat32Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdPaddingFloat32, SpaceToBatchNdPaddingFloat32Test)
+
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdSimpleUint8, SpaceToBatchNdSimpleUint8Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdMultiChannelsUint8, SpaceToBatchNdMultiChannelsUint8Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdMultiBlockUint8, SpaceToBatchNdMultiBlockUint8Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdPaddingUint8, SpaceToBatchNdPaddingUint8Test)
+
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdSimpleNHWCFloat32, SpaceToBatchNdSimpleNHWCFloat32Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdMultiChannelsNHWCFloat32, SpaceToBatchNdMultiChannelsNHWCFloat32Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdMultiBlockNHWCFloat32, SpaceToBatchNdMultiBlockNHWCFloat32Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdPaddingNHWCFloat32, SpaceToBatchNdPaddingNHWCFloat32Test)
+
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdSimpleNHWCUint8, SpaceToBatchNdSimpleNHWCUint8Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdMultiChannelsNHWCUint8, SpaceToBatchNdMultiChannelsNHWCUint8Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdMultiBlockNHWCUint8, SpaceToBatchNdMultiBlockNHWCUint8Test)
+ARMNN_AUTO_TEST_CASE(SpaceToBatchNdPaddingNHWCUint8, SpaceToBatchNdPaddingNHWCUint8Test)
+
 BOOST_AUTO_TEST_SUITE_END()
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index 7dc7289..4cef2d0 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -86,6 +86,8 @@
     RefSoftmaxFloat32Workload.hpp
     RefSoftmaxUint8Workload.cpp
     RefSoftmaxUint8Workload.hpp
+    RefSpaceToBatchNdWorkload.cpp
+    RefSpaceToBatchNdWorkload.hpp
     RefSplitterFloat32Workload.cpp
     RefSplitterFloat32Workload.hpp
     RefSplitterUint8Workload.cpp
@@ -96,6 +98,8 @@
     ResizeBilinear.hpp
     Softmax.cpp
     Softmax.hpp
+    SpaceToBatchNd.hpp
+    SpaceToBatchNd.cpp
     Splitter.hpp
     TensorBufferArrayView.hpp
     Mean.cpp
diff --git a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp
new file mode 100644
index 0000000..fb98118
--- /dev/null
+++ b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp
@@ -0,0 +1,34 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefSpaceToBatchNdWorkload.hpp"
+#include "SpaceToBatchNd.hpp"
+
+#include "RefWorkloadUtils.hpp"
+#include "TypeUtils.hpp"
+
+namespace armnn
+{
+
+// Resolves the element type for DataType, fetches the first input/output tensor info and data
+// pointers from m_Data and forwards them to the SpaceToBatchNd reference implementation.
+template<armnn::DataType DataType>
+void RefSpaceToBatchNdWorkload<DataType>::Execute() const
+{
+    using T = ResolveType<DataType>;
+
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, GetName() + "_Execute");
+
+    const T* const input = GetInputTensorData<T>(0, m_Data);
+    T* const output = GetOutputTensorData<T>(0, m_Data);
+
+    SpaceToBatchNd(GetTensorInfo(m_Data.m_Inputs[0]), GetTensorInfo(m_Data.m_Outputs[0]),
+                   m_Data.m_Parameters, input, output);
+}
+
+template class RefSpaceToBatchNdWorkload<DataType::Float32>;
+template class RefSpaceToBatchNdWorkload<DataType::QuantisedAsymm8>;
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
new file mode 100644
index 0000000..3a08662
--- /dev/null
+++ b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
@@ -0,0 +1,33 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+#pragma once
+
+#include "backendsCommon/Workload.hpp"
+
+#include <armnn/TypesUtils.hpp>
+
+namespace armnn
+{
+
+// Reference-backend SpaceToBatchNd workload for one tensor DataType; Execute() is defined out of line.
+template <armnn::DataType DataType>
+class RefSpaceToBatchNdWorkload : public TypedWorkload<SpaceToBatchNdQueueDescriptor, DataType>
+{
+public:
+    using TypedWorkload<SpaceToBatchNdQueueDescriptor, DataType>::TypedWorkload;
+    using TypedWorkload<SpaceToBatchNdQueueDescriptor, DataType>::m_Data;
+
+    static const std::string& GetName()
+    {
+        static const std::string name = std::string("RefSpaceToBatchNd") + GetDataTypeName(DataType) + "Workload";
+        return name;
+    }
+    void Execute() const override;
+};
+
+using RefSpaceToBatchNdFloat32Workload = RefSpaceToBatchNdWorkload<DataType::Float32>;
+using RefSpaceToBatchNdUint8Workload = RefSpaceToBatchNdWorkload<DataType::QuantisedAsymm8>;
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index 14e6699..03907a6 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -41,6 +41,7 @@
 #include "BatchNormImpl.hpp"
 #include "Activation.hpp"
 #include "Merger.hpp"
+#include "RefSpaceToBatchNdWorkload.hpp"
 #include "RefSplitterFloat32Workload.hpp"
 #include "RefConstantFloat32Workload.hpp"
 #include "RefActivationFloat32Workload.hpp"
diff --git a/src/backends/reference/workloads/SpaceToBatchNd.cpp b/src/backends/reference/workloads/SpaceToBatchNd.cpp
new file mode 100644
index 0000000..48c2127
--- /dev/null
+++ b/src/backends/reference/workloads/SpaceToBatchNd.cpp
@@ -0,0 +1,123 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "SpaceToBatchNd.hpp"
+
+namespace armnn
+{
+
+// Flattens the coordinate (b, h, w, c) into a linear element index for a tensor of the given
+// shape: NHWC nests batch > height > width > channel, any other layout is addressed as
+// batch > channel > height > width.
+unsigned int GetOffset(const TensorShape& shape, unsigned int b, unsigned int h, unsigned int w,
+                       unsigned int c, const DataLayoutIndexed& dataLayout)
+{
+    const unsigned int numChannels = shape[dataLayout.GetChannelsIndex()];
+    const unsigned int height      = shape[dataLayout.GetHeightIndex()];
+    const unsigned int width       = shape[dataLayout.GetWidthIndex()];
+
+    if (dataLayout.GetDataLayout() != DataLayout::NHWC)
+    {
+        // Channel-major ordering: the width coordinate varies fastest.
+        return ((b * numChannels + c) * height + h) * width + w;
+    }
+
+    return ((b * height + h) * width + w) * numChannels + c;
+}
+
+// Reference SpaceToBatchNd. Each output batch outB takes its data from input batch
+// outB % inputBatch and from one fixed (shiftH, shiftW) position inside every
+// blockHeight x blockWidth block, where the block position is outB / inputBatch.
+//
+// inputInfo / outputInfo supply the shapes; params supplies the data layout, the block shape
+// and the leading (top / left) padding, which is the only part of m_PadList read here.
+// inputData and outputData are flat buffers indexed via GetOffset(). Output elements whose
+// source lies in the padded border are set to 0.
+// NOTE(review): for a quantised T a raw 0 is only real zero if the zero point is 0 - confirm
+// whether the output quantisation offset should be written instead.
+template<typename T>
+void SpaceToBatchNd(const TensorInfo& inputInfo,
+                    const TensorInfo& outputInfo,
+                    const SpaceToBatchNdDescriptor& params,
+                    const T* inputData,
+                    T* outputData)
+{
+    DataLayoutIndexed dataLayout = params.m_DataLayout;
+
+    const TensorShape& inShape = inputInfo.GetShape();
+    const TensorShape& outShape = outputInfo.GetShape();
+
+    const unsigned int numChannels = inShape[dataLayout.GetChannelsIndex()];
+    const unsigned int inBatches = inShape[0];
+    const unsigned int inHeight = inShape[dataLayout.GetHeightIndex()];
+    const unsigned int inWidth = inShape[dataLayout.GetWidthIndex()];
+
+    const unsigned int outBatches = outShape[0];
+    const unsigned int outHeight = outShape[dataLayout.GetHeightIndex()];
+    const unsigned int outWidth = outShape[dataLayout.GetWidthIndex()];
+
+    const unsigned int blockH = params.m_BlockShape[0];
+    const unsigned int blockW = params.m_BlockShape[1];
+
+    // Only the leading pads are needed to map padded coordinates back onto the input.
+    const unsigned int padTop = params.m_PadList[0].first;
+    const unsigned int padLeft = params.m_PadList[1].first;
+
+    for (unsigned int outB = 0; outB < outBatches; ++outB)
+    {
+        // The remainder of the output batch index selects the source batch.
+        const unsigned int inB = outB % inBatches;
+
+        // Position inside the block, numbered row-major over the block width.
+        const unsigned int blockPos = outB / inBatches;
+        const unsigned int shiftH = blockPos / blockW;
+        const unsigned int shiftW = blockPos % blockW;
+
+        for (unsigned int outH = 0; outH < outHeight; ++outH)
+        {
+            // Row in padded-input coordinates; readable only when it lies inside the real input.
+            const unsigned int paddedH = outH * blockH + shiftH;
+            const bool rowInside = paddedH >= padTop && paddedH < padTop + inHeight;
+
+            for (unsigned int outW = 0; outW < outWidth; ++outW)
+            {
+                const unsigned int paddedW = outW * blockW + shiftW;
+                const bool inside = rowInside && paddedW >= padLeft && paddedW < padLeft + inWidth;
+
+                // All channels of one spatial position share the same inside / border decision.
+                for (unsigned int c = 0; c < numChannels; ++c)
+                {
+                    const unsigned int outOffset = GetOffset(outShape, outB, outH, outW, c, dataLayout);
+
+                    if (!inside)
+                    {
+                        outputData[outOffset] = 0;
+                        continue;
+                    }
+
+                    // Remove the padding shift to address the unpadded input tensor.
+                    const unsigned int inOffset =
+                        GetOffset(inShape, inB, paddedH - padTop, paddedW - padLeft, c, dataLayout);
+
+                    outputData[outOffset] = inputData[inOffset];
+                }
+            }
+        }
+    }
+}
+
+// Explicit instantiations: the template is defined in this file only, so these are the element
+// types other translation units can link against.
+template void SpaceToBatchNd<float>(const TensorInfo& inputInfo,
+                                    const TensorInfo& outputInfo,
+                                    const SpaceToBatchNdDescriptor& params,
+                                    const float* inputData, float* outputData);
+
+template void SpaceToBatchNd<uint8_t>(const TensorInfo& inputInfo,
+                                      const TensorInfo& outputInfo,
+                                      const SpaceToBatchNdDescriptor& params,
+                                      const uint8_t* inputData, uint8_t* outputData);
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/SpaceToBatchNd.hpp b/src/backends/reference/workloads/SpaceToBatchNd.hpp
new file mode 100644
index 0000000..e74e457
--- /dev/null
+++ b/src/backends/reference/workloads/SpaceToBatchNd.hpp
@@ -0,0 +1,21 @@
+//
+// Copyright © 2017 Arm Ltd. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnn/Descriptors.hpp>
+#include "armnn/Tensor.hpp"
+
+namespace armnn
+{
+
+// Reference SpaceToBatchNd over flat buffers. Defined in SpaceToBatchNd.cpp, which explicitly
+// instantiates it for float and uint8_t only.
+template <typename T>
+void SpaceToBatchNd(const TensorInfo& inputInfo, const TensorInfo& outputInfo,
+                    const SpaceToBatchNdDescriptor& params,
+                    const T* inputData, T* outputData);
+
+} //namespace armnn