IVGCVSW-7785 Extend support for 3D tensors in BATCH_TO_SPACE and SPACE_TO_BATCH in CpuRef

* Both layers previously assumed 4D tensors; 3D tensors are now supported too.
* Remove some unnecessary includes
* Add Unit Tests
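
For intuition, a minimal standalone sketch (plain C++, not the Arm NN API) of
what the new 3D path computes for the unit-test data added below: with an NHC
input of shape [4, 2, 1] and blockShape {4}, BATCH_TO_SPACE interleaves the
four batches into the single spatial dimension, producing shape [1, 8, 1].

    #include <cstdio>
    #include <vector>

    int main()
    {
        // 3D NHC input [4, 2, 1]: four batches of two spatial elements, one channel.
        std::vector<float> in = {1, 2, 3, 4, 5, 6, 7, 8};
        const unsigned int batches = 4, spatial = 2, block = 4;

        // BatchToSpaceNd with blockShape {4} and zero crops -> output [1, 8, 1].
        std::vector<float> out(batches * spatial);
        for (unsigned int b = 0; b < batches; ++b)
        {
            for (unsigned int s = 0; s < spatial; ++s)
            {
                out[s * block + b] = in[b * spatial + s];
            }
        }
        for (float v : out) { std::printf("%.0f ", v); } // 1 3 5 7 2 4 6 8
        std::printf("\n");
    }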

Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: I7bdd11e4936a27cd97ec65fd915e6ccaa1494cff
diff --git a/src/armnn/layers/BatchToSpaceNdLayer.cpp b/src/armnn/layers/BatchToSpaceNdLayer.cpp
index f022c52..b760b56 100644
--- a/src/armnn/layers/BatchToSpaceNdLayer.cpp
+++ b/src/armnn/layers/BatchToSpaceNdLayer.cpp
@@ -1,18 +1,11 @@
 //
-// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2017,2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
 #include "BatchToSpaceNdLayer.hpp"
 #include "LayerCloneBase.hpp"
-#include "LayerWithParameters.hpp"
-#include "BatchToSpaceNdLayer.hpp"
 
-#include <armnn/TypesUtils.hpp>
-
-#include <armnnUtils/DataLayoutIndexed.hpp>
-
-#include <armnn/backends/TensorHandle.hpp>
 #include <armnn/backends/WorkloadData.hpp>
 #include <armnn/backends/WorkloadFactory.hpp>
 
@@ -59,8 +52,6 @@
 
 std::vector<TensorShape> BatchToSpaceNdLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const
 {
-    ARMNN_ASSERT(inputShapes.size() == 1);
-
     const TensorShape& inputShape = inputShapes[0];
     TensorShape outputShape(inputShape);
 
@@ -68,29 +59,18 @@
                                                          m_Param.m_BlockShape.end(),
                                                          1U,
                                                          std::multiplies<>());
+    outputShape[0] = (inputShape[0] / accumulatedBlockShape) < 1 ? 1 : (inputShape[0] / accumulatedBlockShape);
 
-    ARMNN_ASSERT(inputShape[0] % accumulatedBlockShape == 0);
-
-    outputShape[0] = inputShape[0] / accumulatedBlockShape;
-
-    DataLayoutIndexed dimensionIndices = m_Param.m_DataLayout;
-    unsigned int heightIndex = dimensionIndices.GetHeightIndex();
-    unsigned int widthIndex = dimensionIndices.GetWidthIndex();
-
-    unsigned int heightCrop = m_Param.m_Crops[0].first + m_Param.m_Crops[0].second;
-    unsigned int widthCrop = m_Param.m_Crops[1].first + m_Param.m_Crops[1].second;
-
-    unsigned int outputHeight = inputShape[heightIndex] * m_Param.m_BlockShape[0];
-    unsigned int outputWidth = inputShape[widthIndex] * m_Param.m_BlockShape[1];
-
-    ARMNN_ASSERT_MSG(heightCrop <= outputHeight,
-        "BatchToSpaceLayer: Overall height crop should be less than or equal to the uncropped output height.");
-
-    ARMNN_ASSERT_MSG(widthCrop <= outputWidth,
-        "BatchToSpaceLayer: Overall width crop should be less than or equal to the uncropped output width.");
-
-    outputShape[heightIndex] = outputHeight - heightCrop;
-    outputShape[widthIndex] = outputWidth - widthCrop;
+    // In a 4D tensor, there are 2 spatial dimensions (H and W), and the for loop runs twice.
+    // In a 3D tensor, there is 1 spatial dimension, and the for loop runs once.
+    unsigned int firstSpatialDimension = m_Param.m_DataLayout == DataLayout::NCHW ? 2 : 1;
+    for (unsigned int i = 0; i < m_Param.m_BlockShape.size(); ++i)
+    {
+        unsigned int spatialDimension = firstSpatialDimension + i;
+        unsigned int cropSize = m_Param.m_Crops[i].first + m_Param.m_Crops[i].second;
+        unsigned int outputSize = inputShape[spatialDimension] * m_Param.m_BlockShape[i];
+        outputShape[spatialDimension] = outputSize - cropSize;
+    }
 
     return std::vector<TensorShape>({ outputShape });
 }
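
The loop above replaces fixed H/W arithmetic with one pass per block-shape
entry. A hedged standalone sketch of the same rule for the NHWC/NHC case
(firstSpatialDimension == 1; BatchToSpaceShape is an illustrative name, not
Arm NN code), checked against the 3D unit test added later in this patch:

    #include <cstdio>
    #include <utility>
    #include <vector>

    // BatchToSpaceNd shape rule: batch divides by the product of the block
    // shape; each spatial dimension multiplies by its block entry, minus crops.
    std::vector<unsigned int> BatchToSpaceShape(std::vector<unsigned int> shape,
                                                const std::vector<unsigned int>& block,
                                                const std::vector<std::pair<unsigned int, unsigned int>>& crops)
    {
        unsigned int product = 1;
        for (unsigned int b : block) { product *= b; }
        shape[0] = (shape[0] / product < 1) ? 1 : shape[0] / product;
        for (unsigned int i = 0; i < block.size(); ++i) // NHC/NHWC: spatial dims start at index 1
        {
            shape[1 + i] = shape[1 + i] * block[i] - crops[i].first - crops[i].second;
        }
        return shape;
    }

    int main()
    {
        auto s = BatchToSpaceShape({4, 2, 1}, {4}, {{0, 0}}); // 3D: one spatial dim
        std::printf("%u %u %u\n", s[0], s[1], s[2]);          // prints: 1 8 1
    }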
diff --git a/src/armnn/layers/SpaceToBatchNdLayer.cpp b/src/armnn/layers/SpaceToBatchNdLayer.cpp
index 151b6a5..a758617 100644
--- a/src/armnn/layers/SpaceToBatchNdLayer.cpp
+++ b/src/armnn/layers/SpaceToBatchNdLayer.cpp
@@ -1,15 +1,11 @@
 //
-// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2017,2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
 #include "SpaceToBatchNdLayer.hpp"
 #include "LayerCloneBase.hpp"
 
-#include <armnn/TypesUtils.hpp>
-
-#include <armnnUtils/DataLayoutIndexed.hpp>
-
 #include <armnn/backends/WorkloadData.hpp>
 #include <armnn/backends/WorkloadFactory.hpp>
 
@@ -42,9 +38,7 @@
 
 std::vector<TensorShape> SpaceToBatchNdLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const
 {
-    ARMNN_ASSERT(inputShapes.size() == 1);
-
-    TensorShape inputShape = inputShapes[0];
+    const TensorShape& inputShape = inputShapes[0];
     TensorShape outputShape(inputShape);
 
     outputShape[0] = inputShape[0] * std::accumulate(m_Param.m_BlockShape.begin(),
@@ -52,17 +46,16 @@
                                                      1U,
                                                      std::multiplies<>());
 
-    DataLayoutIndexed dimensionIndices = m_Param.m_DataLayout;
-    unsigned int heightIndex = dimensionIndices.GetHeightIndex();
-    unsigned int widthIndex = dimensionIndices.GetWidthIndex();
-
-    std::pair<unsigned int, unsigned int> heightPad = m_Param.m_PadList[0];
-    std::pair<unsigned int, unsigned int> widthPad = m_Param.m_PadList[1];
-
-    outputShape[heightIndex] =
-        (inputShape[heightIndex] + heightPad.first + heightPad.second) / m_Param.m_BlockShape[0];
-    outputShape[widthIndex] =
-        (inputShape[widthIndex] + widthPad.first + widthPad.second) / m_Param.m_BlockShape[1];
+    // In a 4D tensor, there are 2 spatial dimensions (H and W), and the for loop runs twice.
+    // In a 3D tensor, there is 1 spatial dimension, and the for loop runs once.
+    unsigned int firstSpatialDimension = m_Param.m_DataLayout == DataLayout::NCHW ? 2 : 1;
+    for (unsigned int i = 0; i < m_Param.m_BlockShape.size(); ++i)
+    {
+        unsigned int spatialDimension = firstSpatialDimension + i;
+        outputShape[spatialDimension] =
+            (inputShape[spatialDimension] + m_Param.m_PadList[i].first + m_Param.m_PadList[i].second) /
+            m_Param.m_BlockShape[i];
+    }
 
     return std::vector<TensorShape>({ outputShape });
 }
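
SpaceToBatchNd is the inverse rule: batch grows by the block product and each
padded spatial dimension shrinks by its block entry. A matching sketch under
the same assumptions (NHWC/NHC layout; SpaceToBatchShape is an illustrative
name), checked against the 3D unit test added below:

    #include <cstdio>
    #include <utility>
    #include <vector>

    // SpaceToBatchNd shape rule: batch multiplies by the product of the block
    // shape; each padded spatial dimension divides by its block entry.
    std::vector<unsigned int> SpaceToBatchShape(std::vector<unsigned int> shape,
                                                const std::vector<unsigned int>& block,
                                                const std::vector<std::pair<unsigned int, unsigned int>>& pads)
    {
        unsigned int product = 1;
        for (unsigned int b : block) { product *= b; }
        shape[0] *= product;
        for (unsigned int i = 0; i < block.size(); ++i) // NHC/NHWC: spatial dims start at index 1
        {
            shape[1 + i] = (shape[1 + i] + pads[i].first + pads[i].second) / block[i];
        }
        return shape;
    }

    int main()
    {
        auto s = SpaceToBatchShape({1, 8, 1}, {4}, {{0, 0}}); // 3D: one spatial dim
        std::printf("%u %u %u\n", s[0], s[1], s[2]);          // prints: 4 2 1
    }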
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index 6a5963d..d4ae08d 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -1815,47 +1815,66 @@
     const TensorInfo& inputTensorInfo  = workloadInfo.m_InputTensorInfos[0];
     const TensorInfo& outputTensorInfo = workloadInfo.m_OutputTensorInfos[0];
 
-    ValidateTensorNumDimensions(inputTensorInfo,  descriptorName, 4, "input");
-    ValidateTensorNumDimensions(outputTensorInfo, descriptorName, 4, "output");
-
-    if (m_Parameters.m_BlockShape.size() != 2)
-    {
-        throw InvalidArgumentException(descriptorName + ": Block Shape must contain 2 spatial dimensions.");
-    }
-
     if (m_Parameters.m_BlockShape.size() != m_Parameters.m_PadList.size())
     {
         throw InvalidArgumentException(descriptorName + ": Pad List must contain the same number of "
                                        "dimensions as Block Shape.");
     }
 
-    const TensorShape& inputShape = inputTensorInfo.GetShape();
-
-    std::pair<unsigned int, unsigned int> heightPad = m_Parameters.m_PadList[0];
-    std::pair<unsigned int, unsigned int> widthPad  = m_Parameters.m_PadList[1];
-
-    DataLayoutIndexed dimensionIndices(m_Parameters.m_DataLayout);
-
-    const unsigned int inputWidth  = inputShape[dimensionIndices.GetWidthIndex()] +
-                                     widthPad.first + widthPad.second;
-    const unsigned int inputHeight = inputShape[dimensionIndices.GetHeightIndex()] +
-                                     heightPad.first + heightPad.second;
-
-    const unsigned int numInputElements  = inputShape[0] * inputHeight * inputWidth *
-                                           inputShape[dimensionIndices.GetChannelsIndex()];
-    const unsigned int numOutputElements = outputTensorInfo.GetNumElements();
-
-    if (numOutputElements != numInputElements)
+    if (m_Parameters.m_BlockShape.size() == 2)
     {
-        throw InvalidArgumentException(descriptorName + ": Input tensor has " +
-            to_string(numInputElements) + " after padding but output tensor has " +
-            to_string(numOutputElements) + " elements.");
+        ValidateTensorNumDimensions(inputTensorInfo,  descriptorName, 4, "input");
+        ValidateTensorNumDimensions(outputTensorInfo, descriptorName, 4, "output");
+    }
+    else if (m_Parameters.m_BlockShape.size() == 1)
+    {
+        ValidateTensorNumDimensions(inputTensorInfo,  descriptorName, 3, "input");
+        ValidateTensorNumDimensions(outputTensorInfo, descriptorName, 3, "output");
+    }
+    else
+    {
+        throw InvalidArgumentException(descriptorName + ": Invalid Block and Pads size.");
     }
 
-    if (inputHeight % m_Parameters.m_BlockShape[0] != 0 || inputWidth % m_Parameters.m_BlockShape[1] != 0)
+    // Check input + padding and output have the same number of elements
+    DataLayoutIndexed dimensionIndices(m_Parameters.m_DataLayout);
+    const unsigned int inputHeight = inputTensorInfo.GetShape()[dimensionIndices.GetHeightIndex()] +
+                                     m_Parameters.m_PadList[0].first + m_Parameters.m_PadList[0].second;
+    const unsigned int inputWidth = (inputTensorInfo.GetNumDimensions() == 3) ? 1 :
+                                    inputTensorInfo.GetShape()[dimensionIndices.GetWidthIndex()] +
+                                    m_Parameters.m_PadList[1].first + m_Parameters.m_PadList[1].second;
+
+    // Channels sits at dimension 1 for NCHW/NCH and at the last dimension for NHWC/NHC.
+    const unsigned int channelsIndex = (m_Parameters.m_DataLayout == DataLayout::NCHW)
+                                       ? 1
+                                       : inputTensorInfo.GetNumDimensions() - 1;
+
+    const unsigned int numInputElements = inputTensorInfo.GetShape()[0] *
+                                          inputHeight *
+                                          inputWidth *
+                                          inputTensorInfo.GetShape()[channelsIndex];
+
+    if (outputTensorInfo.GetNumElements() != numInputElements)
     {
-        throw InvalidArgumentException(descriptorName + ": Input shape after padding must be "
-                                       "divisible by Block Shape in all spatial dimensions");
+        throw InvalidArgumentException(descriptorName + ": Input tensor has " +
+                                       to_string(numInputElements) + " elements after padding but output tensor has " +
+                                       to_string(outputTensorInfo.GetNumElements()) + " elements.");
+    }
+
+    // In a 4D tensor, there are 2 spatial dimensions (H and W), and the for loop runs twice.
+    // In a 3D tensor, there is 1 spatial dimension, and the for loop runs once.
+    unsigned int firstSpatialDimension = m_Parameters.m_DataLayout == DataLayout::NCHW ? 2 : 1;
+    for (unsigned int i = 0; i < m_Parameters.m_BlockShape.size(); ++i)
+    {
+        unsigned int spatialDimension = firstSpatialDimension + i;
+        auto inputSize = inputTensorInfo.GetShape()[spatialDimension] +
+                         m_Parameters.m_PadList[i].first +
+                         m_Parameters.m_PadList[i].second;
+        if (inputSize % m_Parameters.m_BlockShape[i] != 0)
+        {
+            throw InvalidArgumentException(descriptorName + ": Input dimension size after padding must be "
+                                           "divisible by Block Shape in dimension: " + to_string(spatialDimension) + ".");
+        }
     }
 
     std::vector<DataType> supportedTypes =
@@ -2472,6 +2491,42 @@
     const TensorInfo& inputTensorInfo  = workloadInfo.m_InputTensorInfos[0];
     const TensorInfo& outputTensorInfo = workloadInfo.m_OutputTensorInfos[0];
 
+    if (m_Parameters.m_BlockShape.size() != m_Parameters.m_Crops.size())
+    {
+        throw InvalidArgumentException(descriptorName + ": Crops must contain the same number of "
+                                                        "dimensions as Block Shape.");
+    }
+
+    if (m_Parameters.m_BlockShape.size() == 2)
+    {
+        ValidateTensorNumDimensions(inputTensorInfo,  descriptorName, 4, "input");
+        ValidateTensorNumDimensions(outputTensorInfo, descriptorName, 4, "output");
+    }
+    else if (m_Parameters.m_BlockShape.size() == 1)
+    {
+        ValidateTensorNumDimensions(inputTensorInfo,  descriptorName, 3, "input");
+        ValidateTensorNumDimensions(outputTensorInfo, descriptorName, 3, "output");
+    }
+    else
+    {
+        throw InvalidArgumentException(descriptorName + ": Invalid Block and Crops size.");
+    }
+
+    // In a 4D tensor, there are 2 spatial dimensions (H and W), and the for loop runs twice.
+    // In a 3D tensor, there is 1 spatial dimension, and the for loop runs once.
+    unsigned int firstSpatialDimension = m_Parameters.m_DataLayout == DataLayout::NCHW ? 2 : 1;
+    for (unsigned int i = 0; i < m_Parameters.m_BlockShape.size(); ++i)
+    {
+        unsigned int spatialDimension = firstSpatialDimension + i;
+        unsigned int cropSize = m_Parameters.m_Crops[i].first + m_Parameters.m_Crops[i].second;
+        unsigned int outputSize = inputTensorInfo.GetShape()[spatialDimension] * m_Parameters.m_BlockShape[i];
+        if (cropSize > outputSize)
+        {
+            throw InvalidArgumentException(descriptorName + ": CropSize must be less than or equal to the "
+                                           "uncropped outputSize in dimension: " + to_string(spatialDimension) + ".");
+        }
+    }
+
     std::vector<DataType> supportedTypes =
     {
         DataType::BFloat16,
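
The validation above keys off the block-shape size: one entry implies rank-3
tensors, two entries rank-4 tensors, and each padded spatial extent must
divide evenly by its block entry. A condensed, hedged sketch of that rule
outside Arm NN (ValidateSpaceToBatch is a hypothetical helper with simplified
error strings):

    #include <stdexcept>
    #include <string>
    #include <utility>
    #include <vector>

    void ValidateSpaceToBatch(const std::vector<unsigned int>& shape,
                              const std::vector<unsigned int>& block,
                              const std::vector<std::pair<unsigned int, unsigned int>>& pads,
                              bool nchw)
    {
        if (block.size() != 1 && block.size() != 2)
        {
            throw std::invalid_argument("Invalid Block and Pads size.");
        }
        if (shape.size() != block.size() + 2) // 1 block entry -> rank 3, 2 -> rank 4
        {
            throw std::invalid_argument("Tensor rank does not match Block Shape.");
        }
        const unsigned int firstSpatial = nchw ? 2 : 1;
        for (unsigned int i = 0; i < block.size(); ++i)
        {
            const unsigned int padded = shape[firstSpatial + i] + pads[i].first + pads[i].second;
            if (padded % block[i] != 0)
            {
                throw std::invalid_argument("Padded size not divisible by Block Shape in dimension " +
                                            std::to_string(firstSpatial + i));
            }
        }
    }

For example, shape {1, 8, 1} with block {4} and zero pads passes, while a
spatial extent of 7 would throw.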
diff --git a/src/backends/backendsCommon/test/layerTests/BatchToSpaceNdTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/BatchToSpaceNdTestImpl.hpp
index b300777..4f8b7d0 100644
--- a/src/backends/backendsCommon/test/layerTests/BatchToSpaceNdTestImpl.hpp
+++ b/src/backends/backendsCommon/test/layerTests/BatchToSpaceNdTestImpl.hpp
@@ -279,6 +279,33 @@
 }
 
 template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> BatchToSpaceNdNhwcTest8(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    const unsigned int inputShape[] = {4, 2, 1};
+    const unsigned int outputShape[] = {1, 8, 1};
+
+    std::vector<float> input({
+                                  1.0f,  2.0f,  3.0f,  4.0f,
+                                  5.0f,  6.0f,  7.0f,  8.0f
+                             });
+
+    std::vector<float> expectedOutput({
+                                           1.0f,  3.0f,  5.0f,  7.0f,
+                                           2.0f,  4.0f,  6.0f,  8.0f
+                                      });
+
+    std::vector<unsigned int> blockShape {4};
+    std::vector<std::pair<unsigned int, unsigned int>> crops = {{0, 0}};
+
+    return BatchToSpaceNdHelper<ArmnnType, 3, 3>(workloadFactory, memoryManager, tensorHandleFactory,
+                                                 armnn::DataLayout::NHWC, inputShape, input, blockShape,
+                                                 crops, outputShape, expectedOutput);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
 LayerTestResult<T, 4> BatchToSpaceNdNchwTest1(
         armnn::IWorkloadFactory &workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
diff --git a/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.cpp
index 92876e1..4e40692 100644
--- a/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.cpp
+++ b/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.cpp
@@ -88,6 +88,59 @@
                                  outputTensorInfo.GetShape());
 }
 
+template<typename T>
+LayerTestResult<T, 3> SpaceToBatchNd3DTestImpl(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory,
+        armnn::TensorInfo& inputTensorInfo,
+        armnn::TensorInfo& outputTensorInfo,
+        std::vector<float>& inputData,
+        std::vector<float>& outputExpectedData,
+        armnn::SpaceToBatchNdQueueDescriptor descriptor,
+        const float qScale = 1.0f,
+        const int32_t qOffset = 0)
+{
+    IgnoreUnused(memoryManager);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        inputTensorInfo.SetQuantizationScale(qScale);
+        inputTensorInfo.SetQuantizationOffset(qOffset);
+        outputTensorInfo.SetQuantizationScale(qScale);
+        outputTensorInfo.SetQuantizationOffset(qOffset);
+    }
+
+    std::vector<T> input = armnnUtils::QuantizedVector<T>(inputData, qScale, qOffset);
+    std::vector<T> expectedOutput = armnnUtils::QuantizedVector<T>(outputExpectedData, qScale, qOffset);
+    std::vector<T> actualOutput(outputTensorInfo.GetNumElements());
+
+    std::unique_ptr<armnn::ITensorHandle> inputHandle  = tensorHandleFactory.CreateTensorHandle(inputTensorInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputTensorInfo);
+
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(descriptor, info, inputTensorInfo, inputHandle.get());
+    AddOutputToWorkload(descriptor, info, outputTensorInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateWorkload(armnn::LayerType::SpaceToBatchNd,
+                                                                                descriptor,
+                                                                                info);
+
+    inputHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(inputHandle.get(), input.data());
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get());
+
+    return LayerTestResult<T, 3>(actualOutput,
+                                 expectedOutput,
+                                 outputHandle->GetShape(),
+                                 outputTensorInfo.GetShape());
+}
+
 template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
 LayerTestResult<T, 4> SpaceToBatchNdSimpleTest(
     armnn::IWorkloadFactory& workloadFactory,
@@ -254,6 +307,44 @@
 }
 
 template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> SpaceToBatchNdSimple3DTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory,
+        armnn::DataLayout dataLayout = armnn::DataLayout::NHWC)
+{
+    armnn::TensorInfo inputTensorInfo;
+    armnn::TensorInfo outputTensorInfo;
+
+    unsigned int inputShape[] = {1, 8, 1};
+    unsigned int outputShape[] = {4, 2, 1};
+
+    armnn::SpaceToBatchNdQueueDescriptor desc;
+    desc.m_Parameters.m_DataLayout = dataLayout;
+    desc.m_Parameters.m_BlockShape = {4};
+    desc.m_Parameters.m_PadList = {{0, 0}};
+
+    inputTensorInfo = armnn::TensorInfo(3, inputShape, ArmnnType);
+    outputTensorInfo = armnn::TensorInfo(3, outputShape, ArmnnType);
+
+    std::vector<float> input = std::vector<float>(
+            {
+                    1.0f,  3.0f,  5.0f,  7.0f,
+                    2.0f,  4.0f,  6.0f,  8.0f
+            });
+
+    std::vector<float> outputExpected = std::vector<float>(
+            {
+                    1.0f,  2.0f,  3.0f,  4.0f,
+                    5.0f,  6.0f,  7.0f,  8.0f
+            });
+
+    return SpaceToBatchNd3DTestImpl<T>(
+            workloadFactory, memoryManager, tensorHandleFactory,
+            inputTensorInfo, outputTensorInfo, input, outputExpected, desc);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
 LayerTestResult<T, 4> SpaceToBatchNdSimpleNhwcTest(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
@@ -463,6 +554,16 @@
                                                                    tensorHandleFactory);
 }
 
+LayerTestResult<float, 3> SpaceToBatchNdSimpleNhwc3DFloat32Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return SpaceToBatchNdSimple3DTest<armnn::DataType::Float32>(workloadFactory,
+                                                                memoryManager,
+                                                                tensorHandleFactory);
+}
+
 LayerTestResult<armnn::Half, 4> SpaceToBatchNdSimpleNhwcFloat16Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
@@ -503,6 +604,16 @@
                                                                    tensorHandleFactory);
 }
 
+LayerTestResult<armnn::Half, 3> SpaceToBatchNdSimpleNhwc3DFloat16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return SpaceToBatchNdSimple3DTest<armnn::DataType::Float16>(workloadFactory,
+                                                                memoryManager,
+                                                                tensorHandleFactory);
+}
+
 LayerTestResult<uint8_t, 4> SpaceToBatchNdSimpleNhwcUint8Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
@@ -543,6 +654,16 @@
                                                                     tensorHandleFactory);
 }
 
+LayerTestResult<uint8_t, 3> SpaceToBatchNdSimpleNhwc3DUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    return SpaceToBatchNdSimple3DTest<armnn::DataType::QAsymmU8>(workloadFactory,
+                                                                 memoryManager,
+                                                                 tensorHandleFactory);
+}
+
 LayerTestResult<int16_t, 4> SpaceToBatchNdSimpleUint16Test(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
diff --git a/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.hpp
index 7768b16..4e87d6a 100644
--- a/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.hpp
+++ b/src/backends/backendsCommon/test/layerTests/SpaceToBatchNdTestImpl.hpp
@@ -91,6 +91,11 @@
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
     const armnn::ITensorHandleFactory& tensorHandleFactory);
 
+LayerTestResult<float, 3> SpaceToBatchNdSimpleNhwc3DFloat32Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
 LayerTestResult<armnn::Half, 4> SpaceToBatchNdSimpleNhwcFloat16Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
@@ -111,6 +116,11 @@
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
     const armnn::ITensorHandleFactory& tensorHandleFactory);
 
+LayerTestResult<armnn::Half, 3> SpaceToBatchNdSimpleNhwc3DFloat16Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
 LayerTestResult<uint8_t, 4> SpaceToBatchNdSimpleNhwcUint8Test(
     armnn::IWorkloadFactory& workloadFactory,
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
@@ -131,6 +141,11 @@
     const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
     const armnn::ITensorHandleFactory& tensorHandleFactory);
 
+LayerTestResult<uint8_t, 3> SpaceToBatchNdSimpleNhwc3DUint8Test(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
 LayerTestResult<int16_t, 4> SpaceToBatchNdSimpleUint16Test(
         armnn::IWorkloadFactory& workloadFactory,
         const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index cbc6723..81e5c83 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -794,20 +794,6 @@
     supported &= CheckSupportRule(TypesAreEqual(input, output), reasonIfUnsupported,
                                   "Reference BatchToSpaceNd: input and output types mismatched.");
 
-    supported &= CheckSupportRule(TensorNumDimensionsAreCorrect(output, 4),
-                                  reasonIfUnsupported,
-                                  CreateIncorrectDimensionsErrorMsg(4,
-                                                                    output.GetNumDimensions(),
-                                                                    batchToSpaceNdLayerStr,
-                                                                    outputTensorStr).data());
-
-    supported &= CheckSupportRule(TensorNumDimensionsAreCorrect(input, 4),
-                                  reasonIfUnsupported,
-                                  CreateIncorrectDimensionsErrorMsg(4,
-                                                                    input.GetNumDimensions(),
-                                                                    batchToSpaceNdLayerStr,
-                                                                    inputTensorStr).data());
-
     return supported;
 }
 
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 8b89743..6e69772 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -1938,16 +1938,19 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdMultiChannelsNhwcFloat32, SpaceToBatchNdMultiChannelsNhwcFloat32Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdMultiBlockNhwcFloat32, SpaceToBatchNdMultiBlockNhwcFloat32Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdPaddingNhwcFloat32, SpaceToBatchNdPaddingNhwcFloat32Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdSimpleNhwc3DFloat32, SpaceToBatchNdSimpleNhwc3DFloat32Test)
 
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdSimpleNhwcFloat16, SpaceToBatchNdSimpleNhwcFloat16Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdMultiChannelsNhwcFloat16, SpaceToBatchNdMultiChannelsNhwcFloat16Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdMultiBlockNhwcFloat16, SpaceToBatchNdMultiBlockNhwcFloat16Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdPaddingNhwcFloat16, SpaceToBatchNdPaddingNhwcFloat16Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdSimpleNhwc3DFloat16, SpaceToBatchNdSimpleNhwc3DFloat16Test)
 
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdSimpleNhwcUint8, SpaceToBatchNdSimpleNhwcUint8Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdMultiChannelsNhwcUint8, SpaceToBatchNdMultiChannelsNhwcUint8Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdMultiBlockNhwcUint8, SpaceToBatchNdMultiBlockNhwcUint8Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdPaddingNhwcUint8, SpaceToBatchNdPaddingNhwcUint8Test)
+ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdSimpleNhwc3DUint8, SpaceToBatchNdSimpleNhwc3DUint8Test)
 
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdSimpleUint16, SpaceToBatchNdSimpleUint16Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(SpaceToBatchNdMultiChannelsUint16, SpaceToBatchNdMultiChannelsUint16Test)
@@ -1967,6 +1970,7 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcFloat32_5, BatchToSpaceNdNhwcTest5<DataType::Float32>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcFloat32_6, BatchToSpaceNdNhwcTest6<DataType::Float32>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcFloat32_7, BatchToSpaceNdNhwcTest7<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcFloat32_3D, BatchToSpaceNdNhwcTest8<DataType::Float32>)
 
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcFloat16_1, BatchToSpaceNdNhwcTest1<DataType::Float16>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcFloat16_2, BatchToSpaceNdNhwcTest2<DataType::Float16>)
@@ -1975,6 +1979,7 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcFloat16_5, BatchToSpaceNdNhwcTest5<DataType::Float16>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcFloat16_6, BatchToSpaceNdNhwcTest6<DataType::Float16>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcFloat16_7, BatchToSpaceNdNhwcTest7<DataType::Float16>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcFloat16_3D, BatchToSpaceNdNhwcTest8<DataType::Float16>)
 
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcInt1,  BatchToSpaceNdNhwcTest1<DataType::QAsymmS8>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcInt2,  BatchToSpaceNdNhwcTest2<DataType::QAsymmS8>)
@@ -1983,6 +1988,7 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcInt5,  BatchToSpaceNdNhwcTest5<DataType::QAsymmS8>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcInt6,  BatchToSpaceNdNhwcTest6<DataType::QAsymmS8>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcInt7,  BatchToSpaceNdNhwcTest7<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcInt_3D, BatchToSpaceNdNhwcTest8<DataType::QAsymmS8>)
 
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcUint1,  BatchToSpaceNdNhwcTest1<DataType::QAsymmU8>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcUint2,  BatchToSpaceNdNhwcTest2<DataType::QAsymmU8>)
@@ -1991,6 +1997,7 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcUint5,  BatchToSpaceNdNhwcTest5<DataType::QAsymmU8>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcUint6,  BatchToSpaceNdNhwcTest6<DataType::QAsymmU8>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcUint7,  BatchToSpaceNdNhwcTest7<DataType::QAsymmU8>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcUint_3D,  BatchToSpaceNdNhwcTest8<DataType::QAsymmU8>)
 
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcQsymm16_1,  BatchToSpaceNdNhwcTest1<DataType::QSymmS16>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcQsymm16_2,  BatchToSpaceNdNhwcTest2<DataType::QSymmS16>)
@@ -1999,6 +2006,7 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcQsymm16_5,  BatchToSpaceNdNhwcTest5<DataType::QSymmS16>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcQsymm16_6,  BatchToSpaceNdNhwcTest6<DataType::QSymmS16>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcQsymm16_7,  BatchToSpaceNdNhwcTest7<DataType::QSymmS16>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNhwcQsymm16_3D,  BatchToSpaceNdNhwcTest8<DataType::QSymmS16>)
 
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNchwFloat16_1, BatchToSpaceNdNchwTest1<DataType::Float16>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(BatchToSpaceNdNchwFloat16_2, BatchToSpaceNdNchwTest2<DataType::Float16>)
diff --git a/src/backends/reference/workloads/BatchToSpaceNd.cpp b/src/backends/reference/workloads/BatchToSpaceNd.cpp
index bf7de1b..ebe9d2c 100644
--- a/src/backends/reference/workloads/BatchToSpaceNd.cpp
+++ b/src/backends/reference/workloads/BatchToSpaceNd.cpp
@@ -1,85 +1,105 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017-2020,2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
 #include "BatchToSpaceNd.hpp"
 
-#include "RefWorkloadUtils.hpp"
-
-#include <armnn/Types.hpp>
-
-#include <armnn/utility/Assert.hpp>
+#include <armnnUtils/DataLayoutIndexed.hpp>
 
 using namespace armnnUtils;
 
 namespace armnn
 {
 
-inline unsigned int Offset(const TensorShape& shape, unsigned int batch, unsigned int height, unsigned int width,
-        unsigned int channels, const DataLayoutIndexed& dataLayout)
+unsigned int Offset(const TensorShape& shape,
+                    unsigned int batch,
+                    unsigned int height,
+                    unsigned int width,
+                    unsigned int channels,
+                    const DataLayoutIndexed& dataLayout)
 {
-    if (dataLayout.GetDataLayout() == DataLayout::NHWC)
+    // 3D Tensors
+    unsigned int channelDimension3D = dataLayout.GetDataLayout() == DataLayout::NCHW ? 1 : 2;
+    if (shape.GetNumDimensions() == 3)
     {
-        return ((batch * shape[dataLayout.GetHeightIndex()] + height) * shape[dataLayout.GetWidthIndex()] + width) *
-               shape[dataLayout.GetChannelsIndex()] + channels;
+        return (batch * shape[dataLayout.GetHeightIndex()] + height) * shape[channelDimension3D] + channels;
+    }
+    // 4D Tensors
+    else if (shape.GetNumDimensions() == 4)
+    {
+        if (dataLayout.GetDataLayout() == DataLayout::NHWC)
+        {
+            return ((batch * shape[dataLayout.GetHeightIndex()] + height) *
+                    shape[dataLayout.GetWidthIndex()] + width) *
+                    shape[dataLayout.GetChannelsIndex()] + channels;
+        }
+        else
+        {
+            return ((batch * shape[dataLayout.GetChannelsIndex()] + channels) *
+                    shape[dataLayout.GetHeightIndex()] + height) *
+                    shape[dataLayout.GetWidthIndex()] + width;
+        }
     }
     else
     {
-        return ((batch * shape[dataLayout.GetChannelsIndex()] + channels) *
-               shape[dataLayout.GetHeightIndex()] + height) *
-               shape[dataLayout.GetWidthIndex()] + width;
+        throw InvalidArgumentException("Tensor rank must be either 3 or 4", CHECK_LOCATION());
     }
 }
 
-void BatchToSpaceNd(const DataLayoutIndexed& dataLayout,
-                    const TensorInfo& inputTensorInfo,
-                    const TensorInfo& outputTensorInfo,
-                    const std::vector<unsigned int>& blockShape,
-                    const std::vector<std::pair<unsigned int, unsigned int>>& cropsData,
-                    Decoder<float>& inputDecoder,
-                    Encoder<float>& outputEncoder)
+void BatchToSpaceNd(const TensorInfo& inputInfo,
+                    const TensorInfo& outputInfo,
+                    const BatchToSpaceNdDescriptor& params,
+                    Decoder<float>& inputData,
+                    Encoder<float>& outputData)
 {
-    TensorShape inputShape = inputTensorInfo.GetShape();
+    unsigned int rank = inputInfo.GetNumDimensions();
+    if (rank != 3 && rank != 4)
+    {
+        throw InvalidArgumentException("Tensor rank must be either 3 or 4, but it is " + std::to_string(rank),
+                                       CHECK_LOCATION());
+    }
 
-    ARMNN_ASSERT_MSG(inputShape.GetNumDimensions() == 4, "Expected Input with 4 Dimensions");
+    DataLayoutIndexed dataLayout = params.m_DataLayout;
+    unsigned int channelDimension3D = params.m_DataLayout == DataLayout::NCHW ? 1 : 2;
 
-    TensorShape outputShape = outputTensorInfo.GetShape();
+    TensorShape inputShape = inputInfo.GetShape();
+    TensorShape outputShape = outputInfo.GetShape();
 
-    ARMNN_ASSERT_MSG(outputShape.GetNumDimensions() == 4, "Expected Output with 4 Dimensions");
-
-    const unsigned int inputBatchSize = inputShape[0];
-    const unsigned int channels = inputShape[dataLayout.GetChannelsIndex()];
-
+    const unsigned int inputBatchSize  = inputShape[0];
     const unsigned int outputBatchSize = outputShape[0];
+
+    const unsigned int channels = (rank == 3) ? inputShape[channelDimension3D]
+                                              : inputShape[dataLayout.GetChannelsIndex()];
+
+    const unsigned int inputHeight  = inputShape[dataLayout.GetHeightIndex()];
+    const unsigned int inputWidth   = (rank == 3) ? 1 : inputShape[dataLayout.GetWidthIndex()];
     const unsigned int outputHeight = outputShape[dataLayout.GetHeightIndex()];
-    const unsigned int outputWidth = outputShape[dataLayout.GetWidthIndex()];
+    const unsigned int outputWidth  = (rank == 3) ? 1 : outputShape[dataLayout.GetWidthIndex()];
 
-    ARMNN_ASSERT_MSG(blockShape.size() > 0, "BlockShape must contain 1 or more entries");
+    const unsigned int blockHeight = params.m_BlockShape[0];
+    const unsigned int blockWidth  = (rank == 3) ? 1 : params.m_BlockShape[1];
 
-    const unsigned int blockShapeHeight = blockShape[0];
-    const unsigned int blockShapeWidth = blockShape[1];
-
-    ARMNN_ASSERT_MSG(cropsData.size() > 0, "Crops must contain 1 or more entries");
-
-    const unsigned int cropsTop = cropsData[0].first;
-    const unsigned int cropsLeft = cropsData[1].first;
+    const unsigned int cropsTop  = params.m_Crops[0].first;
+    const unsigned int cropsLeft = (rank == 3) ? 0 : params.m_Crops[1].first;
 
     for (unsigned int inBatch = 0; inBatch < inputBatchSize; ++inBatch)
     {
         const unsigned int outBatch = inBatch % outputBatchSize;
         const unsigned int spatialOffset = inBatch / outputBatchSize;
 
-        for (unsigned int inH = 0; inH < inputTensorInfo.GetShape()[dataLayout.GetHeightIndex()]; ++inH) {
-            const unsigned int outH = inH * blockShapeHeight + spatialOffset / blockShapeWidth - cropsTop;
+        for (unsigned int inH = 0; inH < inputHeight; ++inH)
+        {
+            const unsigned int outH = inH * blockHeight + spatialOffset / blockWidth - cropsTop;
 
             if (outH >= outputHeight)
             {
                 continue;
             }
 
-            for (unsigned int inW = 0; inW < inputTensorInfo.GetShape()[dataLayout.GetWidthIndex()]; ++inW) {
-                const unsigned int outW = inW * blockShapeWidth + spatialOffset % blockShapeWidth - cropsLeft;
+            for (unsigned int inW = 0; inW < inputWidth; ++inW)
+            {
+                const unsigned int outW = inW * blockWidth + spatialOffset % blockWidth - cropsLeft;
 
                 if (outW >= outputWidth)
                 {
@@ -91,9 +111,10 @@
                     unsigned int outOffset = Offset(outputShape, outBatch, outH, outW, c, dataLayout);
                     unsigned int inOffset = Offset(inputShape, inBatch, inH, inW, c, dataLayout);
 
-                    outputEncoder[outOffset];
-                    inputDecoder[inOffset];
-                    outputEncoder.Set(inputDecoder.Get());
+                    // operator[] repositions the encoder/decoder at the given flat offset
+                    outputData[outOffset];
+                    inputData[inOffset];
+                    outputData.Set(inputData.Get());
                 }
             }
         }
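
The rank-3 branch of Offset collapses the width term: for an NHC tensor the
flat index is (batch * spatial + s) * channels + c. A small hedged check of
that arithmetic (OffsetNhc is an illustrative helper, not the workload code):

    #include <cassert>

    // Flat index into a rank-3 NHC tensor (batch, spatial, channels).
    unsigned int OffsetNhc(unsigned int spatial, unsigned int channels,
                           unsigned int b, unsigned int s, unsigned int c)
    {
        return (b * spatial + s) * channels + c;
    }

    int main()
    {
        // In a [4, 2, 1] NHC tensor, batch 2 / spatial 1 / channel 0
        // is flat element 5 ((2 * 2 + 1) * 1 + 0).
        assert(OffsetNhc(2, 1, 2, 1, 0) == 5);
    }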
diff --git a/src/backends/reference/workloads/BatchToSpaceNd.hpp b/src/backends/reference/workloads/BatchToSpaceNd.hpp
index 0fcef58..acacda4 100644
--- a/src/backends/reference/workloads/BatchToSpaceNd.hpp
+++ b/src/backends/reference/workloads/BatchToSpaceNd.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017-2019,2021,2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -9,21 +9,15 @@
 #include "Decoders.hpp"
 #include "Encoders.hpp"
 
-#include <armnn/Types.hpp>
-
-#include <armnnUtils/DataLayoutIndexed.hpp>
-
-#include <armnn/backends/Workload.hpp>
-#include <armnn/backends/WorkloadData.hpp>
+#include <armnn/Descriptors.hpp>
 
 namespace armnn
 {
 
-void BatchToSpaceNd(const armnnUtils::DataLayoutIndexed& dataLayout,
-                    const TensorInfo& inputTensorInfo,
-                    const TensorInfo& outputTensorInfo,
-                    const std::vector<unsigned int>& blockShape,
-                    const std::vector<std::pair<unsigned int, unsigned int>>& cropsData,
-                    Decoder<float>& inputDecoder,
-                    Encoder<float>& outputEncoder);
+void BatchToSpaceNd(const TensorInfo& inputInfo,
+                    const TensorInfo& outputInfo,
+                    const BatchToSpaceNdDescriptor& params,
+                    Decoder<float>& inputData,
+                    Encoder<float>& outputData);
+
 } // namespace armnn
diff --git a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp
index 72c7a76..6bb8aff 100644
--- a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp
+++ b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.cpp
@@ -1,11 +1,11 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2018-2019,2021-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
-#include "BatchToSpaceNd.hpp"
-#include "Profiling.hpp"
 #include "RefBatchToSpaceNdWorkload.hpp"
+#include "BatchToSpaceNd.hpp"
+
 #include "RefWorkloadUtils.hpp"
 
 namespace armnn
@@ -32,8 +32,7 @@
     std::unique_ptr<Decoder<float>> inputDecoder  = MakeDecoder<float>(inputInfo, inputs[0]->Map());
     std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
-    BatchToSpaceNd(m_Data.m_Parameters.m_DataLayout, inputInfo, outputInfo, m_Data.m_Parameters.m_BlockShape,
-                   m_Data.m_Parameters.m_Crops, *inputDecoder, *outputEncoder);
+    BatchToSpaceNd(inputInfo, outputInfo, m_Data.m_Parameters, *inputDecoder, *outputEncoder);
 }
 
 
diff --git a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
index ac6aad3..5fb5835 100644
--- a/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
+++ b/src/backends/reference/workloads/RefBatchToSpaceNdWorkload.hpp
@@ -1,14 +1,14 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2018-2019,2021-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
 #pragma once
 
 #include "RefBaseWorkload.hpp"
-#include <armnn/backends/WorkloadData.hpp>
 
-namespace armnn {
+namespace armnn
+{
 
 class RefBatchToSpaceNdWorkload : public RefBaseWorkload<BatchToSpaceNdQueueDescriptor>
 {
diff --git a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp
index 6aa422a..d29c2c8 100644
--- a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp
+++ b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2018-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -7,7 +7,6 @@
 #include "SpaceToBatchNd.hpp"
 
 #include "RefWorkloadUtils.hpp"
-#include <ResolveType.hpp>
 
 namespace armnn
 {
@@ -28,12 +27,12 @@
     ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefSpaceToBatchNdWorkload_Execute");
 
     const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
-    std::unique_ptr<Decoder<float>> decoder = MakeDecoder<float>(inputInfo, inputs[0]->Map());
-
     const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
-    std::unique_ptr<Encoder<float>> encoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
 
-    SpaceToBatchNd(inputInfo, outputInfo, m_Data.m_Parameters, *decoder, *encoder);
+    std::unique_ptr<Decoder<float>> inputDecoder = MakeDecoder<float>(inputInfo, inputs[0]->Map());
+    std::unique_ptr<Encoder<float>> outputEncoder = MakeEncoder<float>(outputInfo, outputs[0]->Map());
+
+    SpaceToBatchNd(inputInfo, outputInfo, m_Data.m_Parameters, *inputDecoder, *outputEncoder);
 }
 
 } //namespace armnn
diff --git a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
index f2c8768..f9d75ee 100644
--- a/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
+++ b/src/backends/reference/workloads/RefSpaceToBatchNdWorkload.hpp
@@ -1,13 +1,11 @@
 //
-// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2018-2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 #pragma once
 
 #include "RefBaseWorkload.hpp"
 
-#include <armnn/TypesUtils.hpp>
-
 namespace armnn
 {
 
@@ -15,8 +13,10 @@
 {
 public:
     using RefBaseWorkload<SpaceToBatchNdQueueDescriptor>::RefBaseWorkload;
+
     void Execute() const override;
     void ExecuteAsync(ExecutionData& executionData)  override;
+
 private:
     void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
 };
diff --git a/src/backends/reference/workloads/SpaceToBatchNd.cpp b/src/backends/reference/workloads/SpaceToBatchNd.cpp
index b6bab17..c3f022c 100644
--- a/src/backends/reference/workloads/SpaceToBatchNd.cpp
+++ b/src/backends/reference/workloads/SpaceToBatchNd.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017-2019,2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -19,15 +19,29 @@
                        unsigned int c,
                        const DataLayoutIndexed& dataLayout)
 {
-    if (dataLayout.GetDataLayout() == DataLayout::NHWC)
+    // 3D Tensors
+    unsigned int channelDimension3D = dataLayout.GetDataLayout() == DataLayout::NCHW ? 1 : 2;
+    if (shape.GetNumDimensions() == 3)
     {
-        return ((b * shape[dataLayout.GetHeightIndex()] + h) * shape[dataLayout.GetWidthIndex()] + w) *
-               shape[dataLayout.GetChannelsIndex()] + c;
+        return (b * shape[dataLayout.GetHeightIndex()] + h) * shape[channelDimension3D] + c;
+    }
+    // 4D Tensors
+    else if (shape.GetNumDimensions() == 4)
+    {
+        if (dataLayout.GetDataLayout() == DataLayout::NHWC)
+        {
+            return ((b * shape[dataLayout.GetHeightIndex()] + h) * shape[dataLayout.GetWidthIndex()] + w) *
+                   shape[dataLayout.GetChannelsIndex()] + c;
+        }
+        else
+        {
+            return ((b * shape[dataLayout.GetChannelsIndex()] + c) * shape[dataLayout.GetHeightIndex()] + h) *
+                   shape[dataLayout.GetWidthIndex()] + w;
+        }
     }
     else
     {
-        return ((b * shape[dataLayout.GetChannelsIndex()] + c) * shape[dataLayout.GetHeightIndex()] + h) *
-               shape[dataLayout.GetWidthIndex()] + w;
+        throw InvalidArgumentException("Tensor rank must be either 3 or 4", CHECK_LOCATION());
     }
 }
 
@@ -37,37 +51,46 @@
                     Decoder<float>& inputData,
                     Encoder<float>& outputData)
 {
+    unsigned int rank = inputInfo.GetNumDimensions();
+    if (rank != 3 && rank != 4)
+    {
+        throw InvalidArgumentException("Tensor rank must be either 3 or 4, but it is " + std::to_string(rank),
+                                       CHECK_LOCATION());
+    }
+
     DataLayoutIndexed dataLayout = params.m_DataLayout;
+    unsigned int channelDimension3D = params.m_DataLayout == DataLayout::NCHW ? 1 : 2;
 
     const TensorShape& inputShape = inputInfo.GetShape();
     const TensorShape& outputShape = outputInfo.GetShape();
 
-    const unsigned int channels = inputShape[dataLayout.GetChannelsIndex()];
-
-    const unsigned int inputBatchSize = inputShape[0];
-    const unsigned int inputHeight = inputShape[dataLayout.GetHeightIndex()];
-    const unsigned int inputWidth = inputShape[dataLayout.GetWidthIndex()];
-
+    const unsigned int inputBatchSize  = inputShape[0];
     const unsigned int outputBatchSize = outputShape[0];
+
+    const unsigned int channels = (rank == 3) ? inputShape[channelDimension3D]
+                                              : inputShape[dataLayout.GetChannelsIndex()];
+
+    const unsigned int inputHeight  = inputShape[dataLayout.GetHeightIndex()];
+    const unsigned int inputWidth   = (rank == 3) ? 1 : inputShape[dataLayout.GetWidthIndex()];
     const unsigned int outputHeight = outputShape[dataLayout.GetHeightIndex()];
-    const unsigned int outputWidth = outputShape[dataLayout.GetWidthIndex()];
+    const unsigned int outputWidth  = (rank == 3) ? 1 : outputShape[dataLayout.GetWidthIndex()];
 
     const unsigned int blockHeight = params.m_BlockShape[0];
-    const unsigned int blockWidth = params.m_BlockShape[1];
+    const unsigned int blockWidth  = (rank == 3) ? 1 : params.m_BlockShape[1];
 
-    const unsigned int paddingTop = params.m_PadList[0].first;
-    const unsigned int paddingLeft = params.m_PadList[1].first;
+    const unsigned int paddingTop  = params.m_PadList[0].first;
+    const unsigned int paddingLeft = (rank == 3) ? 0 : params.m_PadList[1].first;
 
-    for (unsigned int outB = 0; outB < outputBatchSize; outB++)
+    for (unsigned int outB = 0; outB < outputBatchSize; ++outB)
     {
         unsigned int inB = outB % inputBatchSize;
 
         unsigned int shiftW = (outB / inputBatchSize) % blockWidth;
         unsigned int shiftH = (outB / inputBatchSize) / blockWidth;
 
-        for (unsigned int outH = 0; outH < outputHeight; outH++)
+        for (unsigned int outH = 0; outH < outputHeight; ++outH)
         {
-            for (unsigned int outW = 0; outW < outputWidth; outW++)
+            for (unsigned int outW = 0; outW < outputWidth; ++outW)
             {
                 if (outH * blockHeight + shiftH < paddingTop ||
                     outH * blockHeight + shiftH >= paddingTop + inputHeight ||
@@ -117,10 +140,4 @@
     }
 }
 
-void SpaceToBatchNd(const TensorInfo& inputInfo,
-                    const TensorInfo& outputInfo,
-                    const SpaceToBatchNdDescriptor& params,
-                    Decoder<float>& inputData,
-                    Encoder<float>& outData);
-
 } //namespace armnn
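
As in BatchToSpaceNd, the 3D path here is the 4D path with blockWidth,
paddingLeft and the width loop held at their neutral values (1, 0, and a
single iteration). A minimal sketch of the resulting element movement for the
3D unit-test data (plain C++, the inverse of the batch-to-space interleave
shown at the top of this patch):

    #include <cstdio>
    #include <vector>

    int main()
    {
        // 3D NHC input [1, 8, 1] with blockShape {4}, zero padding -> output [4, 2, 1].
        std::vector<float> in = {1, 3, 5, 7, 2, 4, 6, 8};
        const unsigned int batches = 4, spatial = 2, block = 4;

        std::vector<float> out(batches * spatial);
        for (unsigned int b = 0; b < batches; ++b)
        {
            for (unsigned int s = 0; s < spatial; ++s)
            {
                out[b * spatial + s] = in[s * block + b]; // gather every block-th element
            }
        }
        for (float v : out) { std::printf("%.0f ", v); } // 1 2 3 4 5 6 7 8
        std::printf("\n");
    }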
diff --git a/src/backends/reference/workloads/SpaceToBatchNd.hpp b/src/backends/reference/workloads/SpaceToBatchNd.hpp
index 57c9b6b..7de34ee 100644
--- a/src/backends/reference/workloads/SpaceToBatchNd.hpp
+++ b/src/backends/reference/workloads/SpaceToBatchNd.hpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd. All rights reserved.
+// Copyright © 2017-2019,2023 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
@@ -10,7 +10,6 @@
 #include "Encoders.hpp"
 
 #include <armnn/Descriptors.hpp>
-#include "armnn/Tensor.hpp"
 
 namespace armnn
 {