IVGCVSW-6856 Add GATHERNd FrontEnd and Ref Implementation

* Add front end
* Add reference workload
* Add unit tests
* Add EndToEnd test

Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: I4cebd17b18476df86162e2dda3366c10e80bd2f8
diff --git a/Android.mk b/Android.mk
index b91fd9c..0dd593a 100644
--- a/Android.mk
+++ b/Android.mk
@@ -224,6 +224,7 @@
         src/armnn/layers/FloorLayer.cpp \
         src/armnn/layers/FullyConnectedLayer.cpp \
         src/armnn/layers/GatherLayer.cpp \
+        src/armnn/layers/GatherNdLayer.cpp \
         src/armnn/layers/InputLayer.cpp \
         src/armnn/layers/InstanceNormalizationLayer.cpp \
         src/armnn/layers/L2NormalizationLayer.cpp \
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 60e0e52..d3d1fac 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -244,6 +244,8 @@
     src/armnn/layers/FullyConnectedLayer.cpp
     src/armnn/layers/GatherLayer.cpp
     src/armnn/layers/GatherLayer.hpp
+    src/armnn/layers/GatherNdLayer.cpp
+    src/armnn/layers/GatherNdLayer.hpp
     src/armnn/layers/InputLayer.hpp
     src/armnn/layers/InputLayer.cpp
     src/armnn/layers/InstanceNormalizationLayer.hpp
diff --git a/docs/02_operator_list.dox b/docs/02_operator_list.dox
index 2d16818..b29d56f 100644
--- a/docs/02_operator_list.dox
+++ b/docs/02_operator_list.dox
@@ -1455,6 +1455,51 @@
     <tr><td>All
     </table>
 <tr>
+  <td rowspan="3">GatherNdLayer
+  <td rowspan="3" style="width:200px;"> Layer to perform the GatherNd operation.
+  <td rowspan="3">
+      <ul>
+       <li>N/A
+      </ul>
+   <td>CpuRef
+     <td>
+         <ul>
+          <li>All
+         </ul>
+     <td>
+      <table>
+       <tr><th>
+       <tr><td>BFLOAT16
+       <tr><td>FLOAT16
+       <tr><td>FLOAT32
+       <tr><td>QASYMMS8
+       <tr><td>QASYMMU8
+       <tr><td>QSYMMS16
+       <tr><td>SIGNED32
+      </table>
+<tr>
+  <td>CpuAcc
+  <td>
+      <ul>
+       <li>TBD
+      </ul>
+  <td>
+    <table>
+    <tr><th>
+    <tr><td>TBD
+    </table>
+<tr>
+  <td>GpuAcc
+  <td>
+      <ul>
+       <li>TBD
+      </ul>
+  <td>
+    <table>
+    <tr><th>
+    <tr><td>TBD
+    </table>
+<tr>
   <td rowspan="1">InputLayer
   <td rowspan="1" style="width:200px;"> Special layer used to provide input data to the computational network.
   <td rowspan="1">
diff --git a/include/armnn/BackendHelper.hpp b/include/armnn/BackendHelper.hpp
index 0c625a6..4772ca9 100644
--- a/include/armnn/BackendHelper.hpp
+++ b/include/armnn/BackendHelper.hpp
@@ -185,6 +185,11 @@
                            const GatherDescriptor& descriptor,
                            Optional<std::string&> reasonIfUnsupported = EmptyOptional());
 
+    bool IsGatherNdSupported(const TensorInfo& input0,
+                             const TensorInfo& input1,
+                             const TensorInfo& output,
+                             Optional<std::string&> reasonIfUnsupported = EmptyOptional());
+
     bool IsInputSupported(const TensorInfo& input,
                           Optional<std::string&> reasonIfUnsupported = EmptyOptional());
 
diff --git a/include/armnn/INetwork.hpp b/include/armnn/INetwork.hpp
index eaec973..7488fdc 100644
--- a/include/armnn/INetwork.hpp
+++ b/include/armnn/INetwork.hpp
@@ -651,6 +651,11 @@
     IConnectableLayer* AddGatherLayer(const GatherDescriptor& descriptor,
                                               const char* name = nullptr);
 
+    /// Add GatherNd layer to the network.
+    /// @param name - Optional name for the layer.
+    /// @return - Interface for configuring the layer.
+    IConnectableLayer* AddGatherNdLayer(const char* name = nullptr);
+
     /// Adds a switch layer to the network.
     /// @param name - Optional name for the layer.
     /// @return - Interface for configuring the layer.
diff --git a/include/armnn/Types.hpp b/include/armnn/Types.hpp
index a804f55..cc704a6 100644
--- a/include/armnn/Types.hpp
+++ b/include/armnn/Types.hpp
@@ -458,6 +458,7 @@
     X(ChannelShuffle) \
     X(Convolution3d) \
     X(Pooling3d) \
+    X(GatherNd) \
 
 // New layers should be added at last to minimize instability.
 
diff --git a/include/armnn/backends/WorkloadData.hpp b/include/armnn/backends/WorkloadData.hpp
index 2114158..ed89f96 100644
--- a/include/armnn/backends/WorkloadData.hpp
+++ b/include/armnn/backends/WorkloadData.hpp
@@ -527,6 +527,11 @@
     void Validate(const WorkloadInfo& workloadInfo) const;
 };
 
+struct GatherNdQueueDescriptor : QueueDescriptor
+{
+    void Validate(const WorkloadInfo& workloadInfo) const;
+};
+
 struct GatherQueueDescriptor : QueueDescriptorWithParameters<GatherDescriptor>
 {
     void Validate(const WorkloadInfo& workloadInfo) const;
diff --git a/src/armnn/BackendHelper.cpp b/src/armnn/BackendHelper.cpp
index e5c9759..056fbb0 100644
--- a/src/armnn/BackendHelper.cpp
+++ b/src/armnn/BackendHelper.cpp
@@ -655,6 +655,21 @@
                                             reasonIfUnsupported);
 }
 
+bool LayerSupportHandle::IsGatherNdSupported(const TensorInfo& input0,
+                                             const TensorInfo& input1,
+                                             const TensorInfo& output,
+                                             Optional<std::string&> reasonIfUnsupported)
+{
+    TensorInfos infos{input0, input1, output};
+
+    return m_LayerSupport->IsLayerSupported(LayerType::GatherNd,
+                                            infos,
+                                            BaseDescriptor(),
+                                            EmptyOptional(),
+                                            EmptyOptional(),
+                                            reasonIfUnsupported);
+}
+
 bool LayerSupportHandle::IsInputSupported(const TensorInfo& input,
                                           Optional<std::string&> reasonIfUnsupported)
 {
diff --git a/src/armnn/LayersFwd.hpp b/src/armnn/LayersFwd.hpp
index 607c83b..dcfb91b 100644
--- a/src/armnn/LayersFwd.hpp
+++ b/src/armnn/LayersFwd.hpp
@@ -34,6 +34,7 @@
 #include "layers/FloorLayer.hpp"
 #include "layers/FullyConnectedLayer.hpp"
 #include "layers/GatherLayer.hpp"
+#include "layers/GatherNdLayer.hpp"
 #include "layers/InputLayer.hpp"
 #include "layers/InstanceNormalizationLayer.hpp"
 #include "layers/L2NormalizationLayer.hpp"
@@ -134,6 +135,7 @@
 DECLARE_LAYER(Floor)
 DECLARE_LAYER(FullyConnected)
 DECLARE_LAYER(Gather)
+DECLARE_LAYER(GatherNd)
 DECLARE_LAYER(Input)
 DECLARE_LAYER(InstanceNormalization)
 DECLARE_LAYER(L2Normalization)
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index a365550..226d478 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -379,6 +379,11 @@
     return pNetworkImpl->AddGatherLayer(descriptor, name);
 }
 
+IConnectableLayer* INetwork::AddGatherNdLayer(const char* name)
+{
+    return pNetworkImpl->AddGatherNdLayer(name);
+}
+
 IConnectableLayer* INetwork::AddSwitchLayer(const char* name)
 {
     return pNetworkImpl->AddSwitchLayer(name);
@@ -2442,17 +2447,22 @@
 }
 
 IConnectableLayer* NetworkImpl::AddStridedSliceLayer(const StridedSliceDescriptor& stridedSliceDescriptor,
-                                                 const char* name)
+                                                     const char* name)
 {
     return m_Graph->AddLayer<StridedSliceLayer>(stridedSliceDescriptor, name);
 }
 
 IConnectableLayer* NetworkImpl::AddGatherLayer(const GatherDescriptor& gatherDescriptor,
-                                           const char* name)
+                                               const char* name)
 {
     return m_Graph->AddLayer<GatherLayer>(gatherDescriptor, name);
 }
 
+IConnectableLayer* NetworkImpl::AddGatherNdLayer(const char* name)
+{
+    return m_Graph->AddLayer<GatherNdLayer>(name);
+}
+
 IConnectableLayer* NetworkImpl::AddMergeLayer(const char* name)
 {
     return m_Graph->AddLayer<MergeLayer>(name);
diff --git a/src/armnn/Network.hpp b/src/armnn/Network.hpp
index fffad86..6e4d29e 100644
--- a/src/armnn/Network.hpp
+++ b/src/armnn/Network.hpp
@@ -130,6 +130,8 @@
     IConnectableLayer* AddGatherLayer(const GatherDescriptor& gatherDescriptor,
                                       const char* name = nullptr);
 
+    IConnectableLayer* AddGatherNdLayer(const char* name = nullptr);
+
     IConnectableLayer* AddInstanceNormalizationLayer(const InstanceNormalizationDescriptor& desc,
                                                      const char* name = nullptr);
 
diff --git a/src/armnn/layers/GatherNdLayer.cpp b/src/armnn/layers/GatherNdLayer.cpp
new file mode 100644
index 0000000..1ca2cbb
--- /dev/null
+++ b/src/armnn/layers/GatherNdLayer.cpp
@@ -0,0 +1,104 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GatherNdLayer.hpp"
+#include "LayerCloneBase.hpp"
+
+#include <armnn/TypesUtils.hpp>
+#include <armnn/backends/WorkloadData.hpp>
+#include <armnn/backends/WorkloadFactory.hpp>
+
+namespace armnn
+{
+
+GatherNdLayer::GatherNdLayer(const char* name)
+    : Layer(2, 1, LayerType::GatherNd, name)
+{
+}
+
+std::unique_ptr<IWorkload> GatherNdLayer::CreateWorkload(const armnn::IWorkloadFactory& factory) const
+{
+    GatherNdQueueDescriptor descriptor;
+    SetAdditionalInfo(descriptor);
+
+    return factory.CreateWorkload(LayerType::GatherNd, descriptor, PrepInfoAndDesc(descriptor));
+}
+
+GatherNdLayer* GatherNdLayer::Clone(Graph& graph) const
+{
+    return CloneBase<GatherNdLayer>(graph, GetName());
+}
+
+std::vector<TensorShape> GatherNdLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const
+{
+    ARMNN_ASSERT(inputShapes.size() == 2);
+    const TensorShape& params = inputShapes[0];
+    const TensorShape& indices = inputShapes[1];
+
+    if (indices.GetDimensionality() == Dimensionality::Scalar && indices.GetNumDimensions() == 1)
+    {
+         return std::vector<TensorShape>({ TensorShape(Dimensionality::Scalar)});
+    }
+
+    const unsigned int paramsDim = params.GetNumDimensions();
+    const unsigned int indicesDim = indices.GetNumDimensions();
+
+    // last dimension of indices
+    unsigned int index_depth = indices[indicesDim - 1];
+    ARMNN_ASSERT(index_depth <= paramsDim);
+
+    // all but the last dimension of indices
+    std::vector<unsigned int> outer_shape;
+    outer_shape.reserve(indicesDim - 1);
+    for (unsigned int i = 0; i < indicesDim - 1; ++i)
+    {
+        outer_shape.emplace_back(indices[i]);
+    }
+
+    // elements after index_depth
+    std::vector<unsigned int> inner_shape;
+    inner_shape.reserve(paramsDim - index_depth);
+    for (unsigned int i = index_depth; i < paramsDim; ++i)
+    {
+        inner_shape.emplace_back(params[i]);
+    }
+
+    // concatenate outer_shape + inner_shape
+    std::vector<unsigned int> output_shape;
+    output_shape.reserve( outer_shape.size() + inner_shape.size() );
+    output_shape.insert( output_shape.end(), outer_shape.begin(), outer_shape.end() );
+    output_shape.insert( output_shape.end(), inner_shape.begin(), inner_shape.end() );
+
+    const auto outputDim = static_cast<unsigned int>(output_shape.size());
+    return std::vector<TensorShape>({ TensorShape({outputDim, output_shape.data()})});
+}
+
+void GatherNdLayer::ValidateTensorShapesFromInputs()
+{
+    VerifyLayerConnections(2, CHECK_LOCATION());
+
+    const TensorShape& outputShape = GetOutputSlot(0).GetTensorInfo().GetShape();
+
+    VerifyShapeInferenceType(outputShape, m_ShapeInferenceMethod);
+
+    std::vector<TensorShape> inferredShapes = InferOutputShapes(
+            {GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(),
+             GetInputSlot(1).GetConnection()->GetTensorInfo().GetShape()});
+    ARMNN_ASSERT(inferredShapes.size() == 1);
+    ARMNN_ASSERT(inferredShapes[0].GetDimensionality() == Dimensionality::Specified ||
+                 inferredShapes[0].GetDimensionality() == Dimensionality::Scalar);
+
+    ValidateAndCopyShape(outputShape, inferredShapes[0], m_ShapeInferenceMethod, "GatherNdLayer");
+}
+
+ARMNN_NO_DEPRECATE_WARN_BEGIN
+void GatherNdLayer::Accept(ILayerVisitor& visitor) const
+{
+    IgnoreUnused(visitor);
+    throw armnn::Exception("GatherNdLayer VisitGatherNdLayer is not implemented");
+}
+ARMNN_NO_DEPRECATE_WARN_END
+
+} // namespace armnn
diff --git a/src/armnn/layers/GatherNdLayer.hpp b/src/armnn/layers/GatherNdLayer.hpp
new file mode 100644
index 0000000..9e07715
--- /dev/null
+++ b/src/armnn/layers/GatherNdLayer.hpp
@@ -0,0 +1,48 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "Layer.hpp"
+
+namespace armnn
+{
+
+/// This layer represents a GatherNd operator.
+class GatherNdLayer : public Layer
+{
+public:
+    /// Makes a workload for the GatherNd type.
+    /// @param [in] factory The workload factory which will create the workload.
+    /// @return A pointer to the created workload, or nullptr if not created.
+    virtual std::unique_ptr<IWorkload> CreateWorkload(const IWorkloadFactory& factory) const override;
+
+    /// Creates a dynamically-allocated copy of this layer.
+    /// @param [in] graph The graph into which this layer is being cloned.
+    GatherNdLayer* Clone(Graph& graph) const override;
+
+    /// Infers the output shapes from given input shapes and layer properties.
+    /// @param [in] inputShapes The input shapes layer has.
+    /// @return A vector to the inferred output shape.
+    std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override;
+
+    /// Check if the input tensor shape(s)
+    /// will lead to a valid configuration of @ref GatherNdLayer.
+    void ValidateTensorShapesFromInputs() override;
+
+    ARMNN_NO_DEPRECATE_WARN_BEGIN
+    void Accept(ILayerVisitor& visitor) const override;
+    ARMNN_NO_DEPRECATE_WARN_END
+
+protected:
+    /// Constructor to create a GatherNdLayer.
+    /// @param [in] name Optional name for the layer.
+    GatherNdLayer(const char* name);
+
+    /// Default destructor
+    ~GatherNdLayer() = default;
+};
+
+} // namespace armnn
diff --git a/src/backends/backendsCommon/WorkloadData.cpp b/src/backends/backendsCommon/WorkloadData.cpp
index fc48ffc..962ecde 100644
--- a/src/backends/backendsCommon/WorkloadData.cpp
+++ b/src/backends/backendsCommon/WorkloadData.cpp
@@ -2718,6 +2718,41 @@
     ValidateTensorDataTypesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
 }
 
+void GatherNdQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
+{
+    const std::string descriptorName{"GatherNdQueueDescriptor"};
+
+    ValidateNumInputs(workloadInfo,  descriptorName, 2);
+    ValidateNumOutputs(workloadInfo, descriptorName, 1);
+
+    const TensorInfo& indicesTensorInfo = workloadInfo.m_InputTensorInfos[1];
+    if (indicesTensorInfo.GetDataType() != DataType::Signed32)
+    {
+        throw InvalidArgumentException(descriptorName + ": Indices tensor type must be Int32.");
+    }
+
+    const TensorInfo& inputTensorInfo  = workloadInfo.m_InputTensorInfos[0];
+    const TensorInfo& outputTensorInfo = workloadInfo.m_OutputTensorInfos[0];
+
+    std::vector<DataType> supportedTypes =
+            {
+                    DataType::BFloat16,
+                    DataType::Float16,
+                    DataType::Float32,
+                    DataType::QAsymmS8,
+                    DataType::QAsymmU8,
+                    DataType::QSymmS16,
+                    DataType::Signed32,
+            };
+
+    ValidateDataTypes(inputTensorInfo, supportedTypes, descriptorName);
+
+    ValidateTensorDataTypesMatch(inputTensorInfo, outputTensorInfo, descriptorName, "input", "output");
+
+    unsigned int outputDim  = outputTensorInfo.GetNumDimensions();
+    ValidateTensorNumDimensions(outputTensorInfo, descriptorName, outputDim, "output");
+}
+
 void GatherQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const
 {
     const std::string descriptorName{"GatherQueueDescriptor"};
diff --git a/src/backends/backendsCommon/WorkloadFactory.cpp b/src/backends/backendsCommon/WorkloadFactory.cpp
index 090e285..f955aec 100644
--- a/src/backends/backendsCommon/WorkloadFactory.cpp
+++ b/src/backends/backendsCommon/WorkloadFactory.cpp
@@ -509,6 +509,17 @@
                                                           reason);
             break;
         }
+        case LayerType::GatherNd:
+        {
+            const TensorInfo& input0 = layer.GetInputSlot(0).GetConnection()->GetTensorInfo();
+            const TensorInfo& input1 = layer.GetInputSlot(1).GetConnection()->GetTensorInfo();
+            const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo();
+            result = layerSupportObject.IsGatherNdSupported(OverrideDataType(input0, dataType),
+                                                            input1,
+                                                            OverrideDataType(output, dataType),
+                                                            reason);
+            break;
+        }
         case LayerType::Input:
         {
             const TensorInfo& input = layer.GetOutputSlot(0).GetTensorInfo();
diff --git a/src/backends/backendsCommon/WorkloadUtils.cpp b/src/backends/backendsCommon/WorkloadUtils.cpp
index fcdad3e..d2ae16a 100644
--- a/src/backends/backendsCommon/WorkloadUtils.cpp
+++ b/src/backends/backendsCommon/WorkloadUtils.cpp
@@ -10,6 +10,7 @@
 #include <armnnUtils/DataLayoutIndexed.hpp>
 
 #include <fmt/format.h>
+#include <numeric>
 
 namespace armnn
 {
@@ -294,4 +295,48 @@
     return reversedMask;
 }
 
+std::map<std::string, unsigned int> CalculateGatherNdKeyIndices(TensorInfo inputInfo0, TensorInfo inputInfo1)
+{
+    std::vector<unsigned int> paramsShape;
+    for (unsigned int i = 0; i < inputInfo0.GetNumDimensions(); ++i)
+    {
+        paramsShape.push_back(inputInfo0.GetShape()[i]);
+    }
+
+    std::vector<unsigned int> indicesShape;
+    for (unsigned int i = 0; i < inputInfo1.GetNumDimensions(); ++i)
+    {
+        indicesShape.push_back(inputInfo1.GetShape()[i]);
+    }
+
+    std::map<std::string, unsigned int> keyIndices;
+
+    // N: number of batches
+    keyIndices["N"] = 1;
+
+    // ND: number of dimensions that are sliced from params
+    keyIndices["ND"] = indicesShape.back();
+
+    // W: number of indices in each batch (all but the last dimension)
+    keyIndices["W"] =
+        static_cast<unsigned int>(std::accumulate(std::begin(indicesShape),
+                                                  std::end(indicesShape) - 1,
+                                                  1,
+                                                  std::multiplies<>() ));
+    // K: range of each index
+    keyIndices["K"] =
+        static_cast<unsigned int>(std::accumulate(std::begin(paramsShape),
+                                                  std::begin(paramsShape) + static_cast<int>(keyIndices["ND"]),
+                                                  1,
+                                                  std::multiplies<>() ));
+    //  C: number of channels for each index
+    keyIndices["C"] =
+        static_cast<unsigned int>(std::accumulate(std::begin(paramsShape) + static_cast<int>(keyIndices["ND"]),
+                                                  std::end(paramsShape),
+                                                  1,
+                                                  std::multiplies<>() ));
+
+    return keyIndices;
+}
+
 } // namespace armnn
diff --git a/src/backends/backendsCommon/WorkloadUtils.hpp b/src/backends/backendsCommon/WorkloadUtils.hpp
index 2f1c5c4..0e54873 100644
--- a/src/backends/backendsCommon/WorkloadUtils.hpp
+++ b/src/backends/backendsCommon/WorkloadUtils.hpp
@@ -251,4 +251,11 @@
                                                         const DataLayout& dataLayout,
                                                         void* permuteBuffer);
 
+/// Calculates the key index values needed for GatherNd: N, ND, K, W, C (N is always 1)
+///
+/// \param inputInfo0 - TensorInfo of the corresponding input tensor: params
+/// \param inputInfo1 - TensorInfo of the corresponding input tensor: indices
+/// \return - A map with names and values for N, ND, K, W, C
+std::map<std::string, unsigned int> CalculateGatherNdKeyIndices(TensorInfo inputInfo0, TensorInfo inputInfo1);
+
 }  //namespace armnn
diff --git a/src/backends/backendsCommon/common.mk b/src/backends/backendsCommon/common.mk
index 8f97669..1f42a5c 100644
--- a/src/backends/backendsCommon/common.mk
+++ b/src/backends/backendsCommon/common.mk
@@ -68,6 +68,7 @@
     test/layerTests/FillTestImpl.cpp \
     test/layerTests/FloorTestImpl.cpp \
     test/layerTests/FullyConnectedTestImpl.cpp \
+    test/layerTests/GatherNdTestImpl.cpp \
     test/layerTests/GatherTestImpl.cpp \
     test/layerTests/InstanceNormalizationTestImpl.cpp \
     test/layerTests/L2NormalizationTestImpl.cpp \
diff --git a/src/backends/backendsCommon/test/CMakeLists.txt b/src/backends/backendsCommon/test/CMakeLists.txt
index 8ec65b3..06d230b 100644
--- a/src/backends/backendsCommon/test/CMakeLists.txt
+++ b/src/backends/backendsCommon/test/CMakeLists.txt
@@ -30,6 +30,7 @@
     FillEndToEndTestImpl.hpp
     FullyConnectedEndToEndTestImpl.hpp
     GatherEndToEndTestImpl.hpp
+    GatherNdEndToEndTestImpl.hpp
     InstanceNormalizationEndToEndTestImpl.cpp
     InstanceNormalizationEndToEndTestImpl.hpp
     IsLayerSupportedTestImpl.hpp
@@ -113,6 +114,8 @@
     layerTests/FloorTestImpl.hpp
     layerTests/FullyConnectedTestImpl.cpp
     layerTests/FullyConnectedTestImpl.hpp
+    layerTests/GatherNdTestImpl.cpp
+    layerTests/GatherNdTestImpl.hpp
     layerTests/GatherTestImpl.cpp
     layerTests/GatherTestImpl.hpp
     layerTests/InstanceNormalizationTestImpl.cpp
diff --git a/src/backends/backendsCommon/test/GatherNdEndToEndTestImpl.hpp b/src/backends/backendsCommon/test/GatherNdEndToEndTestImpl.hpp
new file mode 100644
index 0000000..0eea911
--- /dev/null
+++ b/src/backends/backendsCommon/test/GatherNdEndToEndTestImpl.hpp
@@ -0,0 +1,161 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <CommonTestUtils.hpp>
+
+#include <armnn/INetwork.hpp>
+#include <ResolveType.hpp>
+
+#include <doctest/doctest.h>
+
+namespace{
+
+armnn::INetworkPtr CreateGatherNdNetwork(const armnn::TensorInfo& paramsInfo,
+                                         const armnn::TensorInfo& indicesInfo,
+                                         const armnn::TensorInfo& outputInfo,
+                                         const std::vector<int32_t>& indicesData)
+{
+    armnn::INetworkPtr net(armnn::INetwork::Create());
+
+    armnn::IConnectableLayer* paramsLayer = net->AddInputLayer(0);
+    armnn::IConnectableLayer* indicesLayer = net->AddConstantLayer(armnn::ConstTensor(indicesInfo, indicesData));
+    armnn::IConnectableLayer* gatherNdLayer = net->AddGatherNdLayer("gatherNd");
+    armnn::IConnectableLayer* outputLayer = net->AddOutputLayer(0, "output");
+    Connect(paramsLayer, gatherNdLayer, paramsInfo, 0, 0);
+    Connect(indicesLayer, gatherNdLayer, indicesInfo, 0, 1);
+    Connect(gatherNdLayer, outputLayer, outputInfo, 0, 0);
+
+    return net;
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+void GatherNdEndToEnd(const std::vector<BackendId>& backends)
+{
+    armnn::TensorInfo paramsInfo({ 2, 3, 8, 4 }, ArmnnType);
+    armnn::TensorInfo indicesInfo({ 2, 2 }, armnn::DataType::Signed32);
+    armnn::TensorInfo outputInfo({ 2, 8, 4 }, ArmnnType);
+
+    paramsInfo.SetQuantizationScale(1.0f);
+    paramsInfo.SetQuantizationOffset(0);
+    paramsInfo.SetConstant(true);
+    indicesInfo.SetConstant(true);
+    outputInfo.SetQuantizationScale(1.0f);
+    outputInfo.SetQuantizationOffset(0);
+
+    // Creates structures for input & output.
+    std::vector<T> paramsData{
+             0,   1,   2,   3, 4,   5,   6,   7, 8,   9,  10,  11, 12,  13,  14,  15,
+            16,  17,  18,  19, 20,  21,  22,  23, 24,  25,  26,  27, 28,  29,  30,  31,
+
+            32,  33,  34,  35, 36,  37,  38,  39, 40,  41,  42,  43, 44,  45,  46,  47,
+            48,  49,  50,  51, 52,  53,  54,  55, 56,  57,  58,  59, 60,  61,  62,  63,
+
+            64,  65,  66,  67, 68,  69,  70,  71, 72,  73,  74,  75, 76,  77,  78,  79,
+            80,  81,  82,  83, 84,  85,  86,  87, 88,  89,  90,  91, 92,  93,  94,  95,
+
+            96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+            112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+
+            128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+            144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+
+            160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+            176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191
+    };
+
+    std::vector<int32_t> indicesData{
+            { 1, 2, 1, 1},
+    };
+
+    std::vector<T> expectedOutput{
+        160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+        176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+
+        128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+        144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159
+    };
+
+    // Builds up the structure of the network
+    armnn::INetworkPtr net = CreateGatherNdNetwork(paramsInfo, indicesInfo, outputInfo, indicesData);
+
+    CHECK(net);
+
+    std::map<int, std::vector<T>> inputTensorData = {{ 0, paramsData }};
+    std::map<int, std::vector<T>> expectedOutputData = {{ 0, expectedOutput }};
+
+    EndToEndLayerTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends);
+}
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+void GatherNdMultiDimEndToEnd(const std::vector<BackendId>& backends)
+{
+    armnn::TensorInfo paramsInfo({ 5, 5, 2 }, ArmnnType);
+    armnn::TensorInfo indicesInfo({ 2, 2, 3, 2 }, armnn::DataType::Signed32);
+    armnn::TensorInfo outputInfo({ 2, 2, 3, 2 }, ArmnnType);
+
+    paramsInfo.SetQuantizationScale(1.0f);
+    paramsInfo.SetQuantizationOffset(0);
+    paramsInfo.SetConstant(true);
+    indicesInfo.SetConstant(true);
+    outputInfo.SetQuantizationScale(1.0f);
+    outputInfo.SetQuantizationOffset(0);
+
+    // Creates structures for input & output.
+    std::vector<T> paramsData{
+            0,  1,    2,  3,    4,  5,    6,  7,    8,  9,
+            10, 11,   12,  13,   14, 15,   16, 17,   18, 19,
+            20, 21,   22,  23,   24, 25,   26, 27,   28, 29,
+            30, 31,   32,  33,   34, 35,   36, 37,   38, 39,
+            40, 41,   42,  43,   44, 45,   46, 47,   48, 49
+    };
+
+    std::vector<int32_t> indicesData{
+            0, 0,
+            3, 3,
+            4, 4,
+
+            0, 0,
+            1, 1,
+            2, 2,
+
+            4, 4,
+            3, 3,
+            0, 0,
+
+            2, 2,
+            1, 1,
+            0, 0
+    };
+
+    std::vector<T> expectedOutput{
+            0,  1,
+            36, 37,
+            48, 49,
+
+            0,  1,
+            12, 13,
+            24, 25,
+
+            48, 49,
+            36, 37,
+            0,  1,
+
+            24, 25,
+            12, 13,
+            0,  1
+    };
+
+    // Builds up the structure of the network
+    armnn::INetworkPtr net = CreateGatherNdNetwork(paramsInfo, indicesInfo, outputInfo, indicesData);
+
+    std::map<int, std::vector<T>> inputTensorData = {{ 0, paramsData }};
+    std::map<int, std::vector<T>> expectedOutputData = {{ 0, expectedOutput }};
+
+    EndToEndLayerTestImpl<ArmnnType, ArmnnType>(move(net), inputTensorData, expectedOutputData, backends);
+}
+
+} // anonymous namespace
diff --git a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
index 06f3eb5..ba8cfd5 100644
--- a/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
+++ b/src/backends/backendsCommon/test/IsLayerSupportedTestImpl.hpp
@@ -666,6 +666,8 @@
 
 DECLARE_LAYER_POLICY_2_PARAM(Gather)
 
+DECLARE_LAYER_POLICY_1_PARAM(GatherNd)
+
 DECLARE_LAYER_POLICY_CUSTOM_PARAM(Input, armnn::LayerBindingId)
 
 DECLARE_LAYER_POLICY_2_PARAM(InstanceNormalization)
diff --git a/src/backends/backendsCommon/test/LayerTests.hpp b/src/backends/backendsCommon/test/LayerTests.hpp
index 6bd2943..e30cf2b 100644
--- a/src/backends/backendsCommon/test/LayerTests.hpp
+++ b/src/backends/backendsCommon/test/LayerTests.hpp
@@ -33,6 +33,7 @@
 #include <backendsCommon/test/layerTests/FillTestImpl.hpp>
 #include <backendsCommon/test/layerTests/FloorTestImpl.hpp>
 #include <backendsCommon/test/layerTests/FullyConnectedTestImpl.hpp>
+#include <backendsCommon/test/layerTests/GatherNdTestImpl.hpp>
 #include <backendsCommon/test/layerTests/GatherTestImpl.hpp>
 #include <backendsCommon/test/layerTests/InstanceNormalizationTestImpl.hpp>
 #include <backendsCommon/test/layerTests/L2NormalizationTestImpl.hpp>
diff --git a/src/backends/backendsCommon/test/layerTests/GatherNdTestImpl.cpp b/src/backends/backendsCommon/test/layerTests/GatherNdTestImpl.cpp
new file mode 100644
index 0000000..57a30c6
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/GatherNdTestImpl.cpp
@@ -0,0 +1,300 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "GatherNdTestImpl.hpp"
+
+#include <DataTypeUtils.hpp>
+#include <armnnTestUtils/TensorCopyUtils.hpp>
+#include <armnnTestUtils/WorkloadTestUtils.hpp>
+
+namespace
+{
+
+// Runs a single GatherNd workload on the given backend and returns the
+// actual vs expected comparison result.
+// ParamsDim / IndicesDim / OutputDim are the tensor ranks of the params,
+// indices and output tensors respectively; T resolves from ArmnnType.
+// indicesData must be Signed32, matching the GatherNd indices contract.
+template<armnn::DataType ArmnnType,
+        typename T = armnn::ResolveType<ArmnnType>,
+        size_t ParamsDim,
+        size_t IndicesDim,
+        size_t OutputDim>
+LayerTestResult<T, OutputDim> GatherNdTestImpl(
+        armnn::IWorkloadFactory &workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr &memoryManager,
+        const armnn::ITensorHandleFactory &tensorHandleFactory,
+        const armnn::TensorInfo &paramsInfo,
+        const armnn::TensorInfo &indicesInfo,
+        const armnn::TensorInfo &outputInfo,
+        const std::vector<T> &paramsData,
+        const std::vector<int32_t> &indicesData,
+        const std::vector<T> &outputData)
+{
+    IgnoreUnused(memoryManager);
+
+    std::vector<T> actualOutput(outputInfo.GetNumElements());
+
+    std::unique_ptr<armnn::ITensorHandle> paramsHandle = tensorHandleFactory.CreateTensorHandle(paramsInfo);
+    std::unique_ptr<armnn::ITensorHandle> indicesHandle = tensorHandleFactory.CreateTensorHandle(indicesInfo);
+    std::unique_ptr<armnn::ITensorHandle> outputHandle = tensorHandleFactory.CreateTensorHandle(outputInfo);
+
+    // Build the queue descriptor: input 0 = params, input 1 = indices.
+    armnn::GatherNdQueueDescriptor data;
+    armnn::WorkloadInfo info;
+    AddInputToWorkload(data, info, paramsInfo, paramsHandle.get());
+    AddInputToWorkload(data, info, indicesInfo, indicesHandle.get());
+    AddOutputToWorkload(data, info, outputInfo, outputHandle.get());
+
+    std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateWorkload(armnn::LayerType::GatherNd,
+                                                                                data,
+                                                                                info);
+
+    paramsHandle->Allocate();
+    indicesHandle->Allocate();
+    outputHandle->Allocate();
+
+    CopyDataToITensorHandle(paramsHandle.get(), paramsData.data());
+    CopyDataToITensorHandle(indicesHandle.get(), indicesData.data());
+
+    workload->Execute();
+
+    CopyDataFromITensorHandle(actualOutput.data(), outputHandle.get());
+
+    // LayerTestResult compares actualOutput against the provided reference
+    // outputData, and the handle's shape against the expected output shape.
+    return LayerTestResult<T, OutputDim>(actualOutput,
+                                         outputData,
+                                         outputHandle->GetShape(),
+                                         outputInfo.GetShape());
+}
+} // anonymous namespace
+
+// GatherNd with 2-D params {5, 2} and indices {3, 1}: each 1-element index
+// tuple selects a whole row, so indices {1, 0, 4} yield rows 1, 0 and 4.
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 2> SimpleGatherNd2dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    armnn::TensorInfo paramsInfo({ 5, 2 }, ArmnnType);
+    armnn::TensorInfo indicesInfo({ 3, 1 }, armnn::DataType::Signed32);
+    armnn::TensorInfo outputInfo({ 3, 2 }, ArmnnType);
+    if (armnn::IsQuantizedType<T>())
+    {
+        // Identity quantization (scale 1, offset 1) so the integer test data
+        // passes through unchanged for quantized data types.
+        paramsInfo.SetQuantizationScale(1.0f);
+        paramsInfo.SetQuantizationOffset(1);
+        outputInfo.SetQuantizationScale(1.0f);
+        outputInfo.SetQuantizationOffset(1);
+    }
+    const std::vector<T> params = ConvertToDataType<ArmnnType>(
+            { 1, 2,
+              3, 4,
+              5, 6,
+              7, 8,
+              9, 10},
+            paramsInfo);
+    const std::vector<int32_t> indices  = ConvertToDataType<armnn::DataType::Signed32>(
+            { 1, 0, 4},
+            indicesInfo);
+    const std::vector<T> expectedOutput = ConvertToDataType<ArmnnType>(
+            { 3, 4,
+              1, 2,
+              9, 10},
+            outputInfo);
+    return GatherNdTestImpl<ArmnnType, T, 2, 2, 2>(
+            workloadFactory,
+            memoryManager,
+            tensorHandleFactory,
+            paramsInfo,
+            indicesInfo,
+            outputInfo,
+            params,
+            indices,
+            expectedOutput);
+}
+
+// GatherNd with 4-D params {2, 3, 8, 4} and indices {2, 2}: each 2-element
+// index tuple addresses the first two params dimensions, so each tuple
+// selects an {8, 4} slice; indices {1,2} and {1,1} give output {2, 8, 4}.
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 3> SimpleGatherNd3dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    armnn::TensorInfo paramsInfo({ 2, 3, 8, 4 }, ArmnnType);
+    armnn::TensorInfo indicesInfo({ 2, 2 }, armnn::DataType::Signed32);
+    armnn::TensorInfo outputInfo({ 2, 8, 4 }, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        // Identity quantization so integer test data is unchanged.
+        paramsInfo.SetQuantizationScale(1.0f);
+        paramsInfo.SetQuantizationOffset(0);
+        outputInfo.SetQuantizationScale(1.0f);
+        outputInfo.SetQuantizationOffset(0);
+    }
+    // Params hold 0..191 in row-major order, i.e. params[i][j] is the 32
+    // elements starting at (i*3 + j) * 32.
+    const std::vector<T> params = ConvertToDataType<ArmnnType>(
+            { 0,   1,   2,   3, 4,   5,   6,   7, 8,   9,  10,  11, 12,  13,  14,  15,
+             16,  17,  18,  19, 20,  21,  22,  23, 24,  25,  26,  27, 28,  29,  30,  31,
+
+             32,  33,  34,  35, 36,  37,  38,  39, 40,  41,  42,  43, 44,  45,  46,  47,
+             48,  49,  50,  51, 52,  53,  54,  55, 56,  57,  58,  59, 60,  61,  62,  63,
+
+             64,  65,  66,  67, 68,  69,  70,  71, 72,  73,  74,  75, 76,  77,  78,  79,
+             80,  81,  82,  83, 84,  85,  86,  87, 88,  89,  90,  91, 92,  93,  94,  95,
+
+             96,  97,  98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+            112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+
+            128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+            144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+
+            160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+            176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191 },
+            paramsInfo);
+
+    // Two index tuples: [1,2] -> slice starting at 160, [1,1] -> slice at 128.
+    const std::vector<int32_t> indices  = ConvertToDataType<armnn::DataType::Signed32>(
+            { 1, 2, 1, 1},
+            indicesInfo);
+
+    const std::vector<T> expectedOutput = ConvertToDataType<ArmnnType>(
+            { 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
+            176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+
+            128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
+            144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159},
+            outputInfo);
+
+    return GatherNdTestImpl<ArmnnType, T, 4, 2, 3>(
+            workloadFactory,
+            memoryManager,
+            tensorHandleFactory,
+            paramsInfo,
+            indicesInfo,
+            outputInfo,
+            params,
+            indices,
+            expectedOutput);
+}
+
+// GatherNd with 3-D params {5, 5, 2} and 4-D indices {2, 2, 3, 2}: the last
+// indices dimension (2) addresses the first two params dimensions, so each
+// tuple [i, j] selects the 2-element slice params[i][j]. The leading index
+// dimensions {2, 2, 3} are kept as batch dimensions in the output.
+template<armnn::DataType ArmnnType, typename T>
+LayerTestResult<T, 4> SimpleGatherNd4dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory)
+{
+    armnn::TensorInfo paramsInfo({ 5, 5, 2 }, ArmnnType);
+    armnn::TensorInfo indicesInfo({ 2, 2, 3, 2 }, armnn::DataType::Signed32);
+    armnn::TensorInfo outputInfo({ 2, 2, 3, 2 }, ArmnnType);
+
+    if (armnn::IsQuantizedType<T>())
+    {
+        // Identity quantization so integer test data is unchanged.
+        paramsInfo.SetQuantizationScale(1.0f);
+        paramsInfo.SetQuantizationOffset(0);
+        outputInfo.SetQuantizationScale(1.0f);
+        outputInfo.SetQuantizationOffset(0);
+    }
+    // Params hold 0..49 row-major: params[i][j] = { (i*5+j)*2, (i*5+j)*2+1 }.
+    const std::vector<T> params = ConvertToDataType<ArmnnType>(
+        { 0,  1,    2,  3,    4,  5,    6,  7,    8,  9,
+         10, 11,   12,  13,   14, 15,   16, 17,   18, 19,
+         20, 21,   22,  23,   24, 25,   26, 27,   28, 29,
+         30, 31,   32,  33,   34, 35,   36, 37,   38, 39,
+         40, 41,   42,  43,   44, 45,   46, 47,   48, 49 },
+        paramsInfo);
+
+    const std::vector<int32_t> indices  = ConvertToDataType<armnn::DataType::Signed32>(
+        { 0, 0,
+          3, 3,
+          4, 4,
+
+          0, 0,
+          1, 1,
+          2, 2,
+
+          4, 4,
+          3, 3,
+          0, 0,
+
+          2, 2,
+          1, 1,
+          0, 0 },
+        indicesInfo);
+
+    const std::vector<T> expectedOutput = ConvertToDataType<ArmnnType>(
+        {  0,  1,
+          36, 37,
+          48, 49,
+
+           0,  1,
+          12, 13,
+          24, 25,
+
+          48, 49,
+          36, 37,
+           0,  1,
+
+          24, 25,
+          12, 13,
+           0,  1 },
+        outputInfo);
+
+    return GatherNdTestImpl<ArmnnType, T, 3, 4, 4>(
+            workloadFactory,
+            memoryManager,
+            tensorHandleFactory,
+            paramsInfo,
+            indicesInfo,
+            outputInfo,
+            params,
+            indices,
+            expectedOutput);
+}
+
+//
+// Explicit template instantiations
+//
+// NOTE(review): only Float32, QAsymmS8 and Signed32 are instantiated here,
+// although RefLayerSupport::IsGatherNdSupported also accepts BFloat16,
+// Float16, QAsymmU8 and QSymmS16 — consider adding layer-test coverage for
+// those types.
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 2>
+SimpleGatherNd2dTest<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 3>
+SimpleGatherNd3dTest<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Float32>, 4>
+SimpleGatherNd4dTest<armnn::DataType::Float32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QAsymmS8>, 2>
+SimpleGatherNd2dTest<armnn::DataType::QAsymmS8>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QAsymmS8>, 3>
+SimpleGatherNd3dTest<armnn::DataType::QAsymmS8>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::QAsymmS8>, 4>
+SimpleGatherNd4dTest<armnn::DataType::QAsymmS8>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Signed32>, 2>
+SimpleGatherNd2dTest<armnn::DataType::Signed32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Signed32>, 3>
+SimpleGatherNd3dTest<armnn::DataType::Signed32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template LayerTestResult<armnn::ResolveType<armnn::DataType::Signed32>, 4>
+SimpleGatherNd4dTest<armnn::DataType::Signed32>(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
\ No newline at end of file
diff --git a/src/backends/backendsCommon/test/layerTests/GatherNdTestImpl.hpp b/src/backends/backendsCommon/test/layerTests/GatherNdTestImpl.hpp
new file mode 100644
index 0000000..6f0845d
--- /dev/null
+++ b/src/backends/backendsCommon/test/layerTests/GatherNdTestImpl.hpp
@@ -0,0 +1,32 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include <armnnTestUtils/LayerTestResult.hpp>
+
+#include <Half.hpp>
+#include <ResolveType.hpp>
+
+#include <armnn/backends/IBackendInternal.hpp>
+#include <armnn/backends/WorkloadFactory.hpp>
+
+// GatherNd layer tests for the reference backend, named by output rank
+// (2-D/3-D/4-D). T resolves from ArmnnType; explicit instantiations live in
+// GatherNdTestImpl.cpp.
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 2> SimpleGatherNd2dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 3> SimpleGatherNd3dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
+
+template<armnn::DataType ArmnnType, typename T = armnn::ResolveType<ArmnnType>>
+LayerTestResult<T, 4> SimpleGatherNd4dTest(
+        armnn::IWorkloadFactory& workloadFactory,
+        const armnn::IBackendInternal::IMemoryManagerSharedPtr& memoryManager,
+        const armnn::ITensorHandleFactory& tensorHandleFactory);
\ No newline at end of file
diff --git a/src/backends/reference/RefLayerSupport.cpp b/src/backends/reference/RefLayerSupport.cpp
index b55adfa..3bc4aff 100644
--- a/src/backends/reference/RefLayerSupport.cpp
+++ b/src/backends/reference/RefLayerSupport.cpp
@@ -212,6 +212,11 @@
                                      infos[2],
                                      *(PolymorphicDowncast<const GatherDescriptor*>(&descriptor)),
                                      reasonIfUnsupported);
+        case LayerType::GatherNd:
+            return IsGatherNdSupported(infos[0],
+                                       infos[1],
+                                       infos[2],
+                                       reasonIfUnsupported);
         case LayerType::Input:
             return IsInputSupported(infos[0], reasonIfUnsupported);
         case LayerType::InstanceNormalization:
@@ -1591,6 +1596,38 @@
     return supported;
 }
 
+// Reports whether the reference backend supports GatherNd for the given
+// tensor infos.
+// input0: params tensor, input1: indices tensor (must be Signed32),
+// output: result tensor (must have the same data type as input0).
+bool RefLayerSupport::IsGatherNdSupported(const armnn::TensorInfo& input0,
+                                          const armnn::TensorInfo& input1,
+                                          const armnn::TensorInfo& output,
+                                          armnn::Optional<std::string&> reasonIfUnsupported) const
+{
+    bool supported = true;
+    std::array<DataType,7> supportedTypes =
+    {
+            DataType::BFloat16,
+            DataType::Float32,
+            DataType::Float16,
+            DataType::QAsymmS8,
+            DataType::QAsymmU8,
+            DataType::QSymmS16,
+            DataType::Signed32
+    };
+
+    supported &= CheckSupportRule(TypeAnyOf(input0, supportedTypes), reasonIfUnsupported,
+                                  "Reference GatherNd: input type not supported");
+
+    supported &= CheckSupportRule(TypeAnyOf(output, supportedTypes), reasonIfUnsupported,
+                                  "Reference GatherNd: output type not supported");
+
+    supported &= CheckSupportRule(TypeIs(input1, DataType::Signed32), reasonIfUnsupported,
+                                  "Reference GatherNd: indices (input1) type not supported");
+
+    supported &= CheckSupportRule(TypesAreEqual(input0, output), reasonIfUnsupported,
+                                  "Reference GatherNd: input and output types not matching");
+
+    return supported;
+}
+
 bool RefLayerSupport::IsGatherSupported(const armnn::TensorInfo& input0,
                                         const armnn::TensorInfo& input1,
                                         const armnn::TensorInfo& output,
diff --git a/src/backends/reference/RefLayerSupport.hpp b/src/backends/reference/RefLayerSupport.hpp
index b787d25..98770ad 100644
--- a/src/backends/reference/RefLayerSupport.hpp
+++ b/src/backends/reference/RefLayerSupport.hpp
@@ -169,6 +169,11 @@
                                    const FullyConnectedDescriptor& descriptor,
                                    Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsGatherNdSupported(const TensorInfo& input0,
+                             const TensorInfo& input1,
+                             const TensorInfo& output,
+                             Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const;
+
     bool IsGatherSupported(const TensorInfo& input0,
                            const TensorInfo& input1,
                            const TensorInfo& output,
diff --git a/src/backends/reference/RefWorkloadFactory.cpp b/src/backends/reference/RefWorkloadFactory.cpp
index 9db81fc..2d95658 100644
--- a/src/backends/reference/RefWorkloadFactory.cpp
+++ b/src/backends/reference/RefWorkloadFactory.cpp
@@ -353,6 +353,11 @@
             auto gatherQueueDescriptor = PolymorphicDowncast<const GatherQueueDescriptor*>(&descriptor);
             return std::make_unique<RefGatherWorkload>(*gatherQueueDescriptor, info);
         }
+        case LayerType::GatherNd:
+        {
+            auto gatherNdQueueDescriptor = PolymorphicDowncast<const GatherNdQueueDescriptor*>(&descriptor);
+            return std::make_unique<RefGatherNdWorkload>(*gatherNdQueueDescriptor, info);
+        }
         case LayerType::Input:
         {
             auto inputQueueDescriptor = PolymorphicDowncast<const InputQueueDescriptor*>(&descriptor);
diff --git a/src/backends/reference/backend.mk b/src/backends/reference/backend.mk
index 33e161c..d9a5a1d 100644
--- a/src/backends/reference/backend.mk
+++ b/src/backends/reference/backend.mk
@@ -73,6 +73,7 @@
         workloads/RefFillWorkload.cpp \
         workloads/RefFloorWorkload.cpp \
         workloads/RefFullyConnectedWorkload.cpp \
+        workloads/RefGatherNdWorkload.cpp \
         workloads/RefGatherWorkload.cpp \
         workloads/RefInstanceNormalizationWorkload.cpp \
         workloads/RefL2NormalizationWorkload.cpp \
diff --git a/src/backends/reference/test/RefEndToEndTests.cpp b/src/backends/reference/test/RefEndToEndTests.cpp
index e1c2e2f..2ed5ad8 100644
--- a/src/backends/reference/test/RefEndToEndTests.cpp
+++ b/src/backends/reference/test/RefEndToEndTests.cpp
@@ -19,6 +19,7 @@
 #include <backendsCommon/test/FillEndToEndTestImpl.hpp>
 #include <backendsCommon/test/FullyConnectedEndToEndTestImpl.hpp>
 #include <backendsCommon/test/GatherEndToEndTestImpl.hpp>
+#include <backendsCommon/test/GatherNdEndToEndTestImpl.hpp>
 #include <backendsCommon/test/InstanceNormalizationEndToEndTestImpl.hpp>
 #include <backendsCommon/test/LogSoftmaxEndToEndTestImpl.hpp>
 #include <backendsCommon/test/PreluEndToEndTestImpl.hpp>
@@ -720,6 +721,36 @@
     GatherMultiDimEndToEnd<armnn::DataType::QSymmS16>(defaultBackends);
 }
 
+// GatherNd
+TEST_CASE("RefGatherNdFloatTest")
+{
+    GatherNdEndToEnd<armnn::DataType::Float32>(defaultBackends);
+}
+
+TEST_CASE("RefGatherNdUint8Test")
+{
+    GatherNdEndToEnd<armnn::DataType::QAsymmU8>(defaultBackends);
+}
+
+TEST_CASE("RefGatherNdInt16Test")
+{
+    GatherNdEndToEnd<armnn::DataType::QSymmS16>(defaultBackends);
+}
+
+TEST_CASE("RefGatherNdMultiDimFloatTest")
+{
+    GatherNdMultiDimEndToEnd<armnn::DataType::Float32>(defaultBackends);
+}
+
+TEST_CASE("RefGatherNdMultiDimUint8Test")
+{
+    GatherNdMultiDimEndToEnd<armnn::DataType::QAsymmU8>(defaultBackends);
+}
+
+TEST_CASE("RefGatherNdMultiDimInt16Test")
+{
+    GatherNdMultiDimEndToEnd<armnn::DataType::QSymmS16>(defaultBackends);
+}
+
 // DepthToSpace
 TEST_CASE("DephtToSpaceEndToEndNchwFloat32")
 {
diff --git a/src/backends/reference/test/RefLayerTests.cpp b/src/backends/reference/test/RefLayerTests.cpp
index 9dca621..496b11d 100644
--- a/src/backends/reference/test/RefLayerTests.cpp
+++ b/src/backends/reference/test/RefLayerTests.cpp
@@ -2155,6 +2155,18 @@
 ARMNN_AUTO_TEST_CASE_WITH_THF(GatherMultiDimParamsMultiDimIndicesInt16, GatherMultiDimParamsMultiDimIndicesInt16Test)
 ARMNN_AUTO_TEST_CASE_WITH_THF(GatherMultiDimParamsMultiDimIndicesInt32, GatherMultiDimParamsMultiDimIndicesInt32Test)
 
+
+// GatherNd
+// Covers 2-D/3-D/4-D params for Float32, QAsymmS8 and Signed32
+// (see GatherNdTestImpl.cpp for the instantiated types).
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd2dFloat32, SimpleGatherNd2dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd3dFloat32, SimpleGatherNd3dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd4dFloat32, SimpleGatherNd4dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd2dInt8, SimpleGatherNd2dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd3dInt8, SimpleGatherNd3dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd4dInt8, SimpleGatherNd4dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd2dInt32, SimpleGatherNd2dTest<DataType::Signed32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd3dInt32, SimpleGatherNd3dTest<DataType::Signed32>)
+ARMNN_AUTO_TEST_CASE_WITH_THF(GatherNd4dInt32, SimpleGatherNd4dTest<DataType::Signed32>)
+
 // Abs
 ARMNN_AUTO_TEST_CASE_WITH_THF(Abs2d, Abs2dTest<DataType::Float32>)
 ARMNN_AUTO_TEST_CASE_WITH_THF(Abs3d, Abs3dTest<DataType::Float32>)
diff --git a/src/backends/reference/workloads/CMakeLists.txt b/src/backends/reference/workloads/CMakeLists.txt
index c18342f..b1f6d8b 100644
--- a/src/backends/reference/workloads/CMakeLists.txt
+++ b/src/backends/reference/workloads/CMakeLists.txt
@@ -118,6 +118,8 @@
     RefFloorWorkload.hpp
     RefFullyConnectedWorkload.cpp
     RefFullyConnectedWorkload.hpp
+    RefGatherNdWorkload.cpp
+    RefGatherNdWorkload.hpp
     RefGatherWorkload.cpp
     RefGatherWorkload.hpp
     RefInstanceNormalizationWorkload.cpp
diff --git a/src/backends/reference/workloads/RefGatherNdWorkload.cpp b/src/backends/reference/workloads/RefGatherNdWorkload.cpp
new file mode 100644
index 0000000..4c6b559
--- /dev/null
+++ b/src/backends/reference/workloads/RefGatherNdWorkload.cpp
@@ -0,0 +1,91 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "RefGatherNdWorkload.hpp"
+
+#include "Gather.hpp"
+#include "Profiling.hpp"
+#include "RefWorkloadUtils.hpp"
+#include "backendsCommon/WorkloadUtils.hpp"
+
+namespace armnn
+{
+
+// Synchronous execution entry point: runs on the tensors bound in m_Data.
+void RefGatherNdWorkload::Execute() const
+{
+    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
+}
+
+// Async execution entry point: runs on the tensors supplied by the caller's
+// working-memory descriptor instead of the workload's own bindings.
+void RefGatherNdWorkload::ExecuteAsync(WorkingMemDescriptor &workingMemDescriptor)
+{
+    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
+}
+
+// Implements GatherNd by flattening the N-d indices into 1-d indices and
+// delegating to the existing Gather reference implementation.
+// inputs[0]: params, inputs[1]: indices (Signed32), outputs[0]: result.
+void RefGatherNdWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
+{
+    ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefGatherNdWorkload_Execute");
+
+    const TensorInfo& inputInfo0 = GetTensorInfo(inputs[0]);
+    const TensorInfo& inputInfo1 = GetTensorInfo(inputs[1]);
+    const TensorInfo& outputInfo = GetTensorInfo(outputs[0]);
+
+    std::unique_ptr<Decoder<float>> params_decoderPtr = MakeDecoder<float>(inputInfo0, inputs[0]->Map());
+
+    const int32_t* indicesDataPtr = reinterpret_cast<int32_t*>(inputs[1]->Map());
+    std::vector<int32_t> indices(indicesDataPtr, indicesDataPtr + inputInfo1.GetNumElements());
+
+    std::unique_ptr<Encoder<float>> output_encoderPtr = MakeEncoder<float>(outputInfo, outputs[0]->Map());
+
+    // Key dimensions derived from the params/indices shapes (computed by
+    // CalculateGatherNdKeyIndices in backendsCommon/WorkloadUtils). As used
+    // below: "ND" is the length of each index tuple (the last indices dim),
+    // "W" the total number of index tuples, params are reshaped to {K, C}
+    // and indices to {N, W} for the Gather call — presumably K = flattened
+    // size of the ND indexed dims, C = flattened size of the remaining dims
+    // and N = batch; confirm against CalculateGatherNdKeyIndices.
+    std::map<std::string, unsigned int> keyIndices = CalculateGatherNdKeyIndices(inputInfo0, inputInfo1);
+
+    /// Calculate flattened indices: flattenedIndices = indices * flattenedCoefficients
+    // Calculate the flattened coefficients to use in the multiplication
+    // to calculate the flattened indices needed by gather
+    TensorShape paramsShape = inputInfo0.GetShape();
+    std::vector<unsigned int> flattenedCoeff(keyIndices["ND"], 1);
+    // Shift paramsShape[1..ND-1] into flattenedCoeff[0..ND-2]; the last
+    // coefficient stays 1.
+    for (unsigned int i = 1; i < keyIndices["ND"]; ++i)
+    {
+        flattenedCoeff[i-1] = paramsShape[i];
+    }
+    // Suffix products: flattenedCoeff[j] becomes the row-major stride of
+    // index component j within the flattened {K} dimension.
+    for (unsigned int i = keyIndices["ND"]-1; i > 0; --i)
+    {
+        flattenedCoeff[i-1] *= flattenedCoeff[i];
+    }
+
+    // Prepare the vector to store the output of the matrix multiplication,
+    // which will represent the flattened indices needed by gather
+    armnn::TensorInfo flattenedIndices_Info = inputInfo1;
+    flattenedIndices_Info.SetShape({ keyIndices["W"] });
+    std::vector<int32_t> flattenedIndices(flattenedIndices_Info.GetNumElements(), 0);
+
+    // Multiplication to calculate the flattened indices, which are the indices needed by gather.
+    // NOTE(review): index values are not range-checked here; out-of-bounds
+    // indices flow straight into Gather — confirm validation happens upstream.
+    for (unsigned int i = 0; i < keyIndices["W"]; ++i)
+    {
+        for (unsigned int j = 0; j < keyIndices["ND"]; ++j)
+        {
+            flattenedIndices[i] += indices[i * keyIndices["ND"] + j] * static_cast<int32_t>(flattenedCoeff[j]);
+        }
+    }
+
+    /// Call Gather with adequate shapes
+    // Reshape params into {K, C}
+    armnn::TensorInfo params_K_C_Info =  inputInfo0;
+    params_K_C_Info.SetShape({ keyIndices["K"], keyIndices["C"] });
+
+    // Reshape indices into {N, W}
+    armnn::TensorInfo indices_N_W_Info = inputInfo1;
+    indices_N_W_Info.SetShape({ keyIndices["N"], keyIndices["W"] });
+
+    // Reshape output to have the shape given by gather {N, W, C}
+    // (the original outputInfo has the shape given by gatherNd)
+    armnn::TensorInfo outputGather_Info = outputInfo;
+    outputGather_Info.SetShape({ keyIndices["N"], keyIndices["W"], keyIndices["C"]  });
+
+    // output_gather = gather(params_K_C, indices_N_W)
+    Gather(params_K_C_Info, indices_N_W_Info, outputGather_Info,
+           *params_decoderPtr, flattenedIndices.data(), *output_encoderPtr, 0);
+}
+
+} //namespace armnn
diff --git a/src/backends/reference/workloads/RefGatherNdWorkload.hpp b/src/backends/reference/workloads/RefGatherNdWorkload.hpp
new file mode 100644
index 0000000..a0d9158
--- /dev/null
+++ b/src/backends/reference/workloads/RefGatherNdWorkload.hpp
@@ -0,0 +1,24 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "RefBaseWorkload.hpp"
+
+namespace armnn
+{
+
+// Reference (CpuRef) workload for the GatherNd operation.
+// Inputs: params (input 0) and Signed32 indices (input 1); one output.
+class RefGatherNdWorkload : public RefBaseWorkload<GatherNdQueueDescriptor>
+{
+public:
+    using RefBaseWorkload<GatherNdQueueDescriptor>::RefBaseWorkload;
+    void Execute() const override;
+    void ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)  override;
+private:
+    // Shared implementation used by both sync and async entry points.
+    void Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const;
+
+};
+
+} // namespace armnn
diff --git a/src/backends/reference/workloads/RefGatherWorkload.cpp b/src/backends/reference/workloads/RefGatherWorkload.cpp
index be3274f..8ad36e4 100644
--- a/src/backends/reference/workloads/RefGatherWorkload.cpp
+++ b/src/backends/reference/workloads/RefGatherWorkload.cpp
@@ -1,5 +1,5 @@
 //
-// Copyright © 2017 Arm Ltd and Contributors. All rights reserved.
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
 // SPDX-License-Identifier: MIT
 //
 
diff --git a/src/backends/reference/workloads/RefWorkloads.hpp b/src/backends/reference/workloads/RefWorkloads.hpp
index 700a1d6..3e83304 100644
--- a/src/backends/reference/workloads/RefWorkloads.hpp
+++ b/src/backends/reference/workloads/RefWorkloads.hpp
@@ -42,6 +42,7 @@
 #include "RefFillWorkload.hpp"
 #include "RefFloorWorkload.hpp"
 #include "RefFullyConnectedWorkload.hpp"
+#include "RefGatherNdWorkload.hpp"
 #include "RefGatherWorkload.hpp"
 #include "RefInstanceNormalizationWorkload.hpp"
 #include "RefL2NormalizationWorkload.hpp"