IVGCVSW-6861 Add GatherNd CL workload

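 * Decompose GatherNd on GpuAcc into existing CL functions:
   CLPixelWiseMultiplication, CLReductionOperation, CLGather and CLReshapeLayer
 * Add CL backend layer support and workload factory handling for GatherNd
 * Add GatherNd unit tests for the CL backend
 * Update the GatherNd GpuAcc entry in docs/02_operator_list.dox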

Signed-off-by: Teresa Charlin <teresa.charlinreyes@arm.com>
Change-Id: I8ba7e56062c285c672dcaa9d13be319eb4f1fca6
diff --git a/docs/02_operator_list.dox b/docs/02_operator_list.dox
index a02b4da..9604289 100644
--- a/docs/02_operator_list.dox
+++ b/docs/02_operator_list.dox
@@ -1498,13 +1498,19 @@
   <td>GpuAcc
   <td>
       <ul>
-       <li>TBD
+       <li>All
       </ul>
   <td>
-    <table>
-    <tr><th>
-    <tr><td>TBD
-    </table>
+      <table>
+       <tr><th>
+       <tr><td>BFLOAT16
+       <tr><td>FLOAT16
+       <tr><td>FLOAT32
+       <tr><td>QASYMMS8
+       <tr><td>QASYMMU8
+       <tr><td>QSYMMS16
+       <tr><td>SIGNED32
+      </table>
 <tr>
   <td rowspan="1">InputLayer
   <td rowspan="1" style="width:200px;"> Special layer used to provide input data to the computational network.
diff --git a/src/backends/cl/ClLayerSupport.cpp b/src/backends/cl/ClLayerSupport.cpp
index 6b8cf52..9c40391 100644
--- a/src/backends/cl/ClLayerSupport.cpp
+++ b/src/backends/cl/ClLayerSupport.cpp
@@ -41,6 +41,7 @@
 #include "workloads/ClFloorFloatWorkload.hpp"
 #include "workloads/ClFullyConnectedWorkload.hpp"
 #include "workloads/ClGatherWorkload.hpp"
+#include "workloads/ClGatherNdWorkload.hpp"
 #include "workloads/ClInstanceNormalizationWorkload.hpp"
 #include "workloads/ClL2NormalizationFloatWorkload.hpp"
 #include "workloads/ClLogWorkload.hpp"
@@ -372,6 +373,11 @@
                                      infos[2],
                                      *(PolymorphicDowncast<const GatherDescriptor*>(&descriptor)),
                                      reasonIfUnsupported);
+        case LayerType::GatherNd:
+            return IsGatherNdSupported(infos[0],
+                                       infos[1],
+                                       infos[2],
+                                       reasonIfUnsupported);
         case LayerType::Input:
             return IsInputSupported(infos[0], reasonIfUnsupported);
         case LayerType::InstanceNormalization:
@@ -1021,6 +1027,18 @@
                                    descriptor);
 }
 
+bool ClLayerSupport::IsGatherNdSupported(const TensorInfo& input0,
+                                         const TensorInfo& input1,
+                                         const TensorInfo& output,
+                                         Optional<std::string&> reasonIfUnsupported) const
+{
+    FORWARD_WORKLOAD_VALIDATE_FUNC(ClGatherNdWorkloadValidate,
+                                   reasonIfUnsupported,
+                                   input0,
+                                   input1,
+                                   output);
+}
+
 bool ClLayerSupport::IsInputSupported(const TensorInfo& input,
                                       Optional<std::string&> reasonIfUnsupported) const
 {
diff --git a/src/backends/cl/ClLayerSupport.hpp b/src/backends/cl/ClLayerSupport.hpp
index 4f4e64e..27311f7 100644
--- a/src/backends/cl/ClLayerSupport.hpp
+++ b/src/backends/cl/ClLayerSupport.hpp
@@ -148,6 +148,11 @@
                                    const FullyConnectedDescriptor& descriptor,
                                    Optional<std::string&> reasonIfUnsupported = EmptyOptional()) const override;
 
+    bool IsGatherNdSupported(const TensorInfo& input0,
+                             const TensorInfo& input1,
+                             const TensorInfo& output,
+                             Optional<std::string&> reasonIfUnsupported) const;
+
     bool IsGatherSupported(const TensorInfo& input0,
                            const TensorInfo& input1,
                            const TensorInfo& output,
diff --git a/src/backends/cl/ClWorkloadFactory.cpp b/src/backends/cl/ClWorkloadFactory.cpp
index 213f474..d4a1cb0 100644
--- a/src/backends/cl/ClWorkloadFactory.cpp
+++ b/src/backends/cl/ClWorkloadFactory.cpp
@@ -463,6 +463,11 @@
             auto gatherQueueDescriptor = PolymorphicDowncast<const GatherQueueDescriptor*>(&descriptor);
             return MakeWorkload<ClGatherWorkload>(*gatherQueueDescriptor, info, m_CLCompileContext);
         }
+        case LayerType::GatherNd :
+        {
+            auto gatherNdQueueDescriptor = PolymorphicDowncast<const GatherNdQueueDescriptor*>(&descriptor);
+            return MakeWorkload<ClGatherNdWorkload>(*gatherNdQueueDescriptor, info, m_CLCompileContext);
+        }
         case LayerType::Input :
         {
             auto inputQueueDescriptor = PolymorphicDowncast<const InputQueueDescriptor*>(&descriptor);
diff --git a/src/backends/cl/backend.mk b/src/backends/cl/backend.mk
index bf9689a..6fda16d 100644
--- a/src/backends/cl/backend.mk
+++ b/src/backends/cl/backend.mk
@@ -50,6 +50,7 @@
         workloads/ClFloorFloatWorkload.cpp \
         workloads/ClFullyConnectedWorkload.cpp \
         workloads/ClGatherWorkload.cpp \
+        workloads/ClGatherNdWorkload.cpp \
         workloads/ClInstanceNormalizationWorkload.cpp \
         workloads/ClL2NormalizationFloatWorkload.cpp \
         workloads/ClLogWorkload.cpp \
diff --git a/src/backends/cl/test/ClLayerTests.cpp b/src/backends/cl/test/ClLayerTests.cpp
index fd24043..de39f98 100644
--- a/src/backends/cl/test/ClLayerTests.cpp
+++ b/src/backends/cl/test/ClLayerTests.cpp
@@ -1018,6 +1018,17 @@
 ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherMultiDimParamsFloat32, ClContextControlFixture, GatherMultiDimParamsFloat32Test)
 ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherMultiDimParamsUint8, ClContextControlFixture, GatherMultiDimParamsUint8Test)
 
+// GatherNd
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd2dFloat32, ClContextControlFixture, SimpleGatherNd2dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd3dFloat32, ClContextControlFixture, SimpleGatherNd3dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd4dFloat32, ClContextControlFixture, SimpleGatherNd4dTest<DataType::Float32>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd2dInt8, ClContextControlFixture, SimpleGatherNd2dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd3dInt8, ClContextControlFixture, SimpleGatherNd3dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd4dInt8, ClContextControlFixture, SimpleGatherNd4dTest<DataType::QAsymmS8>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd2dInt32, ClContextControlFixture, SimpleGatherNd2dTest<DataType::Signed32>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd3dInt32, ClContextControlFixture, SimpleGatherNd3dTest<DataType::Signed32>)
+ARMNN_AUTO_TEST_FIXTURE_WITH_THF(GatherNd4dInt32, ClContextControlFixture, SimpleGatherNd4dTest<DataType::Signed32>)
+
 // Reshape
 ARMNN_AUTO_TEST_FIXTURE_WITH_THF(SimpleReshapeFloat32, ClContextControlFixture, SimpleReshapeTest<DataType::Float32>)
 ARMNN_AUTO_TEST_FIXTURE_WITH_THF(SimpleReshapeInt8, ClContextControlFixture, SimpleReshapeTest<DataType::QAsymmS8>)
diff --git a/src/backends/cl/workloads/CMakeLists.txt b/src/backends/cl/workloads/CMakeLists.txt
index 59e11cd..aef7fc7 100644
--- a/src/backends/cl/workloads/CMakeLists.txt
+++ b/src/backends/cl/workloads/CMakeLists.txt
@@ -52,6 +52,8 @@
     ClFullyConnectedWorkload.hpp
     ClGatherWorkload.cpp
     ClGatherWorkload.hpp
+    ClGatherNdWorkload.cpp
+    ClGatherNdWorkload.hpp
     ClInstanceNormalizationWorkload.cpp
     ClInstanceNormalizationWorkload.hpp
     ClLogWorkload.cpp
diff --git a/src/backends/cl/workloads/ClGatherNdWorkload.cpp b/src/backends/cl/workloads/ClGatherNdWorkload.cpp
new file mode 100644
index 0000000..f689146
--- /dev/null
+++ b/src/backends/cl/workloads/ClGatherNdWorkload.cpp
@@ -0,0 +1,206 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#include "ClGatherNdWorkload.hpp"
+#include "ClWorkloadUtils.hpp"
+#include "backendsCommon/WorkloadUtils.hpp"
+#include <aclCommon/ArmComputeUtils.hpp>
+#include <cl/ClTensorHandle.hpp>
+
+using namespace armnn::armcomputetensorutils;
+
+namespace armnn
+{
+arm_compute::Status ClGatherNdWorkloadValidate(const TensorInfo& paramsInfo,
+                                               const TensorInfo& indicesInfo,
+                                               const TensorInfo& outputInfo)
+{
+    // Calculate ND, K, W, C.
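+    //   ND: number of params dimensions sliced by each index tuple (last dimension of indices)
+    //   W:  number of index tuples (product of all indices dimensions except the last)
+    //   K:  flattened size of the first ND params dimensions
+    //   C:  flattened size of the remaining params dimensions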
+    std::map<std::string, unsigned int> keyIndices = CalculateGatherNdKeyIndices(paramsInfo, indicesInfo);
+
+    /// Validate Mul
+    // Indices with shape { W, ND }
+    armnn::TensorInfo indices_W_ND_Info = indicesInfo;
+    indices_W_ND_Info.SetShape({ keyIndices["W"], keyIndices["ND"] });
+    const arm_compute::TensorInfo aclIndicesInfo = BuildArmComputeTensorInfo(indices_W_ND_Info);
+
+    // Flattened coefficients with shape { ND }
+    armnn::TensorInfo flattenedCoeff_Info = indicesInfo;
+    flattenedCoeff_Info.SetShape({ keyIndices["ND"] });
+    const arm_compute::TensorInfo aclFlattenedCoeffInfo = BuildArmComputeTensorInfo(flattenedCoeff_Info);
+
+    // Output of Mul with shape { W, ND }
+    const arm_compute::TensorInfo aclOutputMulInfo = BuildArmComputeTensorInfo(indices_W_ND_Info);
+
+    auto statusMul = arm_compute::CLPixelWiseMultiplication::validate(&aclIndicesInfo,
+                                                                      &aclFlattenedCoeffInfo,
+                                                                      &aclOutputMulInfo,
+                                                                      1.0f,
+                                                                      arm_compute::ConvertPolicy::WRAP,
+                                                                      arm_compute::RoundingPolicy::TO_ZERO,
+                                                                      arm_compute::ActivationLayerInfo());
+
+    /// Validate ReduceSum
+    // Flattened indices with shape { W }
+    armnn::TensorInfo flattenedIndices_Info = indicesInfo;
+    flattenedIndices_Info.SetShape({ keyIndices["W"] });
+    const arm_compute::TensorInfo aclFlattenedIndicesInfo = BuildArmComputeTensorInfo(flattenedIndices_Info);
+
+    const std::vector<unsigned int> armnnReduceAxes(1, 1);
+    arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(aclOutputMulInfo.num_dimensions(),
+                                                                          indices_W_ND_Info.GetNumDimensions(),
+                                                                          armnnReduceAxes);
+
+    auto statusReduceSum = arm_compute::CLReductionOperation::validate(&aclOutputMulInfo,
+                                                                       &aclFlattenedIndicesInfo,
+                                                                       static_cast<unsigned int>(coords[0]),
+                                                                       arm_compute::ReductionOperation::SUM,
+                                                                       false);
+
+    /// Validate Gather
+    // Params with shape { K, C }
+    armnn::TensorInfo params_K_C_Info = paramsInfo;
+    params_K_C_Info.SetShape({ keyIndices["K"], keyIndices["C"] });
+    const arm_compute::TensorInfo aclParamsInfo = BuildArmComputeTensorInfo(params_K_C_Info);
+
+    // Output of gather with shape { W, C }
+    armnn::TensorInfo outputGather_Info = outputInfo;
+    outputGather_Info.SetShape({ keyIndices["W"], keyIndices["C"] });
+    const arm_compute::TensorInfo aclOutputGatherInfo = BuildArmComputeTensorInfo(outputGather_Info);
+
+    auto aclAxis = ComputeAclAxis(0, params_K_C_Info);
+    auto statusGather =
+            arm_compute::CLGather::validate(&aclParamsInfo, &aclFlattenedIndicesInfo, &aclOutputGatherInfo, aclAxis);
+
+    /// Validate Reshape
+    const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(outputInfo);
+
+    auto statusReshape = arm_compute::CLReshapeLayer::validate(&aclOutputGatherInfo, &aclOutputInfo);
+
+    /// Return OK if all the layers are valid
+    auto okCode = arm_compute::ErrorCode::OK;
+    if (statusMul.error_code()       == okCode &&
+        statusReduceSum.error_code() == okCode &&
+        statusGather.error_code()    == okCode &&
+        statusReshape.error_code()   == okCode)
+    {
+        return arm_compute::Status(arm_compute::ErrorCode::OK,
+                                   "All GatherND layers validate status OK.");
+    }
+    else
+    {
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR,
+                                   "GatherND layer validate status failed.");
+    }
+}
+
+ClGatherNdWorkload::ClGatherNdWorkload(const GatherNdQueueDescriptor& descriptor,
+                                       const WorkloadInfo& info,
+                                       const arm_compute::CLCompileContext& clCompileContext)
+        : ClBaseWorkload<GatherNdQueueDescriptor>(descriptor, info)
+{
+    m_Data.ValidateInputsOutputs("ClGatherNdWorkload", 2, 1);
+
+    TensorInfo paramsInfo  = info.m_InputTensorInfos[0];
+    TensorInfo indicesInfo = info.m_InputTensorInfos[1];
+    TensorInfo outputInfo  = info.m_OutputTensorInfos[0];
+
+    arm_compute::ICLTensor& input   = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor();
+    arm_compute::ICLTensor& indices = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor();
+    arm_compute::ICLTensor& output  = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor();
+
+    // Calculate ND, K, W, C.
+    std::map<std::string, unsigned int> keyIndices = CalculateGatherNdKeyIndices(paramsInfo, indicesInfo);
+
+    /// Calculate flattened indices: m_FlattenedIndices = indices * m_FlattenedCoeff.
+    /// This could be done using MatMul instead of multiplication followed by reduce sum operation,
+    /// but GEMM does not support s32 at the moment.
+
+    // Prepare the tensor to store the output of the reduce_sum operation
+    armnn::TensorInfo flattenedIndices_Info = indicesInfo;
+    flattenedIndices_Info.SetShape({ keyIndices["W"] });
+    BuildArmComputeTensor(m_FlattenedIndices, flattenedIndices_Info);
+    armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_FlattenedIndices);
+
+    // Reshape indices into { W, ND }
+    indices.info()->set_tensor_shape(BuildArmComputeTensorShape({ keyIndices["W"], keyIndices["ND"] }));
+
+    // Calculate the m_FlattenedCoeff
+    TensorShape paramsShape = paramsInfo.GetShape();
+    std::vector<int32_t> flattenedCoeff(keyIndices["ND"], 1);
+    for (unsigned int i = 1; i < keyIndices["ND"]; ++i)
+    {
+        flattenedCoeff[i - 1] = static_cast<int32_t>(paramsShape[i]);
+    }
+    for (unsigned int i = keyIndices["ND"] - 1; i > 0; --i)
+    {
+        flattenedCoeff[i - 1] *= flattenedCoeff[i];
+    }
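+    // e.g. params shape { 3, 4, 5, 6 } with ND = 3 gives flattenedCoeff = { 20, 5, 1 },
+    // i.e. the row-major strides of the first ND params dimensions.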
+    armnn::TensorInfo flattenedCoeff_Info = indicesInfo;
+    flattenedCoeff_Info.SetShape({ keyIndices["ND"] });
+    BuildArmComputeTensor(m_FlattenedCoeff, flattenedCoeff_Info);
+    armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_FlattenedCoeff);
+    ARMNN_ASSERT_MSG(indicesInfo.GetDataType() == DataType::Signed32,
+                     "indices must be Signed32 so m_FlattenedCoeff matches the int32_t flattenedCoeff data");
+    CopyArmComputeClTensorData<int32_t>(m_FlattenedCoeff, flattenedCoeff.data());
+
+    // Prepare the tensor to store the output of the multiplication
+    armnn::TensorInfo outputMul_Info = indicesInfo;
+    outputMul_Info.SetShape({ keyIndices["W"], keyIndices["ND"] });
+    BuildArmComputeTensor(m_OutputMul, outputMul_Info);
+    armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_OutputMul);
+
+    // Multiply
+    m_MulLayer.configure(clCompileContext,
+                         &indices,
+                         &m_FlattenedCoeff,
+                         &m_OutputMul,
+                         1.0f,
+                         arm_compute::ConvertPolicy::WRAP,
+                         arm_compute::RoundingPolicy::TO_ZERO,
+                         arm_compute::ActivationLayerInfo());
+
+    // Reduce Sum
+    const std::vector<unsigned int> armnnReduceAxes(1, 1);
+    arm_compute::Coordinates coords = BuildArmComputeReductionCoordinates(m_OutputMul.info()->num_dimensions(),
+                                                                          outputMul_Info.GetNumDimensions(),
+                                                                          armnnReduceAxes);
+    m_ReduceSumLayer.configure(clCompileContext,
+                               &m_OutputMul,
+                               &m_FlattenedIndices,
+                               static_cast<unsigned int>(coords[0]),
+                               arm_compute::ReductionOperation::SUM,
+                               false);
+
+    /// Call Gather with adequate shapes
+    // Reshape params into { K, C }
+    paramsInfo.SetShape({ keyIndices["K"], keyIndices["C"] });
+    input.info()->set_tensor_shape(BuildArmComputeTensorShape(paramsInfo.GetShape()));
+
+    // Reshape output to have the shape given by gather { W, C }
+    // (the original outputInfo has the shape given by gatherNd)
+    armnn::TensorInfo outputGather_Info = outputInfo;
+    outputGather_Info.SetShape({ keyIndices["W"], keyIndices["C"] });
+    BuildArmComputeTensor(m_OutputGather, outputGather_Info);
+    armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_OutputGather);
+    {
+        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "ClGatherNdWorkload_configure");
+        auto aclAxis = ComputeAclAxis(0, paramsInfo);
+        m_GatherLayer.configure(clCompileContext, &input, &m_FlattenedIndices, &m_OutputGather, aclAxis);
+    }
+
+    // Reshape output to the original output shape
+    m_ReshapeLayer.configure(clCompileContext, &m_OutputGather, &output);
+}
+
+void ClGatherNdWorkload::Execute() const
+{
+    ARMNN_SCOPED_PROFILING_EVENT_CL_GUID("ClGatherNdWorkload_Execute", this->GetGuid());
+    RunClFunction(m_MulLayer, CHECK_LOCATION());
+    RunClFunction(m_ReduceSumLayer, CHECK_LOCATION());
+    RunClFunction(m_GatherLayer, CHECK_LOCATION());
+    RunClFunction(m_ReshapeLayer, CHECK_LOCATION());
+}
+} // namespace armnn
diff --git a/src/backends/cl/workloads/ClGatherNdWorkload.hpp b/src/backends/cl/workloads/ClGatherNdWorkload.hpp
new file mode 100644
index 0000000..dd30024
--- /dev/null
+++ b/src/backends/cl/workloads/ClGatherNdWorkload.hpp
@@ -0,0 +1,42 @@
+//
+// Copyright © 2022 Arm Ltd and Contributors. All rights reserved.
+// SPDX-License-Identifier: MIT
+//
+
+#pragma once
+
+#include "ClBaseWorkload.hpp"
+
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/CL/functions/CLGather.h"
+#include "arm_compute/runtime/CL/functions/CLPixelWiseMultiplication.h"
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
+
+namespace armnn
+{
+arm_compute::Status ClGatherNdWorkloadValidate(const TensorInfo& params,
+                                               const TensorInfo& indices,
+                                               const TensorInfo& output);
+
+class ClGatherNdWorkload : public ClBaseWorkload<GatherNdQueueDescriptor>
+{
+public:
+    ClGatherNdWorkload(const GatherNdQueueDescriptor& descriptor,
+                       const WorkloadInfo& info,
+                       const arm_compute::CLCompileContext& clCompileContext);
+    virtual void Execute() const override;
+
+private:
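+    // Intermediate tensors used by the Mul -> ReduceSum -> Gather -> Reshape decomposition of GatherNd.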
+    arm_compute::CLTensor m_FlattenedCoeff;
+    arm_compute::CLTensor m_OutputMul;
+    arm_compute::CLTensor m_FlattenedIndices;
+    arm_compute::CLTensor m_OutputGather;
+
+    mutable arm_compute::CLPixelWiseMultiplication m_MulLayer;
+    mutable arm_compute::CLReductionOperation m_ReduceSumLayer;
+    mutable arm_compute::CLGather m_GatherLayer;
+    mutable arm_compute::CLReshapeLayer m_ReshapeLayer;
+};
+
+} //namespace armnn
\ No newline at end of file
diff --git a/src/backends/cl/workloads/ClWorkloads.hpp b/src/backends/cl/workloads/ClWorkloads.hpp
index 27119bb..71f401a 100644
--- a/src/backends/cl/workloads/ClWorkloads.hpp
+++ b/src/backends/cl/workloads/ClWorkloads.hpp
@@ -25,6 +25,7 @@
 #include "ClFloorFloatWorkload.hpp"
 #include "ClFullyConnectedWorkload.hpp"
 #include "ClGatherWorkload.hpp"
+#include "ClGatherNdWorkload.hpp"
 #include "ClInstanceNormalizationWorkload.hpp"
 #include "ClL2NormalizationFloatWorkload.hpp"
 #include "ClLogWorkload.hpp"