Multi-Dimensional and Batched Scatter Reference and Dataset Implementation.

Resolves: [COMPMID-6893, COMPMID-6895, COMPMID-6898]
Change-Id: I355f46aeba2213cd8d067cac7643d8d96e713c93
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/11430
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/datasets/ScatterDataset.h b/tests/datasets/ScatterDataset.h
index f7547ec..8b0972f 100644
--- a/tests/datasets/ScatterDataset.h
+++ b/tests/datasets/ScatterDataset.h
@@ -113,6 +113,8 @@
     std::vector<TensorShape> _dst_shapes{};
 };
 
+
+// 1D dataset for simple scatter tests.
 class Small1DScatterDataset final : public ScatterDataset
 {
 public:
@@ -122,6 +124,67 @@
         add_config(TensorShape(10U), TensorShape(2U), TensorShape(1U, 2U), TensorShape(10U));
     }
 };
+
+// This dataset represents the (m+1)-D updates/dst case.
+class SmallScatterMultiDimDataset final : public ScatterDataset
+{
+public:
+    SmallScatterMultiDimDataset()
+    {
+        // NOTE: Config is src, updates, indices, output.
+        //      - In these configs, the dim being replaced is the final one listed (the highest tensor dimension).
+        //      - The last "updates" dim should match the y-dim of indices.
+        //      - src/updates/dst should all have the same number of dims. Indices should be 2D.
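+        //      - e.g. in the first config below: indices holds two 1-D indices, each selecting one of
+        //        the 5 rows of 6 elements in dst to be updated with a row of "updates".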
+        add_config(TensorShape(6U, 5U), TensorShape(6U, 2U), TensorShape(1U, 2U), TensorShape(6U, 5U));
+        add_config(TensorShape(9U, 3U, 4U), TensorShape(9U, 3U, 2U), TensorShape(1U, 2U), TensorShape(9U, 3U, 4U));
+        add_config(TensorShape(3U, 2U, 4U, 2U), TensorShape(3U, 2U, 4U, 2U), TensorShape(1U, 2U), TensorShape(3U, 2U, 4U, 2U));
+    }
+};
+
+// This dataset represents the (m+1)-D updates tensor, (m+n)-D output tensor cases.
+class SmallScatterMultiIndicesDataset final : public ScatterDataset
+{
+public:
+    SmallScatterMultiIndicesDataset()
+    {
+        // NOTE: Config is src, updates, indices, output.
+        // NOTE: indices.shape.x = src.num_dimensions - updates.num_dimensions + 1
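+        // e.g. in the first config below: src is 3-D and updates is 2-D, so indices.shape.x = 3 - 2 + 1 = 2.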
+
+        // index length is 2
+        add_config(TensorShape(6U, 5U, 2U), TensorShape(6U, 4U), TensorShape(2U, 4U), TensorShape(6U, 5U, 2U));
+        add_config(TensorShape(17U, 3U, 3U, 2U), TensorShape(17U, 3U, 2U), TensorShape(2U, 2U), TensorShape(17U, 3U, 3U, 2U));
+        add_config(TensorShape(11U, 3U, 3U, 2U, 4U), TensorShape(11U, 3U, 3U, 4U), TensorShape(2U, 4U), TensorShape(11U, 3U, 3U, 2U, 4U));
+        add_config(TensorShape(5U, 4U, 3U, 3U, 2U, 4U), TensorShape(5U, 4U, 3U, 3U, 5U), TensorShape(2U, 5U), TensorShape(5U, 4U, 3U, 3U, 2U, 4U));
+
+        // index length is 3
+        add_config(TensorShape(4U, 3U, 2U, 2U), TensorShape(4U, 2U), TensorShape(3U, 2U), TensorShape(4U, 3U, 2U, 2U));
+        add_config(TensorShape(17U, 4U, 3U, 2U, 2U), TensorShape(17U, 4U, 4U), TensorShape(3U, 4U), TensorShape(17U, 4U, 3U, 2U, 2U));
+        add_config(TensorShape(10U, 4U, 5U, 3U, 2U, 2U), TensorShape(10U, 4U, 5U, 3U), TensorShape(3U, 3U), TensorShape(10U, 4U, 5U, 3U, 2U, 2U));
+
+        // index length is 4
+        add_config(TensorShape(35U, 4U, 3U, 2U, 2U), TensorShape(35U, 4U), TensorShape(4U, 4U), TensorShape(35U, 4U, 3U, 2U, 2U));
+        add_config(TensorShape(10U, 4U, 5U, 3U, 2U, 2U), TensorShape(10U, 4U, 3U), TensorShape(4U, 3U), TensorShape(10U, 4U, 5U, 3U, 2U, 2U));
+
+        // index length is 5
+        add_config(TensorShape(10U, 4U, 5U, 3U, 2U, 2U), TensorShape(10U, 3U), TensorShape(5U, 3U), TensorShape(10U, 4U, 5U, 3U, 2U, 2U));
+    }
+};
+
+// This dataset represents the (m+k)-D updates tensor, (k+1)-D indices tensor and (m+n)-D output tensor cases.
+class SmallScatterBatchedDataset final : public ScatterDataset
+{
+public:
+    SmallScatterBatchedDataset()
+    {
+        // NOTE: Config is src, updates, indices, output.
+        // NOTE: Updates/Indices tensors are now batched.
+        // NOTE: indices.shape.x = (updates_batched) ? (src.num_dimensions - updates.num_dimensions) + 2 : (src.num_dimensions - updates.num_dimensions) + 1
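+        // e.g. in the last config below: src is 5-D and updates is a batched 2-D tensor, so indices.shape.x = (5 - 2) + 2 = 5.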
+        add_config(TensorShape(6U, 5U), TensorShape(6U, 2U, 2U), TensorShape(1U, 2U, 2U), TensorShape(6U, 5U));
+        add_config(TensorShape(6U, 5U, 2U), TensorShape(6U, 2U, 2U), TensorShape(2U, 2U, 2U), TensorShape(6U, 5U, 2U));
+        add_config(TensorShape(6U, 5U, 2U, 2U), TensorShape(3U, 2U), TensorShape(4U, 3U, 2U), TensorShape(6U, 5U, 2U, 2U));
+        add_config(TensorShape(5U, 5U, 4U, 2U, 2U), TensorShape(6U, 2U), TensorShape(5U, 6U, 2U), TensorShape(5U, 5U, 4U, 2U, 2U));
+    }
+};
 } // namespace datasets
 } // namespace test
 } // namespace arm_compute
diff --git a/tests/validation/CL/ScatterLayer.cpp b/tests/validation/CL/ScatterLayer.cpp
index 9711671..5b1d5af 100644
--- a/tests/validation/CL/ScatterLayer.cpp
+++ b/tests/validation/CL/ScatterLayer.cpp
@@ -111,6 +111,7 @@
 {
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
+
 TEST_SUITE_END() // FP32
 TEST_SUITE_END() // Float
 TEST_SUITE_END() // Scatter
diff --git a/tests/validation/fixtures/ScatterLayerFixture.h b/tests/validation/fixtures/ScatterLayerFixture.h
index 451a1e1..91e28b5 100644
--- a/tests/validation/fixtures/ScatterLayerFixture.h
+++ b/tests/validation/fixtures/ScatterLayerFixture.h
@@ -54,7 +54,7 @@
 
 protected:
     template <typename U>
-    void fill(U &&tensor, int i, float lo = -1.f, float hi = 1.f)
+    void fill(U &&tensor, int i, float lo = -10.f, float hi = 10.f)
     {
         switch(tensor.data_type())
         {
@@ -135,6 +135,22 @@
     {
         // Output Quantization not currently in use - fixture should be extended to support this.
         ARM_COMPUTE_UNUSED(o_qinfo);
+        TensorShape src_shape = a_shape;
+        TensorShape updates_shape = b_shape;
+        TensorShape indices_shape = c_shape;
+
+        // 1. If batched, collapse the batch dims of the updates and indices tensors into a single dim.
+        if(c_shape.num_dimensions() >= 3)
+        {
+            indices_shape = indices_shape.collapsed_from(1);
+            updates_shape = updates_shape.collapsed_from(updates_shape.num_dimensions() - 2); // Collapse the last 2 dims into one
+        }
+
+        // 2. Collapse data dims into a single dim.
+        //    This collapses all src dims into 2 dims: the first holding the data, the second holding the indices we iterate over.
+        src_shape.collapse(updates_shape.num_dimensions() - 1);     // Collapse all data dims into a single dim.
+        src_shape = src_shape.collapsed_from(1);                    // Collapse all index dims into a single dim.
+        updates_shape.collapse(updates_shape.num_dimensions() - 1); // Collapse data dims (all except the last dim, which is the batch dim).
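+        // e.g. for src [9, 3, 4], updates [9, 3, 2], indices [1, 2] (no batch dim), the collapses above
+        // leave src [27, 4] and updates [27, 2], which is what the reference implementation expects.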
 
         // Create reference tensors
         SimpleTensor<T> src{ a_shape, data_type, 1, a_qinfo };
diff --git a/tests/validation/reference/ScatterLayer.cpp b/tests/validation/reference/ScatterLayer.cpp
index 7543b46..283022e 100644
--- a/tests/validation/reference/ScatterLayer.cpp
+++ b/tests/validation/reference/ScatterLayer.cpp
@@ -23,6 +23,7 @@
  */
 #include "ScatterLayer.h"
 #include "tests/validation/Helpers.h"
+#include "arm_compute/core/TensorShape.h"
 
 namespace arm_compute
 {
@@ -64,36 +65,67 @@
 template float reduce_op(const float &current,const float &update,const ScatterFunction func);
 }
 
-// Note : This function currently only supports 1D src, 1D updates, 2D indices, 1D output tensors.
+// NOTE: This function expects collapsed tensors as input.
+// Batch dims for update/indices tensors should be collapsed into a single dim.
+// Data dims should be collapsed into a single dim for both update and src tensors prior to calling this function.
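+// e.g. batched updates of shape [6, 2, 2] with indices [1, 2, 2] arrive here as updates [6, 4] and indices [1, 4].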
 template <typename T>
 SimpleTensor<T> scatter_layer_internal(const SimpleTensor<T> &src, const SimpleTensor<T> &updates, const SimpleTensor<int32_t> &indices, const TensorShape &out_shape, const ScatterInfo &info)
 {
+    // 1. If zero initialization variable is false, copy src data to dst.
     SimpleTensor<T> dst{ out_shape, src.data_type(), 1 };
-
-    // 1. If zero initialization variable is true, fill dst with 0 values. Else copy src data to dst.
-    if(info.zero_initialization)
-    {
-        for (int i = 0; i < src.num_elements(); ++i)
-        {
-            dst[i] = static_cast<T>(0);
-        }
-    }
-    else
+    if(!info.zero_initialization)
     {
         std::copy_n(src.data(), src.num_elements(), dst.data());
     }
 
-    // 2. Get max index of output tensor, then iterate over index tensor.
-    const int x_bound = static_cast<int>(dst.shape().x());
+    // Number of elements in one update slice, i.e. the stride between consecutive values of the dim being iterated over.
+    const unsigned int data_stride = updates.shape().total_size_lower(updates.shape().num_dimensions() - 1);
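+    // e.g. for updates of shape [27, 2], data_stride = 27: one 27-element data slice per index value.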
+    const unsigned int no_output_dims = out_shape.num_dimensions();
 
-
-    for(int i = 0; i < indices.num_elements(); ++i)
+    // Precompute the stride of the output tensor at each dim.
+    std::vector<unsigned int> out_stride_at_idx(no_output_dims);
+    for (unsigned int i = 0; i < no_output_dims; i++)
     {
-        // 3. Check whether index is out of bounds for dst, if not then apply reduce op.
-        const auto index = indices[i];
-        if (index < x_bound && index >= 0) // Note : we ignore negative index values.
+        out_stride_at_idx[i] = out_shape.total_size_lower(i);
+    }
+
+    const unsigned int indices_x_dim = static_cast<unsigned int>(indices.shape()[0]);
+    const unsigned int indices_y_dim = static_cast<unsigned int>(indices.shape()[1]);
+
+    // 2. Iterate over indices tensor y-dim and replace sections of dst tensor with relevant areas of update tensor.
+    for(unsigned int i = 0; i < indices_y_dim; i++)
+    {
+        // NOTE: Currently, indices.shape() == [X, Y, 1, 1], where X is the indices dim and Y is the batch dim.
+        // Starting index for both the update and indices tensors.
+        const unsigned int update_dim_start = i * data_stride;
+        const unsigned int indices_dim_start = i * indices_x_dim;
+        bool out_of_bounds = false;
+        unsigned int out_offset_acc = 0;
+
+        // Iterate over each indices value for the relevant batch and accumulate the offset.
+        for(unsigned int j = 0; j < indices_x_dim; j++)
         {
-            dst[index] = reduce_op(dst[index], updates[i], info.func);
+            // Get the first index value with i * indices_x_dim (iterating through the y-dim/batch idx), then step through the x-dim by adding j.
+            const int index_value = indices[indices_dim_start + j];
+            const unsigned int out_dim = no_output_dims - (j+1);   // Output dim corresponding to the current index value.
+            if(index_value < static_cast<int>(out_shape[out_dim]) && index_value >= 0)
+            {
+                out_offset_acc += (index_value * out_stride_at_idx[out_dim]); // offset accumulation
+            }
+            else
+            {
+                out_of_bounds = true;
+                break;
+            }
+        }
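+        // e.g. with out_shape [27, 4], a single index value of 2 gives out_offset_acc = 2 * 27 = 54.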
+
+        // If not out of bounds, apply the reduce op between the update slice and dst elements [out_offset_acc, out_offset_acc + data_stride).
+        if(!out_of_bounds)
+        {
+            for (unsigned int j = 0; j < data_stride; j++)
+            {
+                dst[out_offset_acc + j] = reduce_op(dst[out_offset_acc + j], updates[update_dim_start + j], info.func);
+            }
         }
     }
     return dst;