COMPMID-1342 Add grouping support to CLIm2ColKernel

Change-Id: I4afb19751520a90fee27fb49b775cd10e92a94f5
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/140476
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Jenkins <bsgcomp@arm.com>
diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h
index 4a6d778..a1ca983 100644
--- a/tests/datasets/ShapeDatasets.h
+++ b/tests/datasets/ShapeDatasets.h
@@ -635,6 +635,39 @@
     }
 };
 
+/** Data set containing small grouped im2col tensor shapes. */
+class GroupedIm2ColSmallShapes final : public ShapeDataset
+{
+public:
+    GroupedIm2ColSmallShapes()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 11U, 11U, 48U },
+                     TensorShape{ 27U, 13U, 24U },
+                     TensorShape{ 128U, 64U, 12U, 3U },
+                     TensorShape{ 11U, 11U, 48U, 4U },
+                     TensorShape{ 27U, 13U, 24U, 4U },
+                     TensorShape{ 11U, 11U, 48U, 5U }
+    })
+    {
+    }
+};
+
+/** Data set containing large grouped im2col tensor shapes. */
+class GroupedIm2ColLargeShapes final : public ShapeDataset
+{
+public:
+    GroupedIm2ColLargeShapes()
+        : ShapeDataset("Shape",
+    {
+        TensorShape{ 1921U, 1083U, 12U },
+                     TensorShape{ 641U, 485U, 24U, 3U },
+                     TensorShape{ 799U, 595U, 12U, 4U },
+    })
+    {
+    }
+};
+
 /** Data set containing small grouped weights tensor shapes. */
 class GroupedWeightsSmallShapes final : public ShapeDataset
 {
diff --git a/tests/validation/CL/Im2Col.cpp b/tests/validation/CL/Im2Col.cpp
index e8019cc..cf7c79a 100644
--- a/tests/validation/CL/Im2Col.cpp
+++ b/tests/validation/CL/Im2Col.cpp
@@ -53,9 +53,14 @@
 const auto padstrides        = framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U),
                                                                        PadStrideInfo(1U, 1U, 1U, 1U),
                                                                        PadStrideInfo(2U, 2U, 0U, 2U) });
-const auto conv_args         = combine(combine(combine(conv_filter_sizes, padstrides),
-                                               framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
+const auto conv_args         = combine(combine(combine(combine(conv_filter_sizes, padstrides),
+                                                       framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
+                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                                       framework::dataset::make("NumGroups", { 1 }));
+const auto grouped_args      = combine(combine(combine(combine(conv_filter_sizes, padstrides),
+                                                       framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
+                                                       framework::dataset::make("DataLayout", { DataLayout::NCHW })),
+                                                       framework::dataset::make("NumGroups", { 2, 3, 4 }));
 
 } // namespace
 TEST_SUITE(CL)
@@ -96,7 +101,6 @@
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-TEST_SUITE_END()
 
 FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)),
                                                                                                           conv_args),
@@ -105,8 +109,7 @@
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+TEST_SUITE_END()
 
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::SmallShapes(), framework::dataset::make("DataType", DataType::F16)),
@@ -124,9 +127,6 @@
     validate(CLAccessor(_target), _reference);
 }
 TEST_SUITE_END()
-
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-
 TEST_SUITE_END()
 
 TEST_SUITE(QASYMM8)
@@ -146,6 +146,68 @@
 }
 TEST_SUITE_END()
 
+TEST_SUITE(Grouped)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<float>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType",
+                                                                                                              DataType::F32)),
+                                                                                                      grouped_args),
+                                                                                              framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedIm2ColLargeShapes(), framework::dataset::make("DataType",
+                                                                                                                  DataType::F32)),
+                                                                                                          grouped_args),
+                                                                                                  framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(FP16)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType",
+                                                                                                             DataType::F16)),
+                                                                                                     grouped_args),
+                                                                                             framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedIm2ColLargeShapes(), framework::dataset::make("DataType",
+                                                                                                                 DataType::F16)),
+                                                                                                         grouped_args),
+                                                                                                 framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+
+TEST_SUITE(QASYMM8)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLIm2ColFixture<uint8_t>, framework::DatasetMode::ALL, combine(combine(combine(datasets::GroupedIm2ColSmallShapes(), framework::dataset::make("DataType",
+                                                                                                                DataType::QASYMM8)),
+                                                                                                        grouped_args),
+                                                                                                framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(combine(datasets::GroupedIm2ColLargeShapes(), framework::dataset::make("DataType",
+                                                                                                                    DataType::QASYMM8)),
+                                                                                                            grouped_args),
+                                                                                                    framework::dataset::make("ChannelsFirstOutputNHWC", true)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference);
+}
+TEST_SUITE_END()
+TEST_SUITE_END()
+
 TEST_SUITE_END()
 TEST_SUITE_END()
 } // namespace validation
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index f011ebe..0ea68bf 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -40,9 +40,10 @@
 namespace
 {
 const auto conv_filter_sizes = framework::dataset::make("KernelDims", { Size2D(3U, 3U), Size2D(3U, 1U), Size2D(1U, 5U), Size2D(5U, 5U), Size2D(7U, 7U) });
-const auto conv_args         = combine(combine(combine(conv_filter_sizes, framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) })),
-                                               framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
-                                       framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }));
+const auto conv_args         = combine(combine(combine(combine(conv_filter_sizes, framework::dataset::make("PadStride", { PadStrideInfo(1U, 1U, 0U, 0U), PadStrideInfo(1U, 1U, 1U, 1U), PadStrideInfo(2U, 2U, 0U, 2U) })),
+                                                       framework::dataset::make("QuantizationInfo", QuantizationInfo(0.5f, 10))),
+                                               framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
+                                       framework::dataset::make("NumGroups", { 1 }));
 } // namespace
 TEST_SUITE(NEON)
 TEST_SUITE(Im2Col)
@@ -66,7 +67,7 @@
                framework::dataset::make("Expected", { false, false, false, false, true })),
                input_info, output_info, has_bias, expected)
 {
-    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias, false, false));
+    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
diff --git a/tests/validation/fixtures/Im2ColFixture.h b/tests/validation/fixtures/Im2ColFixture.h
index da2576b..b5e83a9 100644
--- a/tests/validation/fixtures/Im2ColFixture.h
+++ b/tests/validation/fixtures/Im2ColFixture.h
@@ -50,13 +50,14 @@
 public:
     template <typename...>
     void setup(TensorShape input_shape, DataType data_type, const Size2D &kernel_dims, const PadStrideInfo &conv_info, const QuantizationInfo &quant_info, const DataLayout &data_layout,
-               bool channels_first_output_nhwc)
+               unsigned int num_groups, bool channels_first_output_nhwc)
     {
         _kernel_dims = kernel_dims;
         _conv_info   = conv_info;
         _quant_info  = quant_info;
         _data_layout = data_layout;
         _has_bias    = data_type != DataType::QASYMM8;
+        _num_groups  = num_groups;
 
         if(_data_layout == DataLayout::NHWC)
         {
@@ -66,7 +67,7 @@
         TensorInfo input_info(input_shape, 1, data_type);
         input_info.set_data_layout(_data_layout);
 
-        const TensorShape output_shape = compute_im2col_conv_shape(&input_info, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), batch_size_on_z);
+        const TensorShape output_shape = compute_im2col_conv_shape(&input_info, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), batch_size_on_z && _num_groups == 1, _num_groups);
         _target                        = compute_target(input_shape, output_shape, data_type);
 
         compute_reference(input_shape, output_shape, data_type, channels_first_output_nhwc);
@@ -87,7 +88,7 @@
 
         // Create and configure function
         FunctionType im2col_func;
-        im2col_func.configure(&src, &dst, _kernel_dims, _conv_info, _has_bias);
+        im2col_func.configure(&src, &dst, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), _num_groups);
 
         ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
@@ -117,7 +118,7 @@
         // Fill reference
         fill(src);
 
-        reference::im2col<T>(src, _reference, _kernel_dims, _conv_info, _has_bias, channels_first_output_nhwc);
+        reference::im2col<T>(src, _reference, _kernel_dims, _conv_info, _has_bias, _num_groups, channels_first_output_nhwc);
     }
     TensorType       _target{};
     SimpleTensor<T>  _reference{};
@@ -126,6 +127,7 @@
     DataLayout       _data_layout{};
     QuantizationInfo _quant_info{};
     bool             _has_bias{};
+    unsigned int     _num_groups{};
 };
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Im2Col.cpp b/tests/validation/reference/Im2Col.cpp
index 2459499..0c41d88 100644
--- a/tests/validation/reference/Im2Col.cpp
+++ b/tests/validation/reference/Im2Col.cpp
@@ -36,7 +36,7 @@
 namespace reference
 {
 template <typename T>
-void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias)
+void im2col_nchw(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON(src.data_layout() != DataLayout::NCHW);
     const int stride_x      = conv_info.stride().first;
@@ -58,26 +58,32 @@
 
     for(int b = 0; b < batches; ++b)
     {
-        for(int yo = 0; yo < dst_height; ++yo)
+        for(int g = 0; g < static_cast<int>(num_groups); ++g)
         {
-            // Compute input spatial coordinates
-            const int xi = (yo % convolved_dims.first) * stride_x;
-            const int yi = (yo / convolved_dims.first) * stride_y;
+            const int first_group_ch = g * (src_channels / num_groups);
+            const int last_group_ch  = (g + 1) * (src_channels / num_groups);
 
-            for(int ci = 0; ci < src_channels; ++ci)
+            for(int yo = 0; yo < dst_height; ++yo)
             {
-                for(int yk = 0; yk < kernel_height; ++yk)
+                // Compute input spatial coordinates
+                const int xi = (yo % convolved_dims.first) * stride_x;
+                const int yi = (yo / convolved_dims.first) * stride_y;
+
+                for(int ci = first_group_ch; ci < last_group_ch; ++ci)
                 {
-                    for(int xk = 0; xk < kernel_width; ++xk)
+                    for(int yk = 0; yk < kernel_height; ++yk)
                     {
-                        dst[dst_idx++] = tensor_elem_at(src, Coordinates(xi + xk - pad_x, yi + yk - pad_y, ci, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
+                        for(int xk = 0; xk < kernel_width; ++xk)
+                        {
+                            dst[dst_idx++] = tensor_elem_at(src, Coordinates(xi + xk - pad_x, yi + yk - pad_y, ci, b), BorderMode::CONSTANT, static_cast<T>(pad_val));
+                        }
                     }
                 }
-            }
 
-            if(has_bias)
-            {
-                dst[dst_idx++] = static_cast<T>(1);
+                if(has_bias)
+                {
+                    dst[dst_idx++] = static_cast<T>(1);
+                }
             }
         }
     }
@@ -179,13 +185,13 @@
 }
 
 template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc)
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const unsigned int num_groups, bool channels_first_output_nhwc)
 {
     switch(src.data_layout())
     {
         case DataLayout::NCHW:
         {
-            im2col_nchw(src, dst, kernel_dims, conv_info, has_bias);
+            im2col_nchw(src, dst, kernel_dims, conv_info, has_bias, num_groups);
             break;
         }
         case DataLayout::NHWC:
@@ -208,9 +214,12 @@
     }
 }
 
-template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
-template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
-template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<uint8_t> &src, SimpleTensor<uint8_t> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+                     bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<half> &src, SimpleTensor<half> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+                     bool channels_first_output_nhwc);
+template void im2col(const SimpleTensor<float> &src, SimpleTensor<float> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, unsigned int num_groups,
+                     bool channels_first_output_nhwc);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/reference/Im2Col.h b/tests/validation/reference/Im2Col.h
index b1ebaf2..84ee237 100644
--- a/tests/validation/reference/Im2Col.h
+++ b/tests/validation/reference/Im2Col.h
@@ -35,7 +35,8 @@
 namespace reference
 {
 template <typename T>
-void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool channels_first_output_nhwc = false);
+void im2col(const SimpleTensor<T> &src, SimpleTensor<T> &dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const unsigned int num_groups,
+            bool channels_first_output_nhwc = false);
 } // namespace reference
 } // namespace validation
 } // namespace test