COMPMID-1498 - Enable grouping in CLGEMMConvolutionLayer

Change-Id: I15c7df21773145b03f42b6f78bd7ad2e5b8a5219
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/144126
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
index 5c047ca..2a18ae0 100644
--- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h
+++ b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
@@ -63,12 +63,11 @@
     CLCol2ImKernel &operator=(CLCol2ImKernel &&) = default;
     /** Default destructor */
     ~CLCol2ImKernel() = default;
-
     /** Set the input and output of the kernel.
      *
      * @param[in]  input          The input tensor to convert. Data types supported: QASYMM8/F16/F32
      * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                            while the rest represent batch of outputs. Data types supported: Same as @p input
+     *                            while the rest represent a batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
      * @param[in]  convolved_dims Output convolved dimensions.
      * @param[in]  num_groups     (Optional) Number of groups when performing a grouped convolution
      */
@@ -77,7 +76,7 @@
      *
      * @param[in] input          The input tensor to convert. Data types supported: QASYMM8/F16/F32
      * @param[in] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
-     *                           while the rest represent batch of outputs. Data types supported: Same as @p input
+     *                           while the rest represent a batch of outputs. Data types supported: Same as @p input. Data layout: NCHW
      * @param[in] convolved_dims Output convolved dimensions.
      * @param[in] num_groups     (Optional) Number of groups when performing a grouped convolution
      *
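For background on why the documentation above pins this kernel to NCHW: col2im scatters the 2D GEMM result back into a [width, height, OFM] volume, and this patch only wires grouping up for that layout. Below is a minimal, library-independent sketch of the mapping; the assumed matrix orientation (one spatial position per row, one output channel per column) is for illustration only and is not necessarily the kernel's internal layout.

```cpp
#include <cstddef>
#include <vector>

// Illustrative col2im for NCHW: scatter a [conv_w * conv_h, ofm] GEMM result
// back into a [conv_w, conv_h, ofm] volume stored channel-major.
void col2im_nchw(const std::vector<float> &gemm_out, std::vector<float> &dst,
                 std::size_t conv_w, std::size_t conv_h, std::size_t ofm)
{
    for(std::size_t c = 0; c < ofm; ++c)
    {
        for(std::size_t y = 0; y < conv_h; ++y)
        {
            for(std::size_t x = 0; x < conv_w; ++x)
            {
                const std::size_t patch = y * conv_w + x; // assumed row index in the GEMM output
                dst[(c * conv_h + y) * conv_w + x] = gemm_out[patch * ofm + c];
            }
        }
    }
}
```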
diff --git a/arm_compute/core/CL/kernels/CLIm2ColKernel.h b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
index c678f27..0647f5d 100644
--- a/arm_compute/core/CL/kernels/CLIm2ColKernel.h
+++ b/arm_compute/core/CL/kernels/CLIm2ColKernel.h
@@ -76,7 +76,7 @@
      * @param[in]  conv_info   Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in]  has_bias    If biases are provided, expands the matrix with an extra 1.
      * @param[in]  dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  num_groups  (Optional) Number of groups when performing a grouped convolution
+     * @param[in]  num_groups  (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout.
      */
     void configure(const ICLTensor *input, ICLTensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
                    unsigned int num_groups = 1);
@@ -90,7 +90,7 @@
      * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in] has_bias    If biases are provided, expands the matrix with an extra 1.
      * @param[in] dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] num_groups  (Optional) Number of groups when performing a grouped convolution
+     * @param[in] num_groups  (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout.
      *
      * @return a status
      */
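As a sketch of how the grouped overload documented above might be exercised at the info level: the shapes below (32x32x8 NCHW input, 3x3 kernel, stride 1, pad 1, 2 groups) and the assumed grouped im2col output layout [kernel_area * IFM / num_groups, num_patches, num_groups] are illustrative guesses, not values taken from the library's tests; a mismatch would simply surface as an error status.

```cpp
#include "arm_compute/core/CL/kernels/CLIm2ColKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"

using namespace arm_compute;

bool grouped_im2col_is_valid()
{
    // Hypothetical NCHW input: 32x32 spatial, 8 channels, split into 2 groups.
    const TensorInfo input(TensorShape(32U, 32U, 8U), 1, DataType::F32);
    // Assumed grouped output layout: [3 * 3 * (8 / 2), 32 * 32, 2].
    const TensorInfo output(TensorShape(36U, 1024U, 2U), 1, DataType::F32);

    const Status s = CLIm2ColKernel::validate(&input, &output,
                                              Size2D(3U, 3U),
                                              PadStrideInfo(1, 1, 1, 1), // stride 1, pad 1
                                              false /* has_bias */,
                                              Size2D(1U, 1U),
                                              2 /* num_groups */);
    return s.error_code() == ErrorCode::OK;
}
```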
diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
index d3bbbaf..bdc5792 100644
--- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
+++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
@@ -75,10 +75,10 @@
      *                        @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
      * @param[out] output     The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
      *                        Data types supported: Same as @p input
-     * @param[in]  num_groups (Optional) Number of groups when performing a grouped convolution.
+     * @param[in]  num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout.
      *                        The number of output feature maps (OFM) must be a multiple of num_groups.
      */
-    void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, const unsigned int num_groups = 1);
+    void configure(const ICLTensor *input, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel
      *
      * @param[in] input      The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
@@ -88,12 +88,12 @@
      *                       @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
      * @param[in] output     The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
      *                       Data types supported: Same as @p input
-     * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution.
+     * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout.
      *                       The number of output feature maps (OFM) must be a multiple of num_groups.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, const unsigned int num_groups = 1);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
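Rather than hard-coding the reshaped destination shape, a caller can derive it with the shape calculator updated later in this patch and feed it straight into validate(). The weight shape here (3x3 kernels, IFM / num_groups = 4, OFM = 16, 2 groups) is hypothetical:

```cpp
#include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

using namespace arm_compute;

bool grouped_weights_reshape_is_valid()
{
    const TensorInfo  weights(TensorShape(3U, 3U, 4U, 16U), 1, DataType::F32);
    // Derive the expected destination shape instead of spelling it out by hand.
    const TensorShape dst_shape = misc::shape_calculator::compute_weights_reshaped_shape(weights, false /* has_bias */, 2 /* num_groups */);
    const TensorInfo  output(dst_shape, 1, DataType::F32);

    const Status s = CLWeightsReshapeKernel::validate(&weights, nullptr /* no biases */, &output, 2);
    return s.error_code() == ErrorCode::OK;
}
```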
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
index 37145a3..38bdff0 100644
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -84,7 +84,7 @@
      * @param[in]  conv_info          Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in]  has_bias           If biases are provided, expands the matrix with an extra 1.
      * @param[in]  dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  num_groups         (Optional) Number of groups when performing a grouped convolution
+     * @param[in]  num_groups         (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported.
      * @param[in]  is_fully_connected (Optional) Determines whether this kernel will be called by @ref NEFullyConnectedLayer in order to validate the arguments
      * @param[in]  is_flatten         (Optional) Determines whether this kernel will be called by @ref NEFlattenLayer in order to validate the arguments
      */
@@ -100,7 +100,7 @@
      * @param[in] conv_info          Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in] has_bias           If biases are provided, expands the matrix with an extra 1.
      * @param[in] dilation           (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] num_groups         (Optional) Number of groups when performing a grouped convolution
+     * @param[in] num_groups         (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported.
      * @param[in] is_fully_connected (Optional) Determines whether this kernel will be called by @ref NEFullyConnectedLayer in order to validate the arguments
      * @param[in] is_flatten         (Optional) Determines whether this kernel will be called by @ref NEFlattenLayer in order to validate the arguments
      *
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 8a00c22..c40e711 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -55,19 +55,20 @@
     permute(output_shape, perm);
     return output_shape;
 }
-inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, const unsigned int num_groups = 1)
+inline TensorShape compute_weights_reshaped_shape(const ITensorInfo &weights, bool has_bias = false, unsigned int num_groups = 1)
 {
     // A number of groups greater than one is only supported for the NCHW data layout, and OFM must be a multiple of num_groups.
-
     ARM_COMPUTE_ERROR_ON(num_groups == 0);
-    ARM_COMPUTE_ERROR_ON((weights.dimension(3) % num_groups) != 0);
     ARM_COMPUTE_ERROR_ON(weights.data_layout() == DataLayout::NHWC && num_groups > 1);
+    ARM_COMPUTE_ERROR_ON((weights.dimension(3) % num_groups) != 0);
 
     // Calculate output shape
     TensorShape weights_reshaped{ weights.tensor_shape() };
+    weights_reshaped.set(3, weights_reshaped[3] / num_groups);
+
     weights_reshaped.collapse(3);
     const size_t tmp_dim = weights_reshaped[0];
-    weights_reshaped.set(0, weights_reshaped[1] / num_groups);
+    weights_reshaped.set(0, weights_reshaped[1]);
     weights_reshaped.set(1, tmp_dim + (has_bias ? 1 : 0));
     if(weights.num_dimensions() < 5)
     {
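Worked through on concrete (hypothetical) numbers, the reordered arithmetic above now divides OFM before collapsing; the final group dimension assumes the truncated branch below sets dimension 2 to num_groups for weights of fewer than five dimensions:

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

using namespace arm_compute;

int main()
{
    // Grouped weights are stored as [kernel_x, kernel_y, IFM / num_groups, OFM]:
    // 3x3 kernels, 8 input channels in 2 groups (4 per group), 16 output maps.
    const TensorInfo weights(TensorShape(3U, 3U, 4U, 16U), 1, DataType::F32);

    const TensorShape reshaped = misc::shape_calculator::compute_weights_reshaped_shape(weights, false /* has_bias */, 2 /* num_groups */);

    // Step by step: set(3, 16 / 2) -> [3, 3, 4, 8]; collapse(3) -> [36, 8];
    // the two set() calls -> [8, 36]; one plane per group -> [8, 36, 2].
    return (reshaped[0] == 8 && reshaped[1] == 36 && reshaped[2] == 2) ? 0 : 1;
}
```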
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index c9a74f2..8270e97 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -60,10 +60,11 @@
      * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
      * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
-     *                                available which may introduce a drop of accuracy as well. Default is false
+     *                              available, which may also introduce a drop in accuracy. Default is false.
+     * @param[in]  num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout.
      */
     void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayer
      *
      * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -79,11 +80,13 @@
      * @param[in] act_info         (Optional) Activation layer information in case of a fused activation.
      * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
      *                             available, which may also introduce a drop in accuracy. Default is false.
+     * @param[in] num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
+                           unsigned int num_groups = 1);
     /** Static function to check if given info will return the convolution called by @ref CLConvolutionLayer
      *
      * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
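A minimal sketch of the widened validate() entry point, reusing the hypothetical grouped shapes from the kernels above; whether these exact shapes pass is up to the library's internal checks:

```cpp
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"

using namespace arm_compute;

bool grouped_cl_conv_is_valid()
{
    const TensorInfo src(TensorShape(32U, 32U, 8U), 1, DataType::F32);
    const TensorInfo weights(TensorShape(3U, 3U, 4U, 16U), 1, DataType::F32); // [kx, ky, IFM / groups, OFM]
    const TensorInfo dst(TensorShape(32U, 32U, 16U), 1, DataType::F32);

    const Status s = CLConvolutionLayer::validate(&src, &weights, nullptr /* no biases */, &dst,
                                                  PadStrideInfo(1, 1, 1, 1),
                                                  WeightsInfo(),
                                                  Size2D(1U, 1U),
                                                  ActivationLayerInfo(),
                                                  false /* enable_fast_math */,
                                                  2 /* num_groups */);
    return s.error_code() == ErrorCode::OK;
}
```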
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 8538d83..c6ae535 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -59,22 +59,24 @@
     CLConvolutionLayerReshapeWeights();
     /** Set the input and output tensors.
      *
-     * @param[in]  weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                     Data type supported: QASYMM8/F16/F32.
-     * @param[in]  biases  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
-     * @param[out] output  Destination tensor. Data types supported: Same as @p weights.
+     * @param[in]  weights    Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                        Data type supported: QASYMM8/F16/F32.
+     * @param[in]  biases     Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+     * @param[out] output     Destination tensor. Data types supported: Same as @p weights.
+     * @param[in]  num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout.
      */
-    void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output);
+    void configure(const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, unsigned int num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayerReshapeWeights
      *
-     * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                    Data type supported: QASYMM8/F16/F32.
-     * @param[in] biases  Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
-     * @param[in] output  Destination tensor. Data types supported: Same as @p weights.
+     * @param[in] weights    Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                       Data type supported: QASYMM8/F16/F32.
+     * @param[in] biases     Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+     * @param[in] output     Destination tensor. Data types supported: Same as @p weights.
+     * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout.
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups = 1);
     // Inherited methods overridden:
     void run() override;
 
@@ -122,9 +124,10 @@
      *                          tensor has also been transposed with CLGEMMTranspose1xWKernel. Data type supported: Same as @p input.
      * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation.
+     * @param[in]  num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout.
      */
     void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo());
+                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer.
      *
      * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -140,11 +143,12 @@
      *                          tensor has also been transposed with CLGEMMTranspose1xWKernel. Data type supported: Same as @p input.
      * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation.
+     * @param[in]  num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the NCHW data layout.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo());
+                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
 
     // Inherited methods overridden:
     void run() override;
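Putting the new parameter together at the function level: a minimal configure/run sketch with hypothetical shapes. No data is filled in, so this only exercises configuration and kernel dispatch, not numerical results:

```cpp
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // Hypothetical NCHW shapes: 32x32x8 input, 3x3 kernels in 2 groups, 16 output maps.
    CLTensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 4U, 16U), 1, DataType::F32)); // [kx, ky, IFM / groups, OFM]
    biases.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::F32));

    CLGEMMConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst,
                   PadStrideInfo(1, 1, 1, 1),
                   WeightsInfo(), Size2D(1U, 1U), ActivationLayerInfo(),
                   2 /* num_groups */);

    for(auto *t : { &src, &weights, &biases, &dst })
    {
        t->allocator()->allocate();
    }
    conv.run();
    CLScheduler::get().sync();
    return 0;
}
```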
diff --git a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
index 421150e..4fac95e 100644
--- a/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
+++ b/arm_compute/runtime/GLES_COMPUTE/functions/GCConvolutionLayer.h
@@ -101,29 +101,10 @@
      *                          tensor has also been transposed with GCGEMMTranspose1xWKernel. Data type supported: Same as @p input.
      * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation.
+     * @param[in]  num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported.
      */
     void configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *biases, IGCTensor *output, const PadStrideInfo &conv_info,
-                   const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo());
-    /** Static function to check if given info will lead to a valid configuration of @ref GCConvolutionLayer.
-     *
-     * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
-     *                          while every optional dimension from 4 and above represent a batch of inputs.
-     *                          Data types supported: QASYMM8/F16/F32.
-     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
-     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
-     *                          Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
-     * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
-     *                          Data types supported: Same as @p input.
-     * @param[in]  conv_info    Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in]  weights_info Specifies if the weights tensor has been reshaped with GCWeightsReshapeKernel. If this is not part of the fully connected layer the weights
-     *                          tensor has also been transposed with GCGEMMTranspose1xWKernel. Data type supported: Same as @p input.
-     * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation.
-     *
-     * @return a status
-     */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo());
+                   const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
 
     // Inherited methods overridden:
     void run() override;
diff --git a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
index c4226cb..5b53aec 100644
--- a/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEConvolutionLayer.h
@@ -64,10 +64,11 @@
      * @param[in]  dilation         (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
      * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
-     *                                available which may introduce a drop of accuracy as well. Default is false
+     *                              available, which may also introduce a drop in accuracy. Default is false.
+     * @param[in]  num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported.
      */
     void configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayer
      *
      * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -85,11 +86,13 @@
      * @param[in] act_info         (Optional) Activation layer information in case of a fused activation.
      * @param[in] enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
      *                             available, which may also introduce a drop in accuracy. Default is false.
+     * @param[in] num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
+                           unsigned int num_groups = 1);
     /** Static function to check if given info will return the convolution called by @ref NEConvolutionLayer
      *
      * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index e587cb4..dc00b17 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -116,9 +116,10 @@
      *                          tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
      * @param[in]  dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  act_info     (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in]  num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported.
      */
     void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo());
+                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer
      *
      * @param[in] input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
@@ -134,11 +135,12 @@
      *                         tensor has also been transposed with NEGEMMTranspose1xWKernel. Data type supported: Same as @p input.
      * @param[in] dilation     (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in] act_info     (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+     * @param[in] num_groups   (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported.
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo());
+                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
 
     // Inherited methods overridden:
     void run() override;
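Since the NEON path above documents num_groups != 1 as unsupported, the new parameter is accepted syntactically but grouped configurations should be rejected at validation time. A hedged sketch with hypothetical shapes:

```cpp
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"

using namespace arm_compute;

int main()
{
    const TensorInfo src(TensorShape(32U, 32U, 8U), 1, DataType::F32);
    const TensorInfo weights(TensorShape(3U, 3U, 4U, 16U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(32U, 32U, 16U), 1, DataType::F32);

    const Status s = NEGEMMConvolutionLayer::validate(&src, &weights, nullptr, &dst,
                                                      PadStrideInfo(1, 1, 1, 1),
                                                      WeightsInfo(), Size2D(1U, 1U),
                                                      ActivationLayerInfo(),
                                                      2 /* num_groups */);
    // Expected to fail on NEON for num_groups != 1; the exact error text is the library's.
    return s.error_code() == ErrorCode::OK ? 1 : 0;
}
```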