COMPMID-872 - Rework NEGEMMConvolutionLayer to use NEGEMM

Change-Id: I55f0018ac7214775ebbca63f58a3bf5c93732fec
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/142632
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index 8f41462..a362a29 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -28,17 +28,13 @@
 
 #include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
-#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMAssemblyBaseKernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
-#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h"
 #include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
 #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
+#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/Tensor.h"
@@ -49,55 +45,47 @@
 {
 class ITensor;
 
-/** Function to reshape and perform 1xW transposition on the weights. This function calls the following kernels:
+/** Function to reshape the weights. This function calls the following kernel:
  * -# @ref NEWeightsReshapeKernel
- * -# @ref NEGEMMTranspose1xWKernel (executed in case GEMM is required for the operation)
  */
 class NEConvolutionLayerReshapeWeights : public IFunction
 {
 public:
     /** Constructor */
-    NEConvolutionLayerReshapeWeights(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    NEConvolutionLayerReshapeWeights();
     /** Set the input and output tensors.
      *
-     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/F32.
-     * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
-     * @param[out] output       Destination tensor. Data types supported: Same as @p weights.
-     * @param[in]  transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
-     *                          Data types supported: Same as @p weights.
+     * @param[in]  weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/F16/F32.
+     * @param[in]  biases  Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+     * @param[out] output  Destination tensor. Data types supported: Same as @p weights.
      */
-    void configure(const ITensor *weights, const ITensor *biases, ITensor *output, bool transpose1xW);
+    void configure(const ITensor *weights, const ITensor *biases, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights
      *
-     * @param[in] weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/F16/F32.
-     * @param[in] biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
-     * @param[in] output       Destination tensor. Data types supported: Same as @p weights.
-     * @param[in] transpose1xW True if the weights are to undergo a 1xW transposition after reshaping (in case of GEMM operation), false otherwise.
-     *                         Data types supported: Same as @p weights.
+     * @param[in] weights Weights tensor. Weights are a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: QASYMM8/F16/F32.
+     * @param[in] biases  Biases tensor. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
+     * @param[in] output  Destination tensor. Data types supported: Same as @p weights.
      *
      * @return an error status
      */
-    static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, bool transpose1xW);
+    static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
-    MemoryGroup              _memory_group;
-    NEWeightsReshapeKernel   _weights_reshape_kernel;
-    NEGEMMTranspose1xWKernel _weights_transposed_kernel;
-    Tensor                   _weights_reshaped;
-    bool                     _transpose1xW;
+    NEWeightsReshapeKernel _weights_reshape_kernel;
 };
 
-/** Basic function to simulate a convolution layer. This function calls the following NEON kernels:
- * -# @ref NEWeightsReshapeKernel   (executed only once for each configuration)
+/** Basic function to compute the convolution layer. This function calls the following NEON kernels/functions:
+ *
  * -# @ref NEIm2ColKernel
- * -# @ref NEGEMMInterleave4x4Kernel (executed only in case GEMM is required for the operation)
- * -# @ref NEGEMMMatrixMultiplyKernel or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
- * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8Scale (if quantized asymmetric)
+ * -# @ref NEGEMM (if the data type is FP32 or FP16)
+ * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8)
+ * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint (if the data type is QASYMM8)
+ * -# @ref NEArithmeticAdditionKernel (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout)
  * -# @ref NECol2ImKernel
- * -# @ref NEActivationLayer (executed only if the activation layer is enabled)
+ *
  */
 class NEGEMMConvolutionLayer : public IFunction
 {
@@ -158,45 +146,52 @@
 private:
     /** Configures the appropriate matrix multiply routine
      *
-     * @param[in]  input          Input tensor. Data types supported: QASYMM8/F16/F32.
-     * @param[in]  weights        Weights tensor. Data type supported: Same as @p input.
-     * @param[out] output         Output tensor. Data types supported: Same as @p input,
-     *                            except for input of QASYMM8 type where output should be of S32 type.
-     * @param[in]  is_interleaved (Optional) True if input0 and input1 have been reshaped respectively using @ref CLGEMMInterleave4x4Kernel and @ref CLGEMMTranspose1xWKernel
-     * @param[in]  reshape_info   (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+     * @param[in]  input         Input tensor. Data types supported: QASYMM8/F16/F32.
+     * @param[in]  weights       Weights tensor. Data type supported: Same as @p input.
+     * @param[out] output        Output tensor. Data types supported: Same as @p input,
+     *                           except for input of QASYMM8 type where output should be of S32 type.
+     * @param[in]  gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
      */
-    void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
+    void configure_mm(const ITensor *input, const ITensor *weights, ITensor *output, int gemm_3d_depth = 1);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
+     *
+     * @param[in] input         Input tensor. Data types supported: QASYMM8/F16/F32.
+     * @param[in] weights       Weights tensor. Data type supported: Same as @p input.
+     * @param[in] output        Output tensor. Data types supported: Same as @p input,
+     *                          except for input of QASYMM8 type where output should be of S32 type.
+     * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
+     * @param[in] skip_im2col   (Optional) Flag which specifies if im2col has to be skipped, i.e. for a 1x1 convolution with the NHWC data layout. (Defaults to false)
+     *
+     * @return a status
+     */
+    static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, int gemm_3d_depth = 1, bool skip_im2col = false);
 
 private:
     MemoryGroup                                         _memory_group;
-    NEGEMMAssemblyDispatch                              _asm_glue;
-    NEIm2ColKernel                                      _input_im2col_kernel;
-    NEGEMMInterleave4x4Kernel                           _input_interleave_kernel;
     NEConvolutionLayerReshapeWeights                    _reshape_weights;
-    NEGEMMMatrixMultiplyKernel                          _mm_kernel;
+    NEIm2ColKernel                                      _im2col_kernel;
+    NEGEMM                                              _mm_gemm;
     NEGEMMLowpMatrixMultiplyCore                        _mm_gemmlowp;
     NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint _gemmlowp_output_stage;
-    NECol2ImKernel                                      _output_col2im_kernel;
+    NECol2ImKernel                                      _col2im_kernel;
     NEActivationLayer                                   _activationlayer_function;
     NEArithmeticAdditionKernel                          _add_bias_kernel;
 
     const ITensor *_original_weights;
 
-    Tensor _input_im2col_reshaped;
-    Tensor _input_interleaved_reshaped;
+    Tensor _im2col_output;
     Tensor _weights_reshaped;
     Tensor _gemm_output;
     Tensor _tmp_output;
 
     DataLayout _data_layout;
-    bool       _append_bias;
-    bool       _is_fully_connected_convolution;
-    bool       _are_weights_reshaped;
-    bool       _is_quantized;
-    bool       _is_interleaved;
-    bool       _is_activationlayer_enabled;
-    bool       _skip_im2col;
-    bool       _is_prepared;
+
+    bool _append_bias;
+    bool _skip_im2col;
+    bool _skip_col2im;
+    bool _is_quantized;
+    bool _is_activationlayer_enabled;
+    bool _is_prepared;
 };
 }
 #endif /* __ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H__ */
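
A minimal usage sketch of the reworked function, assuming the public configure()/run() interface of NEGEMMConvolutionLayer (input, weights, biases, output, PadStrideInfo) is unchanged by this patch; the tensor shapes, padding and data type below are illustrative only:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Illustrative shapes: 3x3 kernel, 16 input channels, 32 output channels, 64x64 FP32 input.
    Tensor src, weights, biases, dst;
    src.allocator()->init(TensorInfo(TensorShape(64U, 64U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1, DataType::F32));
    biases.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(64U, 64U, 32U), 1, DataType::F32));

    // Stride 1, padding 1 keeps the output spatial dimensions equal to the input's.
    const PadStrideInfo conv_info(1, 1, 1, 1);

    NEGEMMConvolutionLayer conv;
    conv.configure(&src, &weights, &biases, &dst, conv_info);

    src.allocator()->allocate();
    weights.allocator()->allocate();
    biases.allocator()->allocate();
    dst.allocator()->allocate();

    // ... fill src, weights and biases with real data here ...

    // With this patch, the matrix multiplication for FP32/FP16 inputs is dispatched to NEGEMM,
    // while QASYMM8 inputs go through NEGEMMLowpMatrixMultiplyCore plus the quantized output stage.
    conv.run();
    return 0;
}

Only the internal dispatch and member kernels change here; as sketched above, existing callers of the function should not need to be updated.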