Port NEGEMMConv2d to memory-injecting interface

Resolves: COMPMID-4506, COMPMID-4570

Change-Id: I6d37a06da141f1fcfcaa8525322a319cb0234791
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5824
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index dc9783f..ff88876 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -24,32 +24,15 @@
 #ifndef ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
 #define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
 
-#include "NEActivationLayer.h"
-#include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
-#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/Tensor.h"
 
 #include <memory>
 
 namespace arm_compute
 {
 class ITensor;
-class NEConvertQuantizedSignednessKernel;
-class NEGEMMInterleave4x4Kernel;
-class NEGEMMLowpMatrixMultiplyKernel;
-class NEGEMMLowpOffsetContributionKernel;
-class NEGEMMLowpOffsetContributionOutputStageKernel;
-class NEGEMMLowpMatrixAReductionKernel;
-class NEGEMMLowpMatrixBReductionKernel;
-class NEGEMMTranspose1xWKernel;
-namespace cpu
-{
-class CpuGemmAssemblyDispatch;
-}
-
 /** Basic function to execute GEMMLowpMatrixMultiplyCore. This function calls the following kernels if the DOT product instruction is not available:
  *
  *  -# @ref NEGEMMInterleave4x4Kernel
@@ -119,14 +102,7 @@
     void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyCore
      *
-     * @note The @p output type is S32 if @p gemm_info.type == GEMMLowpOutputStageType::NONE. It is QASYMM8/QASYMM8_SIGNED otherwise
-     *
-     * @param[in] a         First input tensor info  (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED.
-     * @param[in] b         Second input tensor info (Matrix B). Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
-     * @param[in] c         Third input tensor  info (Matrix C). It can be a nullptr. Data type supported: S32
-     * @param[in] output    Output tensor info. Data type supported: Data type supported: S32/QASYMM8/QASYMM8_SIGNED
-     * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                      if the reshape of matrix B should be executed only for the first run
+     * Similar to @ref NEGEMMLowpMatrixMultiplyCore::configure()
      *
      * @return a status
      */
@@ -137,41 +113,8 @@
     void prepare() override;
 
 private:
-    MemoryGroup                                                    _memory_group;
-    IWeightsManager                                               *_weights_manager;
-    std::unique_ptr<cpu::CpuGemmAssemblyDispatch>                  _asm_glue;
-    std::unique_ptr<NEGEMMLowpMatrixMultiplyKernel>                _mm_kernel;
-    std::unique_ptr<NEGEMMInterleave4x4Kernel>                     _mtx_a_reshape_kernel;
-    std::unique_ptr<NEGEMMTranspose1xWKernel>                      _mtx_b_reshape_kernel;
-    std::unique_ptr<NEGEMMLowpMatrixAReductionKernel>              _mtx_a_reduction_kernel;
-    std::unique_ptr<NEGEMMLowpMatrixBReductionKernel>              _mtx_b_reduction_kernel;
-    std::unique_ptr<NEGEMMLowpOffsetContributionKernel>            _offset_contribution_kernel;
-    std::unique_ptr<NEGEMMLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
-    NEActivationLayer                                              _activation_func;
-    std::unique_ptr<NEConvertQuantizedSignednessKernel>            _convert_to_signed_asymm;
-    std::unique_ptr<NEConvertQuantizedSignednessKernel>            _convert_from_signed_asymm;
-
-    Tensor         _vector_sum_col;
-    Tensor         _vector_sum_row;
-    Tensor         _tmp_a;
-    Tensor         _tmp_b;
-    Tensor         _mm_result_s32;
-    Tensor         _signed_a;
-    Tensor         _signed_output;
-    const ITensor *_original_b;
-    int32_t        _a_offset;
-    int32_t        _b_offset;
-
-    bool _run_vector_matrix_multiplication;
-    bool _assembly_path;
-    bool _fused_assembly_path;
-    bool _reshape_b_only_on_first_run;
-    bool _is_prepared;
-    bool _fuse_output_stage;
-    bool _run_activation;
-    bool _flip_signedness;
-
-    ITensorPack _asm_glue_tensors{};
+    struct Impl;
+    std::unique_ptr<struct Impl> _impl;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */