Port CLGEMM to memory injecting interface

Moves the following kernels:
 - CLGEMMMatrixMultiplyKernel
 - CLGEMMMatrixMultiplyNativeKernel
 - CLGEMMMatrixMultiplyReshapedKernel
 - CLGEMMMatrixMultiplyReshapedOnlyRHSKernel

Moves the following functions:
 - CLGEMM

Introduces facilities to ease the handling of auxiliary temporary buffers
under the new run interface. These are:
 - CLAuxTensorHandler: allows wrapping of workspace buffer memory
 into CLBuffer objects
 - Ability to inject a TensorInfo into an allocator without transferring
 ownership. This reduces the copy overhead when needed.

Resolves: COMPMID-4188

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I7055435d831b05b749b26302082e4ac45f26dfb0
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5498
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/runtime/CL/functions/CLGEMM.h b/arm_compute/runtime/CL/functions/CLGEMM.h
index 1e2ae7b..38a07ef 100644
--- a/arm_compute/runtime/CL/functions/CLGEMM.h
+++ b/arm_compute/runtime/CL/functions/CLGEMM.h
@@ -35,76 +35,12 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class CLCompileContext;
-class CLGEMMReshapeRHSMatrixKernel;
-class CLGEMMMatrixMultiplyKernel;
-class CLGEMMMatrixMultiplyReshapedKernel;
-class CLGEMMMatrixMultiplyReshapedOnlyRHSKernel;
-class CLGEMMReshapeLHSMatrixKernel;
 class ICLTensor;
 class ITensorInfo;
 
-namespace weights_transformations
-{
-/** Basic function to manage the reshape weights generated from @ref CLGEMMReshapeRHSMatrixKernel */
-class CLGEMMReshapeRHSMatrixKernelManaged : public ITransformWeights
-{
-public:
-    /** Default constructor */
-    CLGEMMReshapeRHSMatrixKernelManaged();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMReshapeRHSMatrixKernelManaged(const CLGEMMReshapeRHSMatrixKernelManaged &) = delete;
-    /** Default move constructor */
-    CLGEMMReshapeRHSMatrixKernelManaged(CLGEMMReshapeRHSMatrixKernelManaged &&) = default;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLGEMMReshapeRHSMatrixKernelManaged &operator=(const CLGEMMReshapeRHSMatrixKernelManaged &) = delete;
-    /** Default move assignment operator */
-    CLGEMMReshapeRHSMatrixKernelManaged &operator=(CLGEMMReshapeRHSMatrixKernelManaged &&) = default;
-    /** Default desctructor */
-    ~CLGEMMReshapeRHSMatrixKernelManaged();
-    //Inherited method override
-    void run() override;
-
-    //Inherited method override
-    void release() override;
-
-    //Inherited method override
-    ICLTensor *get_weights() override;
-
-    //Inherited method override
-    uint32_t uid() override;
-
-    /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel
-     *
-     * @param[in] input Input tensor. Data types supported: All
-     * @param[in] info  RHS matrix information to be used for reshaping.
-     */
-    void configure(const ICLTensor *input, GEMMRHSMatrixInfo info);
-
-    /** Configures the @ref CLGEMMReshapeRHSMatrixKernel kernel
-     *
-     * @param[in] compile_context The compile context to be used.
-     * @param[in] input           Input tensor. Data types supported: All
-     * @param[in] info            RHS matrix information to be used for reshaping.
-     */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, GEMMRHSMatrixInfo info);
-
-private:
-    static constexpr uint32_t                     _uid{ 0x15 };
-    CLTensor                                      _output{};
-    std::unique_ptr<CLGEMMReshapeRHSMatrixKernel> _kernel;
-};
-} // namespace weights_transformations
-
-/** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
- *
- *  -# @ref CLGEMMReshapeLHSMatrixKernel (only if the RESHAPED_V1 is selected by the heuristic model)
- *  -# @ref CLGEMMReshapeRHSMatrixKernel (only if either the RESHAPED_V1 or RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
- *  -# @ref CLGEMMMatrixMultiplyKernel (only if either the NATIVE or RESHAPED_V1 is selected by the select_gemm_kernel method())
- *  -# @ref CLGEMMMatrixMultiplyReshapedKernel (only if RESHAPED_V1 is selected by the select_gemm_kernel method())
- *  -# @ref CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (only if RESHAPED_ONLY_RHS is selected by the select_gemm_kernel method())
- *
- */
+/** Basic function to execute GEMM on OpenCL */
 class CLGEMM : public IFunction
 {
 public:
@@ -114,16 +50,16 @@
      * @param[in] weights_manager (Optional) Weights manager.
      */
     CLGEMM(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
+    /** Default destructor */
+    ~CLGEMM();
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLGEMM(const CLGEMM &) = delete;
     /** Default move constructor */
-    CLGEMM(CLGEMM &&) = default;
+    CLGEMM(CLGEMM &&);
     /** Prevent instances of this class from being copied (As this class contains pointers) */
     CLGEMM &operator=(const CLGEMM &) = delete;
     /** Default move assignment operator */
-    CLGEMM &operator=(CLGEMM &&) = default;
-    /** Default destructor */
-    ~CLGEMM();
+    CLGEMM &operator=(CLGEMM &&);
     /** Initialise the kernel's inputs and output
      *
      * Valid data layouts:
@@ -141,25 +77,6 @@
      *
      * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
      *
-     * @param[in]  a         First input tensor  (Matrix or Vector A). Data types supported: F16/F32
-     * @param[in]  b         Second input tensor (Matrix B). Data type supported: same as @p a.
-     * @param[in]  c         Third input tensor  (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
-     * @param[out] output    Output tensor. Data type supported: same as @p a
-     * @param[in]  alpha     Weight of the matrix product
-     * @param[in]  beta      Weight of matrix C
-     * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                       if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
-     *                       in case matrix A and matrix B have been already transformed.
-     */
-    void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
-    /** Initialise the kernel's inputs and output
-     *
-     * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
-     *
-     * @note All tensors must have the same data type.
-     *
-     * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
-     *
      * @param[in]  compile_context The compile context to be used.
      * @param[in]  a               First input tensor  (Matrix or Vector A). Data types supported: F16/F32
      * @param[in]  b               Second input tensor (Matrix B). Data type supported: same as @p a.
@@ -168,20 +85,20 @@
      * @param[in]  alpha           Weight of the matrix product
      * @param[in]  beta            Weight of matrix C
      * @param[in]  gemm_info       (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                       if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
-     *                       in case matrix A and matrix B have been already transformed.
+     *                             if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
+     *                             in case matrix A and matrix B have been already transformed.
      */
     void configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+
+    /** Initialise the kernel's inputs and output
+     *
+     * Similar to @ref CLGEMM::configure()
+     */
+    void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMM.
      *
-     * @param[in] a         First input tensor info  (Matrix or Vector A). Data types supported: F16/F32
-     * @param[in] b         Second input tensor info (Matrix B). Data type supported: same as @p a.
-     * @param[in] c         Third input tensor info  (Matrix C). It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a.
-     * @param[in] output    Output tensor info. Data type supported: same as @p a
-     * @param[in] alpha     Weight of the matrix product
-     * @param[in] beta      Weight of matrix C
-     * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
-     *                       if the reshape of matrix B should happen only for the first run
+     * Similar to @ref CLGEMM::configure()
      *
      * @return a status
      */
@@ -192,34 +109,8 @@
     void prepare() override;
 
 private:
-    void configure_native_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    void configure_reshaped_v1(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    void configure_reshaped_v2(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    void configure_reshaped_only_rhs(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta,
-                                     const GEMMInfo &gemm_info);
-
-    static Status validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    static Status validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-
-    MemoryGroup                                                                   _memory_group;
-    IWeightsManager                                                              *_weights_manager;
-    std::unique_ptr<CLGEMMMatrixMultiplyKernel>                                   _mm_kernel;
-    std::unique_ptr<CLGEMMReshapeLHSMatrixKernel>                                 _reshape_lhs_kernel;
-    std::unique_ptr<CLGEMMReshapeRHSMatrixKernel>                                 _reshape_rhs_kernel;
-    std::unique_ptr<weights_transformations::CLGEMMReshapeRHSMatrixKernelManaged> _reshape_rhs_kernel_managed;
-    std::unique_ptr<CLGEMMMatrixMultiplyReshapedKernel>                           _mm_reshaped_kernel;
-    std::unique_ptr<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>                    _mm_reshaped_only_rhs_kernel;
-    std::unique_ptr<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>                    _mm_reshaped_only_rhs_fallback_kernel;
-    CLTensor                                                                      _tmp_a;
-    CLTensor                                                                      _tmp_b;
-    const ICLTensor                                                              *_original_b;
-    const ICLTensor                                                              *_lhs;
-    ICLTensor                                                                    *_dst;
-    bool                                                                          _reshape_b_only_on_first_run;
-    bool                                                                          _is_prepared;
-    CLGEMMKernelType                                                              _gemm_kernel_type;
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute