Update CpuGemmConv2d and CpuFlatten to use CpuReshape operator

- Following the CpuReshapeKernel optimizations, update CpuGemmConv2d and CpuFlatten
  to use the CpuReshape operator instead of CpuReshapeKernel directly (see the
  sketch below)
- Minor fix to a comment in NEReorgLayerKernel.h
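
  A minimal sketch of the old and new call patterns (illustrative variable
  names; the configure/run/schedule_op calls are the ones visible in the diff
  below):

    // Before: the caller configures a CpuReshapeKernel and schedules it itself
    auto k = std::make_unique<kernels::CpuReshapeKernel>();
    k->configure(src_info, dst_info);
    NEScheduler::get().schedule_op(k.get(), Window::DimY, k->window(), pack);

    // After: a CpuReshape operator owns the kernel and its scheduling
    auto reshape = std::make_unique<CpuReshape>();
    reshape->configure(src_info, dst_info);
    reshape->run(pack); // pack carries { ACL_SRC, src } and { ACL_DST, dst }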

Resolves COMPMID-6504

Signed-off-by: Anitha Raj <anitha.raj@arm.com>
Change-Id: Ib6ee1fdc313d91249f9fe41c81e73324031c1ff4
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10186
Reviewed-by: Jakub Sujak <jakub.sujak@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.h b/src/core/NEON/kernels/NEReorgLayerKernel.h
index 38a7d9f..6e67eb3 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.h
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NEREORGLAYERKERNEL_H
-#define ARM_COMPUTE_NEREORGLAYERKERNEL_H
+#ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORGLAYERKERNEL_H
+#define ACL_SRC_CORE_NEON_KERNELS_NEREORGLAYERKERNEL_H
 
 #include "src/core/NEON/INEKernel.h"
 
@@ -60,7 +60,7 @@
      */
     void configure(const ITensor *input, ITensor *output, int32_t stride);
 
-    /** Static function to check if given info will lead to a valid configuration of @ref cpu::kernels::CpuReshapeKernel
+    /** Static function to check if given info will lead to a valid configuration
      *
      * @param[in] input  Source tensor info. Data type supported: All
      * @param[in] output Destination tensor info. Data type supported: Same as @p input
@@ -80,4 +80,4 @@
     int32_t        _stride;
 };
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_NEREORGLAYERKERNEL_H */
+#endif // ACL_SRC_CORE_NEON_KERNELS_NEREORGLAYERKERNEL_H
diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp
index f6ae139..7bab9e4 100644
--- a/src/cpu/operators/CpuFlatten.cpp
+++ b/src/cpu/operators/CpuFlatten.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,7 @@
  */
 #include "src/cpu/operators/CpuFlatten.h"
 
-#include "src/cpu/kernels/CpuReshapeKernel.h"
+#include "src/cpu/operators/CpuReshape.h"
 
 #include "src/common/utils/Log.h"
 
@@ -31,17 +31,28 @@
 {
 namespace cpu
 {
+CpuFlatten::CpuFlatten()
+    : _reshape(nullptr)
+{
+}
+
+CpuFlatten::~CpuFlatten() = default;
+
 void CpuFlatten::configure(const ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_LOG_PARAMS(src, dst);
-    auto k = std::make_unique<kernels::CpuReshapeKernel>();
-    k->configure(src, dst);
-    _kernel = std::move(k);
+    _reshape = std::make_unique<CpuReshape>();
+    _reshape->configure(src, dst);
 }
 
 Status CpuFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
 {
-    return kernels::CpuReshapeKernel::validate(src, dst);
+    return CpuReshape::validate(src, dst);
+}
+
+void CpuFlatten::run(ITensorPack &tensors)
+{
+    _reshape->run(tensors);
 }
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuFlatten.h b/src/cpu/operators/CpuFlatten.h
index 0e9fcbd..911760d 100644
--- a/src/cpu/operators/CpuFlatten.h
+++ b/src/cpu/operators/CpuFlatten.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_FLATTEN_H
-#define ARM_COMPUTE_CPU_FLATTEN_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H
+#define ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H
 
 #include "src/cpu/ICpuOperator.h"
 
@@ -30,10 +30,15 @@
 {
 namespace cpu
 {
+class CpuReshape;
 /** Basic function to flatten a given input */
 class CpuFlatten : public ICpuOperator
 {
 public:
+    /** Constructor */
+    CpuFlatten();
+    /** Destructor */
+    ~CpuFlatten();
     /** Configure operator for a given list of arguments
      *
      * Valid data layouts:
@@ -58,7 +63,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+
+private:
+    std::unique_ptr<CpuReshape> _reshape;
 };
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FLATTEN_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUFLATTEN_H
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index 7c0e58b..d11e4f0 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -35,11 +35,11 @@
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/kernels/CpuCol2ImKernel.h"
 #include "src/cpu/kernels/CpuIm2ColKernel.h"
-#include "src/cpu/kernels/CpuReshapeKernel.h"
 #include "src/cpu/kernels/CpuWeightsReshapeKernel.h"
 #include "src/cpu/operators/CpuGemm.h"
 #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
 #include "src/cpu/operators/CpuGemmLowpOutputStage.h"
+#include "src/cpu/operators/CpuReshape.h"
 #include "src/cpu/utils/CpuAuxTensorHandler.h"
 
 #include <set>
@@ -92,7 +92,7 @@
 }
 
 CpuGemmConv2d::CpuGemmConv2d()
-    : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape_kernel(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(),
+    : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(),
       _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
 {
 }
@@ -379,8 +379,8 @@
     else
     {
         // Configure reshape layer
-        _reshape_kernel = std::make_unique<kernels::CpuReshapeKernel>();
-        _reshape_kernel->configure(gemm_output_to_use, dst);
+        _reshape = std::make_unique<CpuReshape>();
+        _reshape->configure(gemm_output_to_use, dst);
     }
 
     // Check if GEMM transforms weights
@@ -642,7 +642,7 @@
                 { TensorType::ACL_SRC, gemm_output_to_use },
                 { TensorType::ACL_DST, dst }
             };
-            NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack);
+            _reshape->run(pack);
         }
     }
     else if(out_has_padding)
@@ -652,7 +652,7 @@
             { TensorType::ACL_SRC, gemm_output_to_use },
             { TensorType::ACL_DST, dst }
         };
-        NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack);
+        _reshape->run(pack);
     }
 }
 
diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h
index 81d34ae..61fe63a 100644
--- a/src/cpu/operators/CpuGemmConv2d.h
+++ b/src/cpu/operators/CpuGemmConv2d.h
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_CPU_GEMM_CONV2D_H
-#define ARM_COMPUTE_CPU_GEMM_CONV2D_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H
+#define ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
@@ -38,12 +38,12 @@
 class CpuGemm;
 class CpuGemmLowpMatrixMultiplyCore;
 class CpuGemmLowpOutputStage;
+class CpuReshape;
 namespace kernels
 {
 class CpuWeightsReshapeKernel;
 class CpuIm2ColKernel;
 class CpuCol2ImKernel;
-class CpuReshapeKernel;
 } // namespace kernels
 
 /** Basic function to compute the convolution layer. This function calls the following kernels/functions:
@@ -130,8 +130,8 @@
                                const bool enable_fast_math = false);
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
@@ -222,7 +222,7 @@
     std::unique_ptr<CpuGemm>                          _mm_gemm;
     std::unique_ptr<CpuGemmLowpMatrixMultiplyCore>    _mm_gemmlowp;
     std::unique_ptr<kernels::CpuCol2ImKernel>         _col2im_kernel;
-    std::unique_ptr<kernels::CpuReshapeKernel>        _reshape_kernel;
+    std::unique_ptr<CpuReshape>                       _reshape;
 
     TensorInfo _im2col_output;
     TensorInfo _weights_reshaped;
@@ -240,4 +240,4 @@
 };
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_GEMM_CONV2D_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMCONV2D_H