Make memset/copy functions stateless

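Port the following functions:
- NECopy
- NEFill
- NEPermute
- NEReshapeLayer

Layers that previously scheduled NEMemsetKernel/NECopyKernel directly
(NEMaxUnpoolingLayer, NEPadLayer, NERNNLayer, NESpaceToBatchLayer) now
run a fill/copy function instead, and unused NECopyKernel includes are
removed.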

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I75f3f837012abab79c7dde9a20a34f64f75571d8
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4800
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
index 11707cb..20642b5 100644
--- a/src/runtime/NEON/functions/NECopy.cpp
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,23 +23,51 @@
  */
 #include "arm_compute/runtime/NEON/functions/NECopy.h"
 
-#include "src/core/NEON/kernels/NECopyKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "src/runtime/cpu/operators/CpuCopy.h"
 
 #include <utility>
 
 namespace arm_compute
 {
-NECopy::~NECopy() = default;
+struct NECopy::Impl
+{
+    const ITensor                *src{ nullptr };
+    ITensor                      *dst{ nullptr };
+    std::unique_ptr<cpu::CpuCopy> op{ nullptr };
+};
+
+NECopy::NECopy()
+    : _impl(std::make_unique<Impl>())
+{
+}
+NECopy::NECopy(NECopy &&) = default;
+NECopy &NECopy::operator=(NECopy &&) = default;
+NECopy::~NECopy()                    = default;
 
 void NECopy::configure(ITensor *input, ITensor *output)
 {
-    auto k = std::make_unique<NECopyKernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<cpu::CpuCopy>();
+    _impl->op->configure(input->info(), output->info());
 }
 
-Status NECopy::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output)
+Status NECopy::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
-    return NECopyKernel::validate(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuCopy::validate(input, output));
+
+    return Status{};
+}
+
+void NECopy::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
index 60a747d..56fc2e4 100644
--- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,6 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/kernels/NECopyKernel.h"
 #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
 #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
 #include "src/core/NEON/kernels/NEFFTScaleKernel.h"
diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp
index 74e366a..ee539fd 100644
--- a/src/runtime/NEON/functions/NEFill.cpp
+++ b/src/runtime/NEON/functions/NEFill.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,18 +23,40 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEFill.h"
 
-#include "arm_compute/core/Window.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEMemsetKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "src/runtime/cpu/operators/CpuFill.h"
 
 #include <utility>
 
 namespace arm_compute
 {
+struct NEFill::Impl
+{
+    ITensor                      *tensor{ nullptr };
+    std::unique_ptr<cpu::CpuFill> op{ nullptr };
+};
+
+NEFill::NEFill()
+    : _impl(std::make_unique<Impl>())
+{
+}
+NEFill::NEFill(NEFill &&) = default;
+NEFill &NEFill::operator=(NEFill &&) = default;
+NEFill::~NEFill()                    = default;
+
 void NEFill::configure(ITensor *tensor, PixelValue constant_value)
 {
-    auto k = std::make_unique<NEMemsetKernel>();
-    k->configure(tensor, constant_value);
-    _kernel = std::move(k);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+
+    _impl->tensor = tensor;
+    _impl->op     = std::make_unique<cpu::CpuFill>();
+    _impl->op->configure(tensor->info(), constant_value);
+}
+
+void NEFill::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC_DST, _impl->tensor);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index f3a6a30..931fdb2 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -25,7 +25,6 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NECopyKernel.h"
 #include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
 #include "src/core/NEON/kernels/NEPadLayerKernel.h"
diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
index da6260b..656777d 100644
--- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,26 +24,26 @@
 #include "arm_compute/runtime/NEON/functions/NEMaxUnpoolingLayer.h"
 
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/NEON/functions/NEFill.h"
 #include "src/core/NEON/kernels/NEMaxUnpoolingLayerKernel.h"
-#include "src/core/NEON/kernels/NEMemsetKernel.h"
 
 namespace arm_compute
 {
 NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default;
 
 NEMaxUnpoolingLayer::NEMaxUnpoolingLayer()
-
-    : _memset_kernel(), _unpooling_layer_kernel()
+    : _fill_func(), _unpooling_layer_kernel()
 {
 }
 
 void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info)
 {
     const PixelValue zero_value(0.f);
-    _memset_kernel          = std::make_unique<NEMemsetKernel>();
+    _fill_func              = std::make_unique<NEFill>();
     _unpooling_layer_kernel = std::make_unique<NEMaxUnpoolingLayerKernel>();
-    _memset_kernel->configure(output, zero_value);
+    _fill_func->configure(output, zero_value);
     _unpooling_layer_kernel->configure(input, indices, output, pool_info);
 }
 
@@ -54,7 +54,7 @@
 
 void NEMaxUnpoolingLayer::run()
 {
-    NEScheduler::get().schedule(_memset_kernel.get(), Window::DimY);
+    _fill_func->run();
     NEScheduler::get().schedule(_unpooling_layer_kernel.get(), Window::DimY);
 }
 } /* namespace arm_compute */
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index 88a73b8..531b06d 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,6 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/kernels/NECopyKernel.h"
 #include "src/core/NEON/kernels/NEPadLayerKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
@@ -52,7 +51,7 @@
 NEPadLayer::~NEPadLayer() = default;
 
 NEPadLayer::NEPadLayer()
-    : _copy_kernel(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
+    : _copy_function(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
 {
 }
 
@@ -200,8 +199,7 @@
     else
     {
         // Copy the input to the whole output if no padding is applied
-        _copy_kernel = std::make_unique<NECopyKernel>();
-        _copy_kernel->configure(input, output);
+        _copy_function.configure(input, output);
     }
 }
 
@@ -286,7 +284,7 @@
     }
     else
     {
-        NEScheduler::get().schedule(_copy_kernel.get(), Window::DimY);
+        _copy_function.run();
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp
index cceb22f..257c1a2 100644
--- a/src/runtime/NEON/functions/NEPermute.cpp
+++ b/src/runtime/NEON/functions/NEPermute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,19 +23,52 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
 
-#include "src/core/NEON/kernels/NEPermuteKernel.h"
+#include "arm_compute/core/Validate.h"
+#include "src/runtime/cpu/operators/CpuPermute.h"
 
 namespace arm_compute
 {
+struct NEPermute::Impl
+{
+    const ITensor                   *src{ nullptr };
+    ITensor                         *dst{ nullptr };
+    std::unique_ptr<cpu::CpuPermute> op{ nullptr };
+};
+
+NEPermute::NEPermute()
+    : _impl(std::make_unique<Impl>())
+{
+}
+
+NEPermute::NEPermute(NEPermute &&) = default;
+
+NEPermute &NEPermute::operator=(NEPermute &&) = default;
+
+NEPermute::~NEPermute() = default;
+
 void NEPermute::configure(const ITensor *input, ITensor *output, const PermutationVector &perm)
 {
-    auto k = std::make_unique<NEPermuteKernel>();
-    k->configure(input, output, perm);
-    _kernel = std::move(k);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    _impl->src = input;
+    _impl->dst = output;
+    _impl->op  = std::make_unique<cpu::CpuPermute>();
+    _impl->op->configure(input->info(), output->info(), perm);
 }
 
 Status NEPermute::validate(const ITensorInfo *input, const ITensorInfo *output, const PermutationVector &perm)
 {
-    return NEPermuteKernel::validate(input, output, perm);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuPermute::validate(input, output, perm));
+
+    return Status{};
+}
+
+void NEPermute::run()
+{
+    ITensorPack pack;
+    pack.add_tensor(TensorType::ACL_SRC, _impl->src);
+    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
+    _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index 93e37cc..63e8103 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -32,7 +32,6 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "src/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
 #include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NECopyKernel.h"
 #include "src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
 #include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
@@ -47,7 +46,7 @@
 NERNNLayer::~NERNNLayer() = default;
 
 NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_kernel(), _fully_connected_out(), _gemm_output(), _add_output(),
+    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_f(), _fully_connected_out(), _gemm_output(), _add_output(),
       _is_prepared(false)
 {
 }
@@ -112,8 +111,7 @@
     _activation.configure(&_add_output, hidden_state, info);
     _add_output.allocator()->allocate();
 
-    _copy_kernel = std::make_unique<NECopyKernel>();
-    _copy_kernel->configure(hidden_state, output);
+    _copy_f.configure(hidden_state, output);
 }
 
 void NERNNLayer::run()
@@ -130,7 +128,7 @@
     _activation.run();
 
     // copy hidden out to output
-    NEScheduler::get().schedule(_copy_kernel.get(), Window::DimY);
+    _copy_f.run();
 }
 
 void NERNNLayer::prepare()
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index 9ad6a35..c0c78ea 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,61 +24,41 @@
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "arm_compute/runtime/Types.h"
-#include "src/core/NEON/kernels/NEReshapeLayerKernel.h"
+#include "src/runtime/cpu/operators/CpuReshape.h"
 
 #include <utility>
 
 namespace arm_compute
 {
-namespace experimental
-{
-NEReshape::~NEReshape() = default;
-
-void NEReshape::configure(const ITensorInfo *input, ITensorInfo *output)
-{
-    auto k = std::make_unique<NEReshapeLayerKernel>();
-    k->configure(input, output);
-    _kernel = std::move(k);
-}
-
-Status NEReshape::validate(const ITensorInfo *input, const ITensorInfo *output)
-{
-    return arm_compute::NEReshapeLayerKernel::validate(input, output);
-}
-} // namespace experimental
-
 struct NEReshapeLayer::Impl
 {
-    const ITensor                           *src{ nullptr };
-    ITensor                                 *dst{ nullptr };
-    std::unique_ptr<experimental::NEReshape> op{ nullptr };
+    const ITensor                   *src{ nullptr };
+    ITensor                         *dst{ nullptr };
+    std::unique_ptr<cpu::CpuReshape> op{ nullptr };
 };
 
 NEReshapeLayer::NEReshapeLayer()
     : _impl(std::make_unique<Impl>())
 {
 }
-
 NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default;
-
 NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default;
-
-NEReshapeLayer::~NEReshapeLayer() = default;
+NEReshapeLayer::~NEReshapeLayer()                            = default;
 
 void NEReshapeLayer::configure(const ITensor *input, ITensor *output)
 {
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
     _impl->src = input;
     _impl->dst = output;
-    _impl->op  = std::make_unique<experimental::NEReshape>();
+    _impl->op  = std::make_unique<cpu::CpuReshape>();
     _impl->op->configure(input->info(), output->info());
 }
 
 Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(experimental::NEReshape::validate(input, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuReshape::validate(input, output));
 
     return Status{};
 }
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
index 10b3841..e8a8424 100644
--- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,7 +29,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEMemsetKernel.h"
+#include "arm_compute/runtime/NEON/functions/NEFill.h"
 #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
 
 namespace arm_compute
@@ -37,7 +37,7 @@
 NESpaceToBatchLayer::~NESpaceToBatchLayer() = default;
 
 NESpaceToBatchLayer::NESpaceToBatchLayer()
-    : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
+    : _space_to_batch_kernel(), _fill_f(), _has_padding(false)
 {
 }
 
@@ -47,9 +47,9 @@
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
-        _has_padding   = true;
-        _memset_kernel = std::make_unique<NEMemsetKernel>();
-        _memset_kernel->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _has_padding = true;
+        _fill_f      = std::make_unique<NEFill>();
+        _fill_f->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
     _space_to_batch_kernel = std::make_unique<NESpaceToBatchLayerKernel>();
     _space_to_batch_kernel->configure(input, block_shape, paddings, output);
@@ -61,9 +61,9 @@
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
-        _has_padding   = true;
-        _memset_kernel = std::make_unique<NEMemsetKernel>();
-        _memset_kernel->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _has_padding = true;
+        _fill_f      = std::make_unique<NEFill>();
+        _fill_f->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
     _space_to_batch_kernel = std::make_unique<NESpaceToBatchLayerKernel>();
     _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
@@ -89,7 +89,7 @@
     // Zero out output only if we have paddings
     if(_has_padding)
     {
-        NEScheduler::get().schedule(_memset_kernel.get(), Window::DimY);
+        _fill_f->run();
     }
     NEScheduler::get().schedule(_space_to_batch_kernel.get(), Window::DimY);
 }