Make memset/copy functions stateless

Port the following functions:
- NECopy
- NEFill
- NEPermute
- NEReshapeLayer

Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I75f3f837012abab79c7dde9a20a34f64f75571d8
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4800
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
index 10b3841..e8a8424 100644
--- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,7 +29,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEMemsetKernel.h"
+#include "arm_compute/runtime/NEON/functions/NEFill.h"
 #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
 
 namespace arm_compute
@@ -37,7 +37,7 @@
 NESpaceToBatchLayer::~NESpaceToBatchLayer() = default;
 
 NESpaceToBatchLayer::NESpaceToBatchLayer()
-    : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
+    : _space_to_batch_kernel(), _fill_f(), _has_padding(false)
 {
 }
 
@@ -47,9 +47,9 @@
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
-        _has_padding   = true;
-        _memset_kernel = std::make_unique<NEMemsetKernel>();
-        _memset_kernel->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _has_padding = true;
+        _fill_f      = std::make_unique<NEFill>();
+        _fill_f->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
     _space_to_batch_kernel = std::make_unique<NESpaceToBatchLayerKernel>();
     _space_to_batch_kernel->configure(input, block_shape, paddings, output);
@@ -61,9 +61,9 @@
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
-        _has_padding   = true;
-        _memset_kernel = std::make_unique<NEMemsetKernel>();
-        _memset_kernel->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _has_padding = true;
+        _fill_f      = std::make_unique<NEFill>();
+        _fill_f->configure(output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
     _space_to_batch_kernel = std::make_unique<NESpaceToBatchLayerKernel>();
     _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
@@ -89,7 +89,7 @@
     // Zero out output only if we have paddings
     if(_has_padding)
     {
-        NEScheduler::get().schedule(_memset_kernel.get(), Window::DimY);
+        _fill_f->run();
     }
     NEScheduler::get().schedule(_space_to_batch_kernel.get(), Window::DimY);
 }