Make memset/copy functions state-less

Port following functions:
- CLCopy
- CLFill
- CLPermute
- CLReshapeLayer
- CLCropResize

Resolves: COMPMID-4002

Signed-off-by: Sheri Zhang <sheri.zhang@arm.com>
Change-Id: I8392aa515aaeb5b44dab6122be6a795d08376d5f
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5003
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 4395a39..e7a0e57 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,7 +30,6 @@
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLCopyKernel.h"
 #include "src/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
@@ -106,7 +105,7 @@
       _recurrent_to_output_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
       _projection_reduction(std::make_unique<CLGEMMLowpMatrixAReductionKernel>()),
       _layer_norms(),
-      _copy_output(std::make_unique<CLCopyKernel>())
+      _copy_output()
 {
     for(auto &norm : _layer_norms)
     {
@@ -593,7 +592,7 @@
     }
 
     // Copy output_state_out to output
-    _copy_output->configure(compile_context, output_state_out, output);
+    _copy_output.configure(compile_context, output_state_out, output);
 }
 
 Status CLQLSTMLayer::validate(const ITensorInfo *input,
@@ -957,7 +956,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLCopyKernel::validate(output_state_out, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCopy::validate(output_state_out, output));
     return Status{};
 }
 
@@ -1099,7 +1098,7 @@
     }
 
     // Copy output_state_out to output
-    CLScheduler::get().enqueue(*_copy_output);
+    _copy_output.run();
 }
 
 void CLQLSTMLayer::prepare()