COMPMID-3388: Async support to CLReshapeLayerKernel kernels/functions

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: I141a943dfd691069317860e852ecdd0ba7391604
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3501
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/CL/ICLKernel.h b/arm_compute/core/CL/ICLKernel.h
index 3e545c6..cf8771f 100644
--- a/arm_compute/core/CL/ICLKernel.h
+++ b/arm_compute/core/CL/ICLKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/IKernel.h"
+#include "arm_compute/core/experimental/Types.h"
 
 #include <string>
 
@@ -216,7 +217,23 @@
      * @param[in]     window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      * @param[in,out] queue  Command queue on which to enqueue the kernel.
      */
-    virtual void run(const Window &window, cl::CommandQueue &queue) = 0;
+    virtual void run(const Window &window, cl::CommandQueue &queue)
+    {
+        ARM_COMPUTE_UNUSED(window, queue);
+    }
+    /** Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue.
+     *
+     * @note The queue is *not* flushed by this method, and therefore the kernel will not have been executed by the time this method returns.
+     *
+     * @param[in]     inputs  A map containing the input tensors.
+     * @param[in]     outputs A map containing the output tensors.
+     * @param[in]     window  Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     * @param[in,out] queue   Command queue on which to enqueue the kernel.
+     */
+    virtual void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, cl::CommandQueue &queue)
+    {
+        ARM_COMPUTE_UNUSED(inputs, outputs, window, queue);
+    }
     /** Add the passed parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx   Index at which to start adding the arguments. Will be incremented by the number of kernel arguments set.
diff --git a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
index 3ea7411..b785e93 100644
--- a/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLReshapeLayerKernel.h
@@ -35,31 +35,13 @@
 class CLReshapeLayerKernel : public ICLKernel
 {
 public:
-    /** Default constructor */
-    CLReshapeLayerKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLReshapeLayerKernel(const CLReshapeLayerKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    CLReshapeLayerKernel &operator=(const CLReshapeLayerKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    CLReshapeLayerKernel(CLReshapeLayerKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    CLReshapeLayerKernel &operator=(CLReshapeLayerKernel &&) = default;
-    /** Default destructor */
-    ~CLReshapeLayerKernel() = default;
-    /** Set the input and output of the kernel
-     *
-     * @param[in]  input  Source tensor. Data type supported: All.
-     * @param[out] output Destination tensor. Data type supported: Same as @p input
-     */
-    void configure(const ICLTensor *input, ICLTensor *output);
     /** Set the input and output of the kernel
      *
      * @param[in]  compile_context The compile context to be used.
-     * @param[in]  input           Source tensor. Data type supported: All.
-     * @param[out] output          Destination tensor. Data type supported: Same as @p input
+     * @param[in]  input           Source tensor info. Data type supported: All.
+     * @param[out] output          Destination tensor info. Data type supported: Same as @p input
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReshapeLayerKernel
      *
@@ -71,11 +53,8 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
-
-private:
-    const ICLTensor *_input;  /**< Source tensor */
-    ICLTensor       *_output; /**< Destination tensor */
+    void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs,
+                const Window &window, cl::CommandQueue &queue) override;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLRESHAPELAYERKERNEL_H */
diff --git a/arm_compute/runtime/CL/CLScheduler.h b/arm_compute/runtime/CL/CLScheduler.h
index 573248e..95fdffe 100644
--- a/arm_compute/runtime/CL/CLScheduler.h
+++ b/arm_compute/runtime/CL/CLScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/CL/ICLTuner.h"
 
 namespace arm_compute
@@ -72,6 +73,14 @@
      * @param[in] flush  (Optional) Specifies if the command queue will be flushed after running the kernel.
      */
     void enqueue(ICLKernel &kernel, bool flush = true);
+    /** Schedule the execution of the passed kernel if possible.
+     *
+     * @param[in] kernel  Kernel to execute.
+     * @param[in] inputs  Map containing the input tensors.
+     * @param[in] outputs Map containing the output tensors.
+     * @param[in] flush   (Optional) Specifies if the command queue will be flushed after running the kernel.
+     */
+    void enqueue_op(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs, bool flush = true);
 
     /** Initialises the context and command queue to be used by the scheduler.
      *
@@ -143,6 +152,7 @@
     bool is_initialised() const;
 
 private:
+    void enqueue_common(ICLKernel &kernel, const InputTensorMap &inputs, const OutputTensorMap &outputs, bool flush);
     /** Flag to ensure symbols initialisation is happening before Scheduler creation */
     static std::once_flag _initialize_symbols;
 
diff --git a/arm_compute/runtime/CL/ICLOperator.h b/arm_compute/runtime/CL/ICLOperator.h
new file mode 100644
index 0000000..68489ad
--- /dev/null
+++ b/arm_compute/runtime/CL/ICLOperator.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_ICLOPERATOR_H
+#define ARM_COMPUTE_ICLOPERATOR_H
+
+#include "arm_compute/core/CL/ICLKernel.h"
+#include "arm_compute/runtime/IOperator.h"
+#include "arm_compute/runtime/IRuntimeContext.h"
+#include "arm_compute/runtime/Types.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace experimental
+{
+/** Basic interface for functions which have a single async CL kernel */
+class ICLOperator : public IOperator
+{
+public:
+    /** Constructor
+     *
+     * @param[in] ctx Runtime context to be used by the function
+     */
+    ICLOperator(IRuntimeContext *ctx = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ICLOperator(const ICLOperator &) = delete;
+    /** Default move constructor */
+    ICLOperator(ICLOperator &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ICLOperator &operator=(const ICLOperator &) = delete;
+    /** Default move assignment operator */
+    ICLOperator &operator=(ICLOperator &&) = default;
+
+    // Inherited methods overridden:
+    void run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace) override final;
+    void prepare(OperatorTensorMap constants) override final;
+
+protected:
+    std::unique_ptr<ICLKernel> _kernel;
+    IRuntimeContext           *_ctx;
+    MemoryRequirements         _workspace;
+};
+} // namespace experimental
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_ICLOPERATOR_H */
diff --git a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
index 997bb79..64b7ea6 100644
--- a/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLArgMinMaxLayer.h
@@ -25,9 +25,9 @@
 #define ARM_COMPUTE_CLARGMINMAXLAYER_H
 
 #include "arm_compute/core/CL/kernels/CLArgMinMaxLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -89,7 +89,7 @@
     std::vector<CLTensor>               _results_vector;
     CLTensor                            _not_reshaped_output;
     std::vector<CLArgMinMaxLayerKernel> _reduction_kernels_vector;
-    CLReshapeLayerKernel                _reshape_kernel;
+    CLReshapeLayer                      _reshape;
     unsigned int                        _num_of_stages;
     unsigned int                        _reduction_axis;
 };
diff --git a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
index 91b30fa..490b07b 100644
--- a/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGenerateProposalsLayer.h
@@ -29,10 +29,10 @@
 #include "arm_compute/core/CL/kernels/CLPadLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLPermuteKernel.h"
 #include "arm_compute/core/CL/kernels/CLQuantizationLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
 #include "arm_compute/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -131,9 +131,9 @@
 
     // OpenCL kernels
     CLPermuteKernel              _permute_deltas_kernel;
-    CLReshapeLayerKernel         _flatten_deltas_kernel;
+    CLReshapeLayer               _flatten_deltas;
     CLPermuteKernel              _permute_scores_kernel;
-    CLReshapeLayerKernel         _flatten_scores_kernel;
+    CLReshapeLayer               _flatten_scores;
     CLComputeAllAnchorsKernel    _compute_anchors_kernel;
     CLBoundingBoxTransformKernel _bounding_box_kernel;
     CLPadLayerKernel             _pad_kernel;
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 25cf655..b420bbf 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -26,9 +26,9 @@
 
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -94,7 +94,7 @@
     std::vector<CLTensor>                   _results_vector;
     std::vector<CLReductionOperationKernel> _reduction_kernels_vector;
     std::vector<CLFillBorderKernel>         _border_handlers_vector;
-    CLReshapeLayerKernel                    _reshape_kernel;
+    CLReshapeLayer                          _reshape;
     ReductionOperation                      _op;
     unsigned int                            _num_of_stages;
     unsigned int                            _reduction_axis;
diff --git a/arm_compute/runtime/CL/functions/CLReshapeLayer.h b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
index e91c2c7..9017384 100644
--- a/arm_compute/runtime/CL/functions/CLReshapeLayer.h
+++ b/arm_compute/runtime/CL/functions/CLReshapeLayer.h
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_CLRESHAPELAYER_H
 #define ARM_COMPUTE_CLRESHAPELAYER_H
 
+#include "arm_compute/runtime/CL/ICLOperator.h"
 #include "arm_compute/runtime/CL/ICLSimpleFunction.h"
 
 namespace arm_compute
@@ -31,9 +32,21 @@
 class ICLTensor;
 
 /** Basic function to run @ref CLReshapeLayerKernel */
-class CLReshapeLayer : public ICLSimpleFunction
+class CLReshapeLayer : public IFunction
 {
 public:
+    /** Default constructor */
+    CLReshapeLayer();
+    /** Default destructor */
+    ~CLReshapeLayer();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLReshapeLayer(const CLReshapeLayer &) = delete;
+    /** Default move constructor */
+    CLReshapeLayer(CLReshapeLayer &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLReshapeLayer &operator=(const CLReshapeLayer &) = delete;
+    /** Default move assignment operator */
+    CLReshapeLayer &operator=(CLReshapeLayer &&);
     /** Initialise the kernel's inputs and outputs
      *
      * @param[in]  input  First tensor input. Data type supported: All
@@ -56,6 +69,41 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
+
+namespace experimental
+{
+/** Basic function to run @ref CLReshapeLayerKernel */
+class CLReshapeLayer : public ICLOperator
+{
+public:
+    /** Initialise the kernel's inputs and outputs
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor info. Data type supported: All
+     * @param[out] output          Output info. Data type supported: Same as @p input
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output);
+
+    /** Static function to check if given info will lead to a valid configuration of @ref CLReshapeLayer
+     *
+     * @param[in] input  Input tensor info. Data type supported: All
+     * @param[in] output Output tensor info. Data type supported: Same as @p input
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    MemoryRequirements workspace() const override;
+};
+} // namespace experimental
 }
 #endif /*ARM_COMPUTE_CLRESHAPELAYER_H */
diff --git a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
index 231a56f..f0ef15a 100644
--- a/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSoftmaxLayer.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLSOFTMAXLAYER_H
 #define ARM_COMPUTE_CLSOFTMAXLAYER_H
 
-#include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h"
-#include "arm_compute/core/CL/kernels/CLReshapeLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLSoftmaxLayerKernel.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
+#include "arm_compute/runtime/CL/functions/CLReshapeLayer.h"
 #include "arm_compute/runtime/IFunction.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
@@ -136,8 +136,8 @@
     MemoryGroup                    _memory_group;
     CLLogits1DMaxShiftExpSumKernel _max_shift_exp_sum_kernel;
     CLLogits1DNormKernel           _norm_kernel;
-    std::unique_ptr<ICLKernel>     _flatten_kernel_ptr;
-    CLReshapeLayerKernel           _reshape_kernel;
+    std::unique_ptr<IFunction>     _flatten_ptr;
+    CLReshapeLayer                 _reshape;
     CLTensor                       _max;
     CLTensor                       _sum;
     CLTensor                       _tmp;