COMPMID-3391: Implement Async interfaces

Change-Id: I8168cea5056ff48a0253ebb8c88ea549a3ea69a2
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3335
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp
new file mode 100644
index 0000000..c24d5c4
--- /dev/null
+++ b/src/runtime/NEON/INEOperator.cpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/NEON/INEOperator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+namespace arm_compute
+{
+namespace experimental
+{
+INEOperator::INEOperator(IRuntimeContext *ctx)
+    : _kernel(), _ctx(ctx), _workspace()
+{
+}
+
+void INEOperator::run(std::vector<InputOperatorTensors *> &inputs, std::vector<OutputOperatorTensors *> &outputs, std::vector<OperatorTensors *> &workspace)
+{
+    // This base implementation does not use auxiliary workspace memory.
+    ARM_COMPUTE_UNUSED(workspace);
+
+    // Fail fast when the caller supplied no inputs or no outputs.
+    if(inputs.empty() || outputs.empty())
+    {
+        ARM_COMPUTE_ERROR("No inputs or outputs provided");
+    }
+
+    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, inputs, outputs);
+}
+
+void INEOperator::prepare(std::vector<OperatorTensors *> constants)
+{
+    // Base operators hold no constant tensors, so there is nothing to prepare.
+    ARM_COMPUTE_UNUSED(constants);
+}
+} // namespace experimental
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index 82880ba..dabbeba 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -31,9 +31,9 @@
 NEGenerateProposalsLayer::NEGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager),
       _permute_deltas_kernel(),
-      _flatten_deltas_kernel(),
+      _flatten_deltas(),
       _permute_scores_kernel(),
-      _flatten_scores_kernel(),
+      _flatten_scores(),
       _compute_anchors_kernel(),
       _bounding_box_kernel(),
       _pad_kernel(),
@@ -95,12 +95,12 @@
     {
         _memory_group.manage(&_deltas_permuted);
         _permute_deltas_kernel.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
-        _flatten_deltas_kernel.configure(&_deltas_permuted, &_deltas_flattened);
+        _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened);
         _deltas_permuted.allocator()->allocate();
     }
     else
     {
-        _flatten_deltas_kernel.configure(deltas, &_deltas_flattened);
+        _flatten_deltas.configure(deltas, &_deltas_flattened);
     }
 
     const TensorShape flatten_shape_scores(1, total_num_anchors);
@@ -112,12 +112,12 @@
     {
         _memory_group.manage(&_scores_permuted);
         _permute_scores_kernel.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
-        _flatten_scores_kernel.configure(&_scores_permuted, &_scores_flattened);
+        _flatten_scores.configure(&_scores_permuted, &_scores_flattened);
         _scores_permuted.allocator()->allocate();
     }
     else
     {
-        _flatten_scores_kernel.configure(scores, &_scores_flattened);
+        _flatten_scores.configure(scores, &_scores_flattened);
     }
 
     Tensor *anchors_to_use = &_all_anchors;
@@ -244,12 +244,12 @@
     }
 
     TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&deltas_permuted_info, &deltas_flattened_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
 
     TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
     TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(&scores_permuted_info, &scores_flattened_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
 
     TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
     TensorInfo  proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
@@ -327,8 +327,8 @@
         NEScheduler::get().schedule(&_permute_scores_kernel, Window::DimY);
     }
 
-    NEScheduler::get().schedule(&_flatten_deltas_kernel, Window::DimY);
-    NEScheduler::get().schedule(&_flatten_scores_kernel, Window::DimY);
+    _flatten_deltas.run();
+    _flatten_scores.run();
 
     if(_is_qasymm8)
     {
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 80ebe67..a895147 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -54,7 +54,7 @@
 } // namespace
 
 NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape_kernel(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
+    : _memory_group(memory_manager), _reduction_kernel(), _fill_border_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
 {
 }
 
@@ -91,7 +91,7 @@
 
     if(is_reshape_required)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(output_internal, output));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output));
     }
 
     return Status{};
@@ -171,7 +171,7 @@
 
     if(_is_reshape_required)
     {
-        _reshape_kernel.configure(output_internal, output);
+        _reshape.configure(output_internal, output);
         _output_internal.allocator()->allocate();
     }
 }
@@ -185,7 +185,7 @@
     NEScheduler::get().schedule(&_reduction_kernel, _window_split);
     if(_is_reshape_required)
     {
-        NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+        _reshape.run();
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index 0a9f42d..680abef 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -25,13 +25,17 @@
 
 #include "arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Types.h"
 #include "support/MemorySupport.h"
 
 #include <utility>
 
 namespace arm_compute
 {
-void NEReshapeLayer::configure(const ITensor *input, ITensor *output)
+namespace experimental
+{
+void NEReshapeLayer::configure(const ITensorInfo *input, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>();
     k->configure(input, output);
@@ -40,9 +44,41 @@
 
 Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
+    return arm_compute::NEReshapeLayer::validate(input, output);
+}
+
+MemoryRequirements NEReshapeLayer::workspace() const
+{
+    return MemoryRequirements{};
+}
+} // namespace experimental
+
+void NEReshapeLayer::configure(const ITensor *input, ITensor *output)
+{
+    _input  = input;
+    _output = output;
+
+    auto k = arm_compute::support::cpp14::make_unique<NEReshapeLayerKernel>();
+    k->configure(input->info(), output->info());
+    _kernel = std::move(k);
+}
+
+Status NEReshapeLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
+{
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, output));
 
     return Status{};
 }
+
+void NEReshapeLayer::run()
+{
+    InputOperatorTensors  src_0 = std::make_pair(TensorType::ACL_SRC, _input);
+    OutputOperatorTensors dst_0 = std::make_pair(TensorType::ACL_DST, _output);
+
+    std::vector<InputOperatorTensors *>  inputs  = { &src_0 };
+    std::vector<OutputOperatorTensors *> outputs = { &dst_0 };
+
+    NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, inputs, outputs);
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 5509ede..5cd6a55 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -32,8 +32,8 @@
 {
 template <bool IS_LOG>
 NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_kernel_ptr(nullptr), _fill_border_kernel(), _reshape_kernel(), _max(), _tmp(), _input_flattened(),
-      _output_flattened(), _needs_flattening(false)
+    : _memory_group(std::move(memory_manager)), _max_kernel(), _softmax_kernel(), _flat_or_reshape_ptr(nullptr), _fill_border_kernel(), _reshape(), _max(), _tmp(), _input_flattened(), _output_flattened(),
+      _needs_flattening(false)
 {
 }
 
@@ -46,23 +46,20 @@
     // Initialize the flat input
     _input_flattened.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_flatten));
 
-    // If we need to flatten the input, we can use NEFlattenKernel or NEReshapeKernel
-    // If the number of reduced axes is 3 (max dimension), which means collapsing all axes except the batch axis, we use NEFlattenKernel.
-    // In all other cases we have to use NEReshapeKernel
     // Note that the "other cases" include both:
     //   1. first_n_reduce_axes < 3: Reduce the first 1 (no need to reduce) or 2 dimensions (inclusive)
     //   2. first_n_reduce_axes == 4: Reduce all 4 dimensions. This can only be handled by NEReshapeKernel instead of NEFlattenKernel.
     if(first_n_reduce_axes == 3)
     {
-        auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayerKernel>();
+        auto flatten_kernel_ptr = support::cpp14::make_unique<NEFlattenLayer>();
         flatten_kernel_ptr->configure(input, &_input_flattened);
-        _flat_or_reshape_kernel_ptr = std::move(flatten_kernel_ptr);
+        _flat_or_reshape_ptr = std::move(flatten_kernel_ptr);
     }
     else
     {
-        auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayerKernel>();
+        auto reshape_kernel_ptr = support::cpp14::make_unique<NEReshapeLayer>();
         reshape_kernel_ptr->configure(input, &_input_flattened);
-        _flat_or_reshape_kernel_ptr = std::move(reshape_kernel_ptr);
+        _flat_or_reshape_ptr = std::move(reshape_kernel_ptr);
     }
 
     // We need to init the output tensor here. Indeed, the reshape kernel expects
@@ -127,7 +124,7 @@
         _input_flattened.allocator()->allocate();
 
         // Reshape the flat output into the requested (4D) output
-        _reshape_kernel.configure(&_output_flattened, output);
+        _reshape.configure(&_output_flattened, output);
 
         // Allocate the intermediate flat tensors
         _output_flattened.allocator()->allocate();
@@ -174,11 +171,11 @@
 
         if(first_n_reduce_axes == 3)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayerKernel::validate(input, &tensor_info_flat));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEFlattenLayer::validate(input, &tensor_info_flat));
         }
         else
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayerKernel::validate(input, &tensor_info_flat));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(input, &tensor_info_flat));
         }
     }
 
@@ -195,7 +192,7 @@
 
     if(_needs_flattening)
     {
-        NEScheduler::get().schedule(_flat_or_reshape_kernel_ptr.get(), Window::DimY);
+        _flat_or_reshape_ptr->run();
     }
 
     NEScheduler::get().schedule(&_fill_border_kernel, Window::DimY);
@@ -204,11 +201,11 @@
 
     if(_needs_flattening)
     {
-        NEScheduler::get().schedule(&_reshape_kernel, Window::DimY);
+        _reshape.run();
     }
 }
 
 template class NESoftmaxLayerGeneric<false>;
 template class NESoftmaxLayerGeneric<true>;
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute