COMPMID-3385: Async support to CLArithmetic* kernels/functions Pt.1

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: I94007565e688f8a0aead4f14c9fc30bfd9f9f7eb
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3613
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
index ecac1e0..7cc6fb3 100644
--- a/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
+++ b/src/core/CL/kernels/CLElementwiseOperationKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "support/StringSupport.h"
 #include <map>
 
@@ -241,15 +242,15 @@
 {
 }
 
-void CLElementwiseOperationKernel::configure_common(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLElementwiseOperationKernel::configure_common(ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
     configure_common(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLElementwiseOperationKernel::configure_common(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLElementwiseOperationKernel::configure_common(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
 {
     // Configure kernel window
-    auto win_config = validate_and_configure_window(*input1->info(), *input2->info(), *output->info());
+    auto win_config = validate_and_configure_window(*input1, *input2, *output);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     _input1 = input1;
@@ -257,13 +258,13 @@
     _output = output;
 
     std::string kernel_name = "elementwise_operation_" + name();
-    if(is_data_type_quantized(input1->info()->data_type()))
+    if(is_data_type_quantized(input1->data_type()))
     {
         kernel_name += "_quantized";
     }
 
     // Set kernel build options
-    CLBuildOptions build_opts = generate_build_options(*input1->info(), *input2->info(), *output->info());
+    CLBuildOptions build_opts = generate_build_options(*input1, *input2, *output);
     if(_act_info.enabled())
     {
         build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation())));
@@ -276,17 +277,21 @@
 
     ICLKernel::configure_internal(win_config.second);
 
-    _config_id = generate_id_for_tuning(kernel_name, *input1->info(), *output->info());
+    _config_id = generate_id_for_tuning(kernel_name, *input1, *output);
 }
 
-void CLElementwiseOperationKernel::run(const Window &window, cl::CommandQueue &queue)
+void CLElementwiseOperationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const TensorShape &in_shape1 = _input1->info()->tensor_shape();
-    const TensorShape &in_shape2 = _input2->info()->tensor_shape();
-    const TensorShape &out_shape = _output->info()->tensor_shape();
+    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(inputs.at(TensorType::ACL_SRC_0));
+    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(inputs.at(TensorType::ACL_SRC_1));
+    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(outputs.at(TensorType::ACL_DST));
+
+    const TensorShape &in_shape1 = src_0->info()->tensor_shape();
+    const TensorShape &in_shape2 = src_1->info()->tensor_shape();
+    const TensorShape &out_shape = dst->info()->tensor_shape();
 
     bool       can_collapse = true;
     const bool is_vector    = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
@@ -313,9 +318,9 @@
     {
         unsigned int idx = 0;
 
-        add_3D_tensor_argument(idx, _input1, slice_input1);
-        add_3D_tensor_argument(idx, _input2, slice_input2);
-        add_3D_tensor_argument(idx, _output, slice);
+        add_3D_tensor_argument(idx, src_0, slice_input1);
+        add_3D_tensor_argument(idx, src_1, slice_input2);
+        add_3D_tensor_argument(idx, dst, slice);
 
         enqueue(queue, *this, slice, lws_hint());
 
@@ -327,25 +332,25 @@
 
 BorderSize CLElementwiseOperationKernel::border_size() const
 {
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int replicateSize = _output->dimension(0) - std::min(_input1->dimension(0), _input2->dimension(0));
     const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
     return BorderSize{ 0, border, 0, 0 };
 }
 
 /** Arithmetic operations with saturation*/
 
-void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ConvertPolicy &policy,
+void CLSaturatedArithmeticOperationKernel::configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy,
                                                      const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), op, input1, input2, output, policy, act_info);
 }
 
-void CLSaturatedArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+void CLSaturatedArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
                                                      const ConvertPolicy       &policy,
                                                      const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLSaturatedArithmeticOperationKernel::validate(op, input1->info(), input2->info(), output->info(), policy, act_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLSaturatedArithmeticOperationKernel::validate(op, input1, input2, output, policy, act_info));
 
     _policy   = policy;
     _op       = op;
@@ -392,16 +397,16 @@
 
 /** Arithmetic operations*/
 
-void CLArithmeticOperationKernel::configure(ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticOperationKernel::configure(ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), op, input1, input2, output, act_info);
 }
 
-void CLArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output,
+void CLArithmeticOperationKernel::configure(const CLCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
                                             const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLArithmeticOperationKernel::validate(op, input1->info(), input2->info(), output->info(), act_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLArithmeticOperationKernel::validate(op, input1, input2, output, act_info));
 
     _op       = op;
     _act_info = act_info;
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index 67dac32..1fca646 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -33,6 +33,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/misc/Cast.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -62,10 +63,16 @@
 
 void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
 {
-    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
-    ARM_COMPUTE_ERROR_ON(tensor->info()->num_channels() != 1);
+    _tensor = tensor;
+    configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value);
+}
 
-    border_size.limit(tensor->info()->padding());
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+{
+    ARM_COMPUTE_ERROR_ON(tensor == nullptr);
+    ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1);
+
+    border_size.limit(tensor->padding());
 
     // If there is no border: early exit
     if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
@@ -76,7 +83,7 @@
     // Select appropriate kernel
     std::string kernel_name = "fill_image_borders_" + lower_string(string_from_border_mode(border_mode));
 
-    const DataType dt = tensor->info()->data_type();
+    const DataType dt = tensor->data_type();
 
     // Define build options
     CLBuildOptions build_opts;
@@ -88,16 +95,15 @@
 
     // Create kernel
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
-    _tensor = tensor;
 
     // Create static kernel arguments
-    const unsigned int valid_width  = tensor->info()->valid_region().shape[0];
-    const unsigned int valid_height = tensor->info()->valid_region().shape[1];
+    const unsigned int valid_width  = tensor->valid_region().shape[0];
+    const unsigned int valid_height = tensor->valid_region().shape[1];
     const cl_int2      valid_region_coords =
     {
         {
-            static_cast<cl_int>(tensor->info()->valid_region().anchor[0]),
-            static_cast<cl_int>(tensor->info()->valid_region().anchor[1]),
+            static_cast<cl_int>(tensor->valid_region().anchor[0]),
+            static_cast<cl_int>(tensor->valid_region().anchor[1]),
         }
     };
     const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
@@ -149,7 +155,7 @@
     Window win;
     win.set(Window::DimX, Window::Dimension(0, total_valid_width + valid_height));
     win.set(Window::DimY, Window::Dimension(0, 1, 1));
-    win.use_tensor_dimensions(tensor->info()->tensor_shape(), Window::DimZ);
+    win.use_tensor_dimensions(tensor->tensor_shape(), Window::DimZ);
     ICLKernel::configure_internal(win);
 
     // Set config_id for enabling LWS tuning
@@ -157,13 +163,40 @@
     _config_id += "_";
     _config_id += lower_string(string_from_data_type(dt));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(tensor->info()->dimension(0));
+    _config_id += support::cpp11::to_string(tensor->dimension(0));
     _config_id += "_";
-    _config_id += support::cpp11::to_string(tensor->info()->dimension(1));
+    _config_id += support::cpp11::to_string(tensor->dimension(1));
     _config_id += "_";
     _config_id += lower_string(string_from_border_mode(border_mode));
 }
 
+void CLFillBorderKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_UNUSED(outputs);
+
+    // Border mode undefined or border width == 0
+    if(_kernel() == nullptr)
+    {
+        return;
+    }
+
+    const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(inputs.at(TensorType::ACL_SRC));
+
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, tensor, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+
 void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     // Border mode undefined or border width == 0