COMPMID-3385: Add async support to CLArithmetic* kernels/functions Pt.1
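
This patch splits CLPReluLayer into a new operator-style class,
experimental::CLPReluLayer, which is configured from ITensorInfo
descriptors and executed through tensor maps, while the public
CLPReluLayer keeps its ICLTensor interface and forwards to the
operator through a pimpl. The broadcast input for border handling is
now selected at run time (select_border_input) and the border kernel
is enqueued via CLScheduler::enqueue_op.

A minimal sketch of how the new interface is meant to be driven,
assuming input, alpha and output are valid, already-allocated
ICLTensor pointers; the wrapping function and variable names are
illustrative only, not part of this patch:

    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/CL/ICLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLPReluLayer.h"

    using namespace arm_compute;

    void run_prelu(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
    {
        // Configure from tensor metadata only; no tensor memory is captured here.
        experimental::CLPReluLayer op;
        op.configure(CLKernelLibrary::get().get_compile_context(),
                     input->info(), alpha->info(), output->info());

        // Bind the actual tensors at execution time through tensor maps,
        // mirroring what the ICLTensor-based CLPReluLayer::run() does internally.
        const InputTensorMap  src{ { TensorType::ACL_SRC_0, input }, { TensorType::ACL_SRC_1, alpha } };
        const OutputTensorMap dst{ { TensorType::ACL_DST, output } };
        op.run(src, dst, {});
    }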

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: I94007565e688f8a0aead4f14c9fc30bfd9f9f7eb
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3613
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index b1b9738..fbb466a 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
 #include "support/MemorySupport.h"
 
@@ -31,26 +32,42 @@
 {
 namespace
 {
-void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ICLTensor *input1, ICLTensor *input2, const ICLTensor *output)
+void configure_border_handler(const CLCompileContext &compile_context, CLFillBorderKernel &border_handler, BorderSize border_size, ITensorInfo *input1, ITensorInfo *input2, const ITensorInfo *output)
 {
-    if(output->info()->dimension(0) > 1)
+    if(output->dimension(0) > 1)
     {
-        ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
+        ITensorInfo *broadcasted_info = (input1->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if(broadcasted_info->dimension(0) == 1)
         {
             border_handler.configure(compile_context, broadcasted_info, border_size, BorderMode::REPLICATE);
         }
     }
 }
+void select_border_input(InputTensorMap &tensor_map, InputTensorMap &inputs, OutputTensorMap &outputs)
+{
+    if(outputs.at(TensorType::ACL_DST)->info()->dimension(0) > 1)
+    {
+        if(inputs.at(TensorType::ACL_SRC_1)->info()->dimension(0) == 1)
+        {
+            tensor_map[TensorType::ACL_SRC] = inputs.at(TensorType::ACL_SRC_1);
+        }
+        else
+        {
+            tensor_map[TensorType::ACL_SRC] = inputs.at(TensorType::ACL_SRC_0);
+        }
+    }
+}
 } // namespace
 
-void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+namespace experimental
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
+CLPReluLayer::CLPReluLayer()
+    : _border_handler()
+{
 }
 
-void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+void CLPReluLayer::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLArithmeticOperationKernel>();
     k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, output);
@@ -62,4 +79,56 @@
 {
     return CLArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output);
 }
+
+void CLPReluLayer::run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace)
+{
+    InputTensorMap src;
+    select_border_input(src, inputs, outputs);
+    CLScheduler::get().enqueue_op(_border_handler, src, {});
+    ICLOperator::run(inputs, outputs, workspace);
+}
+} // namespace experimental
+
+struct CLPReluLayer::Impl
+{
+    const ICLTensor                            *src_0{ nullptr };
+    const ICLTensor                            *src_1{ nullptr };
+    ICLTensor                                  *dst{ nullptr };
+    std::unique_ptr<experimental::CLPReluLayer> op{ nullptr };
+};
+
+CLPReluLayer::CLPReluLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default;
+CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default;
+CLPReluLayer::~CLPReluLayer()                          = default;
+
+void CLPReluLayer::configure(ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+    configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
+}
+
+void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+{
+    _impl->src_0 = input;
+    _impl->src_1 = alpha;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::CLPReluLayer>();
+    _impl->op->configure(compile_context, input->info(), alpha->info(), output->info());
+}
+
+Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+    return experimental::CLPReluLayer::validate(input, alpha, output);
+}
+
+void CLPReluLayer::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+
+    _impl->op->run(src, dst, {});
+}
 } // namespace arm_compute