COMPMID-3377: Async support to NEElementwiseUnaryLayerKernel kernels/functions

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: I208287b44ece051e95f891d43a691cb0ac6e56c5
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3419
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
index 2c00a76..43426dc 100644
--- a/src/core/NEON/kernels/NEActivationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp
@@ -855,8 +855,8 @@
     return Status{};
 }
 
-void NEActivationLayerKernel::run_op(const std::vector<InputTensor> &inputs,
-                                     const std::vector<OutputTensor> &outputs,
+void NEActivationLayerKernel::run_op(const InputTensorMap &inputs,
+                                     const OutputTensorMap &outputs,
                                      const Window &window, const ThreadInfo &info)
 {
     // Early exit on disabled activation
@@ -872,5 +872,5 @@
 
     ARM_COMPUTE_ERROR_ON(inputs.empty() || outputs.empty());
 
-    (this->*_func)(inputs[0].tensor, outputs[0].tensor, window);
+    (this->*_func)(inputs.at(TensorType::ACL_SRC), outputs.at(TensorType::ACL_DST), window);
 }
diff --git a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
index 7b2b5e4..b4f7a0a 100644
--- a/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEElementwiseOperationKernel.cpp
@@ -1055,13 +1055,13 @@
 }
 
 std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_func(const ITensor *input1, const ITensor *input2, ITensor *output,
+configure_func(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output,
                std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function)
 {
     std::string function_to_call("op_");
-    function_to_call += string_from_data_type(input1->info()->data_type()) + "_";
-    function_to_call += string_from_data_type(input2->info()->data_type()) + "_";
-    function_to_call += string_from_data_type(output->info()->data_type());
+    function_to_call += string_from_data_type(input1->data_type()) + "_";
+    function_to_call += string_from_data_type(input2->data_type()) + "_";
+    function_to_call += string_from_data_type(output->data_type());
 
     auto it = map_function.find(function_to_call);
 
@@ -1078,7 +1078,7 @@
 
 template <ArithmeticOperation op>
 std::function<void(const ITensor *, const ITensor *, ITensor *, const Window &)>
-configure_arithm_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+configure_arithm_func(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
     static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function =
     {
@@ -1097,7 +1097,7 @@
 
 template <ComparisonOperation op>
 std::function<void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window)>
-configure_comp_func(const ITensor *input1, const ITensor *input2, ITensor *output)
+configure_comp_func(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
     static std::map<std::string, NEElementwiseOperationKernel::ElementwiseFunction *> map_function =
     {
@@ -1140,41 +1140,36 @@
     return Status{};
 }
 
-void NEElementwiseOperationKernel::configure_common(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEElementwiseOperationKernel::configure_common(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
 
     // Configure kernel window
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1->info(), *input2->info());
+    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
     const TensorShape &out_shape    = broadcast_pair.first;
     const ValidRegion &valid_region = broadcast_pair.second;
 
     // Auto initialize output if not initialized
-    auto_init_if_empty(*output->info(), out_shape, 1, input1->info()->data_type());
+    auto_init_if_empty(*output, out_shape, 1, input1->data_type());
 
     Window win = calculate_max_window(valid_region);
 
-    _input1 = input1;
-    _input2 = input2;
-    _output = output;
-
     INEKernel::configure(win);
 }
 
-void NEElementwiseOperationKernel::run(const Window &window, const ThreadInfo &info)
+void NEElementwiseOperationKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info, window);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_function == nullptr);
-    _function(_input1, _input2, _output, window);
+    _function(inputs.at(TensorType::ACL_SRC_0), inputs.at(TensorType::ACL_SRC_1), outputs.at(TensorType::ACL_DST), window);
 }
 
 /** Arithmetic operators (min, max, squared_diff) */
-
-void NEArithmeticOperationKernel::configure(ArithmeticOperation op, const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEArithmeticOperationKernel::configure(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
     configure_common(input1, input2, output);
     switch(op)
     {
@@ -1215,9 +1210,9 @@
 
 /** The division operator */
 
-void NEDivisionOperationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEDivisionOperationKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
     configure_common(input1, input2, output);
     _function = configure_arithm_func<ArithmeticOperation::DIV>(input1, input2, output);
 }
@@ -1236,9 +1231,9 @@
 }
 
 /** The power operator */
-void NEPowerOperationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEPowerOperationKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
     configure_common(input1, input2, output);
     _function = configure_arithm_func<ArithmeticOperation::POWER>(input1, input2, output);
 }
@@ -1257,10 +1252,9 @@
 }
 
 /** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
-
-void NEComparisonOperationKernel::configure(ComparisonOperation op, const ITensor *input1, const ITensor *input2, ITensor *output)
+void NEComparisonOperationKernel::configure(ComparisonOperation op, const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1, *input2, *output));
     configure_common(input1, input2, output);
     switch(op)
     {
diff --git a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
index c141eec..eb1139d7 100644
--- a/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReshapeLayerKernel.cpp
@@ -86,14 +86,14 @@
     INEKernel::configure(win);
 }
 
-void NEReshapeLayerKernel::run_op(const std::vector<InputTensor> &inputs, const std::vector<OutputTensor> &outputs, const Window &window, const ThreadInfo &info)
+void NEReshapeLayerKernel::run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const auto src = inputs[0].tensor;
-    auto       dst = outputs[0].tensor;
+    const auto src = inputs.at(TensorType::ACL_SRC);
+    auto       dst = outputs.at(TensorType::ACL_DST);
 
     switch(src->info()->data_type())
     {
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 41e1a2d..af6d8d7 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -363,7 +363,7 @@
 }
 #endif /* DOXYGEN_SKIP_THIS */
 
-void CPPScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const std::vector<InputTensor> &inputs, const std::vector<OutputTensor> &outputs)
+void CPPScheduler::schedule_common(ICPPKernel *kernel, const Hints &hints, const InputTensorMap &inputs, const OutputTensorMap &outputs)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
@@ -473,15 +473,15 @@
     }
 }
 
-void CPPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const std::vector<InputTensor> &inputs, const std::vector<OutputTensor> &outputs)
+void CPPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const InputTensorMap &inputs, const OutputTensorMap &outputs)
 {
     schedule_common(kernel, hints, inputs, outputs);
 }
 
 void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
 {
-    const std::vector<InputTensor> inputs;
-    std::vector<OutputTensor>      outputs;
+    const InputTensorMap inputs;
+    OutputTensorMap      outputs;
     schedule_common(kernel, hints, inputs, outputs);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index 8257628..63c6c7b 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -49,7 +49,7 @@
     kernel->run(kernel->window(), info);
 }
 
-void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const std::vector<InputTensor> &inputs, const std::vector<OutputTensor> &outputs)
+void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const InputTensorMap &inputs, const OutputTensorMap &outputs)
 {
     ARM_COMPUTE_UNUSED(hints);
     ThreadInfo info;
diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp
index 7879085..00dab75 100644
--- a/src/runtime/NEON/INEOperator.cpp
+++ b/src/runtime/NEON/INEOperator.cpp
@@ -33,7 +33,7 @@
 {
 }
 
-void INEOperator::run(std::vector<InputTensor> inputs, std::vector<OutputTensor> outputs, std::vector<OperatorTensor> workspace)
+void INEOperator::run(InputTensorMap inputs, OutputTensorMap outputs, OperatorTensorMap workspace)
 {
     ARM_COMPUTE_UNUSED(workspace);
 
@@ -45,7 +45,7 @@
     NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, inputs, outputs);
 }
 
-void INEOperator::prepare(std::vector<OperatorTensor> constants)
+void INEOperator::prepare(OperatorTensorMap constants)
 {
     ARM_COMPUTE_UNUSED(constants);
 }
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index 889ff6b..0322238 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -90,9 +90,9 @@
 
 void NEActivationLayer::run()
 {
-    const InputTensor src{ TensorType::ACL_SRC, _impl->src };
-    OutputTensor      dst{ TensorType::ACL_DST, _impl->dst };
+    const InputTensorMap  src{ { TensorType::ACL_SRC, _impl->src } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
 
-    _impl->op->run({ src }, { dst }, {});
+    _impl->op->run(src, dst, {});
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEElementwiseOperators.cpp b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
index 926ae1f..63fd565 100644
--- a/src/runtime/NEON/functions/NEElementwiseOperators.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseOperators.cpp
@@ -32,7 +32,9 @@
 
 namespace arm_compute
 {
-void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+namespace experimental
+{
+void NEElementwiseMax::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
@@ -46,7 +48,12 @@
     return NEArithmeticOperationKernel::validate(ArithmeticOperation::MAX, input1, input2, output);
 }
 
-void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+MemoryRequirements NEElementwiseMax::workspace() const
+{
+    return MemoryRequirements{};
+}
+
+void NEElementwiseMin::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
@@ -60,7 +67,12 @@
     return NEArithmeticOperationKernel::validate(ArithmeticOperation::MIN, input1, input2, output);
 }
 
-void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+MemoryRequirements NEElementwiseMin::workspace() const
+{
+    return MemoryRequirements{};
+}
+
+void NEElementwiseSquaredDiff::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
@@ -74,7 +86,12 @@
     return NEArithmeticOperationKernel::validate(ArithmeticOperation::SQUARED_DIFF, input1, input2, output);
 }
 
-void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+MemoryRequirements NEElementwiseSquaredDiff::workspace() const
+{
+    return MemoryRequirements{};
+}
+
+void NEElementwiseDivision::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEDivisionOperationKernel>();
@@ -88,7 +105,12 @@
     return NEDivisionOperationKernel::validate(input1, input2, output);
 }
 
-void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+MemoryRequirements NEElementwiseDivision::workspace() const
+{
+    return MemoryRequirements{};
+}
+
+void NEElementwisePower::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     auto k = arm_compute::support::cpp14::make_unique<NEPowerOperationKernel>();
@@ -102,8 +124,13 @@
     return NEPowerOperationKernel::validate(input1, input2, output);
 }
 
+MemoryRequirements NEElementwisePower::workspace() const
+{
+    return MemoryRequirements{};
+}
+
 template <ComparisonOperation COP>
-void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
+void NEElementwiseComparisonStatic<COP>::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
     k->configure(COP, input1, input2, output);
@@ -116,7 +143,13 @@
     return NEComparisonOperationKernel::validate(COP, input1, input2, output);
 }
 
-void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
+template <ComparisonOperation COP>
+MemoryRequirements            NEElementwiseComparisonStatic<COP>::workspace() const
+{
+    return MemoryRequirements{};
+}
+
+void NEElementwiseComparison::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ComparisonOperation op)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEComparisonOperationKernel>();
     k->configure(op, input1, input2, output);
@@ -128,6 +161,294 @@
     return NEComparisonOperationKernel::validate(op, input1, input2, output);
 }
 
+MemoryRequirements NEElementwiseComparison::workspace() const
+{
+    return MemoryRequirements{};
+}
+
+// Supported Specializations
+template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Greater>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::GreaterEqual>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::Less>;
+template class NEElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
+} // namespace experimental
+
+struct NEElementwiseMax::Impl
+{
+    const ITensor                                  *src_0{ nullptr };
+    const ITensor                                  *src_1{ nullptr };
+    ITensor                                        *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseMax> op{ nullptr };
+};
+
+NEElementwiseMax::NEElementwiseMax()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
+NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default;
+NEElementwiseMax::~NEElementwiseMax()                              = default;
+
+void NEElementwiseMax::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseMax>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return experimental::NEElementwiseMax::validate(input1, input2, output, act_info);
+}
+
+void NEElementwiseMax::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
+
+struct NEElementwiseMin::Impl
+{
+    const ITensor                                  *src_0{ nullptr };
+    const ITensor                                  *src_1{ nullptr };
+    ITensor                                        *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseMin> op{ nullptr };
+};
+
+NEElementwiseMin::NEElementwiseMin()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
+NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default;
+NEElementwiseMin::~NEElementwiseMin()                              = default;
+
+void NEElementwiseMin::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseMin>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return experimental::NEElementwiseMin::validate(input1, input2, output, act_info);
+}
+
+void NEElementwiseMin::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
+
+struct NEElementwiseSquaredDiff::Impl
+{
+    const ITensor                                          *src_0{ nullptr };
+    const ITensor                                          *src_1{ nullptr };
+    ITensor                                                *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseSquaredDiff> op{ nullptr };
+};
+
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff()                                      = default;
+
+void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseSquaredDiff>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return experimental::NEElementwiseSquaredDiff::validate(input1, input2, output, act_info);
+}
+
+void NEElementwiseSquaredDiff::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
+
+struct NEElementwiseDivision::Impl
+{
+    const ITensor                                       *src_0{ nullptr };
+    const ITensor                                       *src_1{ nullptr };
+    ITensor                                             *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseDivision> op{ nullptr };
+};
+
+NEElementwiseDivision::NEElementwiseDivision()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
+NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default;
+NEElementwiseDivision::~NEElementwiseDivision()                                   = default;
+
+void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseDivision>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return experimental::NEElementwiseDivision::validate(input1, input2, output, act_info);
+}
+
+void NEElementwiseDivision::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
+
+struct NEElementwisePower::Impl
+{
+    const ITensor                                    *src_0{ nullptr };
+    const ITensor                                    *src_1{ nullptr };
+    ITensor                                          *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwisePower> op{ nullptr };
+};
+
+NEElementwisePower::NEElementwisePower()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
+NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default;
+NEElementwisePower::~NEElementwisePower()                                = default;
+
+void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_UNUSED(act_info);
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwisePower>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), act_info);
+}
+
+Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
+    return experimental::NEElementwisePower::validate(input1, input2, output, act_info);
+}
+
+void NEElementwisePower::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
+
+template <ComparisonOperation COP>
+struct NEElementwiseComparisonStatic<COP>::Impl
+{
+    const ITensor                                                    *src_0{ nullptr };
+    const ITensor                                                    *src_1{ nullptr };
+    ITensor                                                          *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseComparisonStatic<COP>> op{ nullptr };
+};
+
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation       COP>
+NEElementwiseComparisonStatic<COP> &NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation       COP>
+NEElementwiseComparisonStatic<COP>::~NEElementwiseComparisonStatic() = default;
+
+template <ComparisonOperation COP>
+void NEElementwiseComparisonStatic<COP>::configure(ITensor *input1, ITensor *input2, ITensor *output)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseComparisonStatic<COP>>();
+    _impl->op->configure(input1->info(), input2->info(), output->info());
+}
+
+template <ComparisonOperation COP>
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+{
+    return experimental::NEElementwiseComparisonStatic<COP>::validate(input1, input2, output);
+}
+
+template <ComparisonOperation COP>
+void                          NEElementwiseComparisonStatic<COP>::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
+
+struct NEElementwiseComparison::Impl
+{
+    const ITensor                                         *src_0{ nullptr };
+    const ITensor                                         *src_1{ nullptr };
+    ITensor                                               *dst{ nullptr };
+    std::unique_ptr<experimental::NEElementwiseComparison> op{ nullptr };
+};
+
+NEElementwiseComparison::NEElementwiseComparison()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
+NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default;
+NEElementwiseComparison::~NEElementwiseComparison()                                     = default;
+
+void NEElementwiseComparison::configure(ITensor *input1, ITensor *input2, ITensor *output, ComparisonOperation op)
+{
+    _impl->src_0 = input1;
+    _impl->src_1 = input2;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEElementwiseComparison>();
+    _impl->op->configure(input1->info(), input2->info(), output->info(), op);
+}
+
+Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+{
+    return experimental::NEElementwiseComparison::validate(input1, input2, output, op);
+}
+
+void NEElementwiseComparison::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
+
 // Supported Specializations
 template class NEElementwiseComparisonStatic<ComparisonOperation::Equal>;
 template class NEElementwiseComparisonStatic<ComparisonOperation::NotEqual>;
diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp
index 02dfc6f..1dd01fc 100644
--- a/src/runtime/NEON/functions/NEPReluLayer.cpp
+++ b/src/runtime/NEON/functions/NEPReluLayer.cpp
@@ -29,7 +29,9 @@
 
 namespace arm_compute
 {
-void NEPReluLayer::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+namespace experimental
+{
+void NEPReluLayer::configure(const ITensorInfo *input, const ITensorInfo *alpha, ITensorInfo *output)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEArithmeticOperationKernel>();
     k->configure(ArithmeticOperation::PRELU, input, alpha, output);
@@ -40,4 +42,47 @@
 {
     return NEArithmeticOperationKernel::validate(ArithmeticOperation::PRELU, input, alpha, output);
 }
+
+MemoryRequirements NEPReluLayer::workspace() const
+{
+    return MemoryRequirements{};
+}
+} // namespace experimental
+
+struct NEPReluLayer::Impl
+{
+    const ITensor                              *src_0{ nullptr };
+    const ITensor                              *src_1{ nullptr };
+    ITensor                                    *dst{ nullptr };
+    std::unique_ptr<experimental::NEPReluLayer> op{ nullptr };
+};
+
+NEPReluLayer::NEPReluLayer()
+    : _impl(support::cpp14::make_unique<Impl>())
+{
+}
+NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default;
+NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default;
+NEPReluLayer::~NEPReluLayer()                          = default;
+
+void NEPReluLayer::configure(const ITensor *input, const ITensor *alpha, ITensor *output)
+{
+    _impl->src_0 = input;
+    _impl->src_1 = alpha;
+    _impl->dst   = output;
+    _impl->op    = arm_compute::support::cpp14::make_unique<experimental::NEPReluLayer>();
+    _impl->op->configure(input->info(), alpha->info(), output->info());
+}
+
+void NEPReluLayer::run()
+{
+    const InputTensorMap  src{ { TensorType::ACL_SRC_0, _impl->src_0 }, { TensorType::ACL_SRC_1, _impl->src_1 } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+    _impl->op->run(src, dst, {});
+}
+
+Status NEPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
+{
+    return experimental::NEPReluLayer::validate(input, alpha, output);
+}
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index daf358e..2b866b5 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -89,8 +89,9 @@
 
 void NEReshapeLayer::run()
 {
-    const InputTensor src{ TensorType::ACL_SRC, _impl->src };
-    OutputTensor      dst{ TensorType::ACL_DST, _impl->dst };
-    _impl->op->run({ src }, { dst }, {});
+    const InputTensorMap  src{ { TensorType::ACL_SRC, _impl->src } };
+    const OutputTensorMap dst{ { TensorType::ACL_DST, _impl->dst } };
+
+    _impl->op->run(src, dst, {});
 }
 } // namespace arm_compute
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index 6d6b285..5b4b76a 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -83,7 +83,7 @@
     }
 }
 
-void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const std::vector<InputTensor> &inputs, const std::vector<OutputTensor> &outputs)
+void OMPScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const InputTensorMap &inputs, const OutputTensorMap &outputs)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
     ARM_COMPUTE_ERROR_ON_MSG(hints.strategy() == StrategyHint::DYNAMIC,