COMPMID-3373: Async support to NEArithmetic* kernels/functions (Pt. 1)

Added support on NEArithmeticAddition and NEArithmeticSubtraction

Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: Ifa805f8455ef6eff1ee627752dc1c7fe9740ec47
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3451
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
index bff34df..f254027 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -68,12 +68,12 @@
      *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
      *   - (QSYMM16,QSYMM16) -> QSYMM16
      *
-     * @param[in]  input1 First input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
-     * @param[in]  input2 Second input tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
-     * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in]  input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in]  input2 Second input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[out] output The output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
      * @param[in]  policy Overflow policy.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel
      *
      * @param[in] input1 First input tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
@@ -86,7 +86,7 @@
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) override;
 
 private:
     /** Common signature for all the specialised add functions
@@ -99,11 +99,8 @@
      */
     using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const Window &window);
     /** Add function to use for the particular tensor types passed to configure() */
-    AddFunction   *_func;
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
-    ConvertPolicy  _policy;
+    AddFunction *_func;
+    ConvertPolicy _policy;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H */
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
index f75c6bf..dfd08d9 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -71,7 +71,7 @@
      * @param[out] output The output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32.
      * @param[in]  policy Overflow policy. Convert policy cannot be WRAP if datatype is quantized.
      */
-    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel
      *
      * @note Convert policy cannot be WRAP if datatype is QASYMM8
@@ -86,7 +86,7 @@
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(const InputTensorMap &inputs, const OutputTensorMap &outputs, const Window &window, const ThreadInfo &info) override;
 
 private:
     /** Common signature for all the specialised sub functions
@@ -99,11 +99,8 @@
      */
     using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window, bool is_sat);
     /** Sub function to use for the particular tensor types passed to configure() */
-    SubFunction   *_func;
-    const ITensor *_input1;
-    const ITensor *_input2;
-    ITensor       *_output;
-    ConvertPolicy  _policy;
+    SubFunction *_func;
+    ConvertPolicy _policy;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H */
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
index 2bf12df..589e062 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticAddition.h
@@ -25,14 +25,17 @@
 #define ARM_COMPUTE_NEARITHMETICADDITION_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunctionNoBorder.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
 class ITensor;
 
+namespace experimental
+{
 /** Basic function to run @ref NEArithmeticAdditionKernel */
-class NEArithmeticAddition : public INESimpleFunctionNoBorder
+class NEArithmeticAddition : public INEOperator
 {
 public:
     /** Initialise the kernel's inputs, output and conversion policy.
@@ -51,13 +54,69 @@
      *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
      *   - (QSYMM16,QSYMM16) -> QSYMM16
      *
+     * @param[in]  input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in]  input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[out] output   Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in]  policy   Policy to use to handle overflow.
+     * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     */
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition
+     *
+     * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] output   Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
+     * @param[in] policy   Policy to use to handle overflow
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    MemoryRequirements workspace() const override;
+};
+} // namespace experimental
+
+/** Basic function to run @ref NEArithmeticAdditionKernel */
+class NEArithmeticAddition : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEArithmeticAddition();
+    /** Default Destructor */
+    ~NEArithmeticAddition();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticAddition(const NEArithmeticAddition &) = delete;
+    /** Default move constructor */
+    NEArithmeticAddition(NEArithmeticAddition &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticAddition &operator=(const NEArithmeticAddition &) = delete;
+    /** Default move assignment operator */
+    NEArithmeticAddition &operator=(NEArithmeticAddition &&);
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * Valid configurations (Input1,Input2) -> Output :
+     *
+     *   - (U8,U8)           -> U8
+     *   - (U8,U8)           -> S16
+     *   - (S16,U8)          -> S16
+     *   - (U8,S16)          -> S16
+     *   - (S16,S16)         -> S16
+     *   - (S32,S32)         -> S32
+     *   - (F16,F16)         -> F16
+     *   - (F32,F32)         -> F32
+     *   - (QASYMM8,QASYMM8) -> QASYMM8
+     *   - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+     *   - (QSYMM16,QSYMM16) -> QSYMM16
+     *
      * @param[in]  input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
      * @param[in]  input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
      * @param[out] output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
      * @param[in]  policy   Policy to use to handle overflow.
      * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAddition
      *
      * @param[in] input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32
@@ -69,6 +128,13 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEARITHMETICADDITION_H */
diff --git a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
index 31d1698..0bab911 100644
--- a/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
+++ b/arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h
@@ -25,12 +25,15 @@
 #define ARM_COMPUTE_NEARITHMETICSUBTRACTION_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/runtime/NEON/INESimpleFunction.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/NEON/INEOperator.h"
 
 namespace arm_compute
 {
 class ITensor;
 
+namespace experimental
+{
 /** Basic function to run @ref NEArithmeticSubtractionKernel
  *
  * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/F32.
@@ -39,18 +42,18 @@
  *  This function calls the following kernels:
  * -# @ref NEArithmeticSubtractionKernel
  */
-class NEArithmeticSubtraction : public INESimpleFunction
+class NEArithmeticSubtraction : public INEOperator
 {
 public:
     /** Initialise the kernel's inputs, output and conversion policy.
      *
-     * @param[in]  input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[in]  input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
-     * @param[out] output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[in]  input1   First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[in]  input2   Second tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[out] output   Output tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
      * @param[in]  policy   Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
      * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensor *input1, ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction
      *
      * @param[in] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32
@@ -62,6 +65,62 @@
      * @return a status
      */
     static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    MemoryRequirements workspace() const override;
+};
+} // namespace experimental
+
+/** Basic function to run @ref NEArithmeticSubtractionKernel
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/F16/F32.
+ * @note The function performs an arithmetic subtraction between two tensors.
+ *
+ *  This function calls the following kernels:
+ * -# @ref NEArithmeticSubtractionKernel
+ */
+class NEArithmeticSubtraction : public IFunction
+{
+public:
+    /** Default Constructor */
+    NEArithmeticSubtraction();
+    /** Default Destructor */
+    ~NEArithmeticSubtraction();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticSubtraction(const NEArithmeticSubtraction &) = delete;
+    /** Default move constructor */
+    NEArithmeticSubtraction(NEArithmeticSubtraction &&);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticSubtraction &operator=(const NEArithmeticSubtraction &) = delete;
+    /** Default move assignment operator */
+    NEArithmeticSubtraction &operator=(NEArithmeticSubtraction &&);
+    /** Initialise the kernel's inputs, output and conversion policy.
+     *
+     * @param[in]  input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[in]  input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[out] output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[in]  policy   Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
+     * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtraction
+     *
+     * @param[in] input1   First tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[in] input2   Second tensor input. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[in] output   Output tensor. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/QSYMM16/S16/F16/F32
+     * @param[in] policy   Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
+     * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run() override;
+
+private:
+    struct Impl;
+    std::unique_ptr<Impl> _impl;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEARITHMETICSUBTRACTION_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 8dc6b88..b89a373 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -24,7 +24,6 @@
 #ifndef ARM_COMPUTE_NEGEMM_H
 #define ARM_COMPUTE_NEGEMM_H
 
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h"
@@ -35,6 +34,7 @@
 #include "arm_compute/runtime/IWeightsManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMAssemblyDispatch.h"
 #include "arm_compute/runtime/Tensor.h"
 
@@ -112,7 +112,7 @@
     NEGEMMAssemblyDispatch     _asm_glue;
     NEGEMMMatrixAdditionKernel _ma_kernel;
     NEActivationLayer          _alpha_scale_func;
-    NEArithmeticAdditionKernel _add_bias_kernel;
+    NEArithmeticAddition       _add_bias;
     NEActivationLayer          _activation_func;
 
     Tensor         _tmp_a;
diff --git a/arm_compute/runtime/NEON/functions/NELSTMLayer.h b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
index 6484511..b9b581c 100644
--- a/arm_compute/runtime/NEON/functions/NELSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NELSTMLayer.h
@@ -25,13 +25,12 @@
 #define ARM_COMPUTE_NELSTMLAYER_H
 
 #include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
 #include "arm_compute/core/NEON/kernels/NECopyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
@@ -149,7 +149,7 @@
     MemoryGroup                     _memory_group;
     NEFullyConnectedLayer           _fully_connected_input_gate;
     NEArithmeticAddition            _accum_input_gate1;
-    NEArithmeticSubtractionKernel   _subtract_input_gate;
+    NEArithmeticSubtraction         _subtract_input_gate;
     NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate;
     NEActivationLayer               _activation_input_gate;
     NEFullyConnectedLayer           _fully_connected_forget_gate;
@@ -159,8 +159,8 @@
     NEFullyConnectedLayer           _fully_connected_cell_state;
     NEGEMM                          _gemm_cell_state1;
     NETransposeKernel               _transpose_cell_state;
-    NEArithmeticAdditionKernel      _accum_cell_state1;
-    NEArithmeticAdditionKernel      _accum_cell_state2;
+    NEArithmeticAddition            _accum_cell_state1;
+    NEArithmeticAddition            _accum_cell_state2;
     NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_state1;
     NEActivationLayer               _activation_cell_state;
     NEActivationLayer               _cell_clip;
@@ -182,16 +182,16 @@
     NEConcatenateLayer              _concat_weights_output;
     NEMeanStdDevNormalizationLayer  _mean_std_norm_input_gate;
     NEPixelWiseMultiplicationKernel _pixelwise_mul_input_gate_coeff;
-    NEArithmeticAdditionKernel      _accum_input_gate_bias;
+    NEArithmeticAddition            _accum_input_gate_bias;
     NEMeanStdDevNormalizationLayer  _mean_std_norm_forget_gate;
     NEPixelWiseMultiplicationKernel _pixelwise_mul_forget_gate_coeff;
-    NEArithmeticAdditionKernel      _accum_forget_gate_bias;
+    NEArithmeticAddition            _accum_forget_gate_bias;
     NEMeanStdDevNormalizationLayer  _mean_std_norm_cell_gate;
     NEPixelWiseMultiplicationKernel _pixelwise_mul_cell_gate_coeff;
-    NEArithmeticAdditionKernel      _accum_cell_gate_bias;
+    NEArithmeticAddition            _accum_cell_gate_bias;
     NEMeanStdDevNormalizationLayer  _mean_std_norm_output_gate;
     NEPixelWiseMultiplicationKernel _pixelwise_mul_output_gate_coeff;
-    NEArithmeticAdditionKernel      _accum_output_gate_bias;
+    NEArithmeticAddition            _accum_output_gate_bias;
     Tensor                          _input_gate_out1;
     Tensor                          _input_gate_out2;
     Tensor                          _input_gate_out3;
diff --git a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
index d1cc962..60c8fa1 100644
--- a/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEQLSTMLayer.h
@@ -24,14 +24,14 @@
 #ifndef ARM_COMPUTE_NEQLSTMLAYER_H
 #define ARM_COMPUTE_NEQLSTMLAYER_H
 
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
-#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h"
 #include "arm_compute/core/NEON/kernels/NECopyKernel.h"
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
 #include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h"
 #include "arm_compute/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
@@ -48,7 +48,7 @@
  * This function calls the following NEON functions/kernels:
  *
  * -# @ref NEActivationLayer                                     Activation functions (tanh and logistic)
- * -# @ref NEArithmeticAdditionKernel                            Elementwise addition
+ * -# @ref NEArithmeticAddition                                  Elementwise addition
  * -# @ref NEArithmeticSubtractionKernel                         Elementwise subtraction
  * -# @ref NECopyKernel                                          Copy kernel for copying output_state_out to output
  * -# @ref NEGEMMLowpMatrixMultiplyCore                          Quantized matrix multiplication core. Accumulators are 32-bit integers
@@ -254,51 +254,51 @@
     NEGEMMLowpMatrixAReductionKernel _input_to_output_reduction{};
     NEGEMMLowpMatrixAReductionKernel _recurrent_to_output_reduction{};
     NEGEMMLowpMatrixAReductionKernel _projection_reduction{};
-    NEArithmeticAdditionKernel       _projection_bias_add{};
+    NEArithmeticAddition             _projection_bias_add{};
     NEGEMMLowpMatrixMultiplyCore     _mm_input_to_forget{};
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_forget{};
     NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_forget{};
     NEGEMMLowpOutputStage            _input_to_forget_outstage{};
     NEGEMMLowpOutputStage            _recurrent_to_forget_outstage{};
     NEGEMMLowpOutputStage            _cell_to_forget_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_forget{};
-    NEArithmeticAdditionKernel       _accumulate_cell_forget{};
+    NEArithmeticAddition             _accumulate_input_recurrent_forget{};
+    NEArithmeticAddition             _accumulate_cell_forget{};
     NEActivationLayer                _forget_gate_sigmoid{};
     NEGEMMLowpMatrixMultiplyCore     _mm_input_to_cell{};
     NEGEMMLowpOutputStage            _input_to_cell_outstage{};
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_cell{};
     NEGEMMLowpOutputStage            _recurrent_to_cell_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_modulation{};
+    NEArithmeticAddition             _accumulate_input_recurrent_modulation{};
     NEActivationLayer                _cell_gate_tanh{};
-    NEArithmeticSubtractionKernel    _input_gate_sub{};
+    NEArithmeticSubtraction          _input_gate_sub{};
     NEGEMMLowpMatrixMultiplyCore     _mm_input_to_input{};
     NEGEMMLowpOutputStage            _input_to_input_outstage{};
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_input{};
     NEGEMMLowpOutputStage            _recurrent_to_input_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_input{};
+    NEArithmeticAddition             _accumulate_input_recurrent_input{};
     NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_input{};
     NEGEMMLowpOutputStage            _cell_to_input_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_cell_input{};
+    NEArithmeticAddition             _accumulate_cell_input{};
     NEActivationLayer                _input_gate_sigmoid{};
     NEPixelWiseMultiplicationKernel  _pixelwise_mul_forget_cell{};
     NEPixelWiseMultiplicationKernel  _pixelwise_mul_input_cell{};
-    NEArithmeticAdditionKernel       _add_forget_cell{};
+    NEArithmeticAddition             _add_forget_cell{};
     NEActivationLayer                _cell_clip{};
     NEGEMMLowpMatrixMultiplyCore     _mm_input_to_output{};
     NEGEMMLowpOutputStage            _input_to_output_outstage{};
     NEGEMMLowpMatrixMultiplyCore     _mm_recurrent_to_output{};
     NEGEMMLowpOutputStage            _recurrent_to_output_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_input_recurrent_output{};
+    NEArithmeticAddition             _accumulate_input_recurrent_output{};
     NEPixelWiseMultiplicationKernel  _pixelwise_mul_cell_to_output{};
     NEGEMMLowpOutputStage            _cell_to_output_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_cell_to_output{};
+    NEArithmeticAddition             _accumulate_cell_to_output{};
     NEActivationLayer                _output_gate_sigmoid{};
     NEActivationLayer                _hidden_tanh{};
     NEPixelWiseMultiplicationKernel  _pixelwise_mul_hidden{};
     NEGEMMLowpOutputStage            _hidden_outstage{};
     NEGEMMLowpMatrixMultiplyCore     _mm_projection{};
     NEGEMMLowpOutputStage            _projection_outstage{};
-    NEArithmeticAdditionKernel       _accumulate_projection{};
+    NEArithmeticAddition             _accumulate_projection{};
     NEActivationLayer                _projection_clip{};
 
     TensorCopyKernel _projection_bias_copy{};
@@ -311,7 +311,10 @@
     NECopyKernel _copy_output{};
 
     // Tensor pointers
-    const ITensor *_input_to_input_weights{ nullptr };
+    const ITensor *_input_to_input_weights
+    {
+        nullptr
+    };
     const ITensor *_recurrent_to_input_weights{ nullptr };
     const ITensor *_projection_bias{ nullptr };
     const ITensor *_input_to_forget_weights{ nullptr };
@@ -370,7 +373,10 @@
     {
         // Output quantization scale will be different, but ignored here
         // since it will be configured at configure() stage.
-        const TensorInfo out{ in };
+        const TensorInfo out
+        {
+            in
+        };
         return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
     }
 
diff --git a/arm_compute/runtime/NEON/functions/NERNNLayer.h b/arm_compute/runtime/NEON/functions/NERNNLayer.h
index db4134f..25cb74d 100644
--- a/arm_compute/runtime/NEON/functions/NERNNLayer.h
+++ b/arm_compute/runtime/NEON/functions/NERNNLayer.h
@@ -24,11 +24,11 @@
 #ifndef ARM_COMPUTE_NERNNLAYER_H
 #define ARM_COMPUTE_NERNNLAYER_H
 
-#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h"
 #include "arm_compute/core/NEON/kernels/NECopyKernel.h"
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
+#include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 #include "arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h"
 #include "arm_compute/runtime/NEON/functions/NEGEMM.h"
 
@@ -82,16 +82,16 @@
     void prepare() override;
 
 private:
-    MemoryGroup                _memory_group;
-    NEGEMM                     _gemm_state_f;
-    NEArithmeticAdditionKernel _add_kernel;
-    NEActivationLayer          _activation;
-    NEFullyConnectedLayer      _fully_connected;
-    NECopyKernel               _copy_kernel;
-    Tensor                     _fully_connected_out;
-    Tensor                     _gemm_output;
-    Tensor                     _add_output;
-    bool                       _is_prepared;
+    MemoryGroup           _memory_group;
+    NEGEMM                _gemm_state_f;
+    NEArithmeticAddition  _add_f;
+    NEActivationLayer     _activation;
+    NEFullyConnectedLayer _fully_connected;
+    NECopyKernel          _copy_kernel;
+    Tensor                _fully_connected_out;
+    Tensor                _gemm_output;
+    Tensor                _add_output;
+    bool                  _is_prepared;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NERNNLAYER_H */