Port NEGEMMLowp Part 2

Details:
Extend NEConvertQuantizedSignednessKernel
Port NEGEMMInterleave4x4Kernel to CpuGemmInterleave4x4Kernel
Port NEGEMMTranspose1xWKernel to CpuGemmTranspose1xWKernel
Port NEGEMMLowpMatrixAReductionKernel to CpuGemmLowpMatrixAReductionKernel
Port NEGEMMLowpMatrixBReductionKernel to CpuGemmLowpMatrixBReductionKernel
Port NEGEMMLowpOffsetContributionOutputStageKernel to CpuGemmLowpOffsetContributionOutputStageKernel
Port NEGEMMLowpOffsetContributionKernel to CpuGemmLowpOffsetContributionKernel

Resolves: COMPMID-4403

Change-Id: I3227f052f25e7b41d073bbea1da8a881fcd78b8e
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5875
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index f3a3d23..946791a 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEQLSTMLayer.h"
 
+#include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/QuantizationInfo.h"
 #include "arm_compute/core/Utils.h"
@@ -30,12 +31,8 @@
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpOffsetContributionOutputStageKernel.h"
-#include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
 #include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
+#include "src/core/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
@@ -223,29 +220,29 @@
         _input_to_input_weights     = lstm_params.input_to_input_weights();
         _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
 
-        _input_to_input_reduction     = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-        _recurrent_to_input_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-        _input_to_input_reduction->configure(_input_to_input_weights, &_input_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-        _recurrent_to_input_reduction->configure(_recurrent_to_input_weights, &_recurrent_to_input_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+        _input_to_input_reduction     = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+        _recurrent_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+        _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+        _recurrent_to_input_reduction->configure(_recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
     }
 
-    _input_to_forget_reduction     = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-    _recurrent_to_forget_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-    _input_to_cell_reduction       = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-    _recurrent_to_cell_reduction   = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-    _input_to_output_reduction     = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-    _recurrent_to_output_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
+    _input_to_forget_reduction     = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+    _recurrent_to_forget_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+    _input_to_cell_reduction       = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+    _recurrent_to_cell_reduction   = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+    _input_to_output_reduction     = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+    _recurrent_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
 
-    _recurrent_to_cell_reduction->configure(input_to_forget_weights, &_input_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights, &_recurrent_to_forget_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_cell_reduction->configure(input_to_cell_weights, &_input_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights, &_recurrent_to_cell_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_output_reduction->configure(input_to_output_weights, &_input_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_output_reduction->configure(recurrent_to_output_weights, &_recurrent_to_output_eff_bias, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_output_reduction->configure(recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
     if(_has_projection)
     {
-        _projection_reduction = std::make_unique<NEGEMMLowpMatrixAReductionKernel>();
-        _projection_reduction->configure(_projection_weights, &_projection_eff_bias, GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+        _projection_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
+        _projection_reduction->configure(_projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
         if(_projection_bias != nullptr)
         {
             _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
@@ -658,21 +655,26 @@
     const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
     if(!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
-                                                                               true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
+                                                                                              -qinput.offset, true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
+                                                                                              -qoutput_state_in.offset,
+                                                                                              true)));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
+                                                                                          -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
+                                                                                          true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
+                                                                                          -qoutput_state_in.offset, true)));
     if(lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
-                                                                               lstm_params.hidden_state_zero(),
-                                                                               true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
+                                                                                              lstm_params.hidden_state_zero(),
+                                                                                              true)));
         if(lstm_params.projection_bias() != nullptr)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
@@ -1107,8 +1109,20 @@
         {
             _input_to_input_eff_bias.allocator()->allocate();
             _recurrent_to_input_eff_bias.allocator()->allocate();
-            NEScheduler::get().schedule(_input_to_input_reduction.get(), Window::DimY);
-            NEScheduler::get().schedule(_recurrent_to_input_reduction.get(), Window::DimY);
+
+            ITensorPack packII =
+            {
+                { TensorType::ACL_SRC, _input_to_input_weights },
+                { TensorType::ACL_DST, &_input_to_input_eff_bias }
+            };
+            NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, _input_to_input_reduction->window(), packII);
+
+            ITensorPack packRI =
+            {
+                { TensorType::ACL_SRC, _recurrent_to_input_weights },
+                { TensorType::ACL_DST, &_recurrent_to_input_eff_bias }
+            };
+            NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, _recurrent_to_input_reduction->window(), packRI);
 
             _input_to_input_weights_transposed.allocator()->allocate();
             _recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1123,17 +1137,58 @@
         _recurrent_to_cell_eff_bias.allocator()->allocate();
         _input_to_output_eff_bias.allocator()->allocate();
         _recurrent_to_output_eff_bias.allocator()->allocate();
-        NEScheduler::get().schedule(_input_to_forget_reduction.get(), Window::DimY);
-        NEScheduler::get().schedule(_recurrent_to_forget_reduction.get(), Window::DimY);
-        NEScheduler::get().schedule(_input_to_cell_reduction.get(), Window::DimY);
-        NEScheduler::get().schedule(_recurrent_to_cell_reduction.get(), Window::DimY);
-        NEScheduler::get().schedule(_input_to_output_reduction.get(), Window::DimY);
-        NEScheduler::get().schedule(_recurrent_to_output_reduction.get(), Window::DimY);
+
+        ITensorPack packIF =
+        {
+            { TensorType::ACL_SRC, _input_to_forget_weights },
+            { TensorType::ACL_DST, &_input_to_forget_eff_bias }
+        };
+        NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, _input_to_forget_reduction->window(), packIF);
+
+        ITensorPack packRF =
+        {
+            { TensorType::ACL_SRC, _recurrent_to_forget_weights },
+            { TensorType::ACL_DST, &_recurrent_to_forget_eff_bias }
+        };
+        NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, _recurrent_to_forget_reduction->window(), packRF);
+
+        ITensorPack packIC =
+        {
+            { TensorType::ACL_SRC, _input_to_cell_weights },
+            { TensorType::ACL_DST, &_input_to_cell_eff_bias }
+        };
+        NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), packIC);
+
+        ITensorPack packRC =
+        {
+            { TensorType::ACL_SRC, _recurrent_to_cell_weights },
+            { TensorType::ACL_DST, &_recurrent_to_cell_eff_bias }
+        };
+        NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, _recurrent_to_cell_reduction->window(), packRC);
+
+        ITensorPack packIO =
+        {
+            { TensorType::ACL_SRC, _input_to_output_weights },
+            { TensorType::ACL_DST, &_input_to_output_eff_bias }
+        };
+        NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, _input_to_output_reduction->window(), packIO);
+
+        ITensorPack packRO =
+        {
+            { TensorType::ACL_SRC, _recurrent_to_output_weights },
+            { TensorType::ACL_DST, &_recurrent_to_output_eff_bias }
+        };
+        NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, _recurrent_to_output_reduction->window(), packRO);
 
         if(_has_projection)
         {
             _projection_eff_bias.allocator()->allocate();
-            NEScheduler::get().schedule(_projection_reduction.get(), Window::DimY);
+            ITensorPack pack =
+            {
+                { TensorType::ACL_SRC, _projection_weights },
+                { TensorType::ACL_DST, &_projection_eff_bias }
+            };
+            NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), pack);
             if(_projection_bias != nullptr)
             {
                 _projection_bias_add.run();