COMPMID-433 - Port NEGEMM to support 16 bit fixed point

Change-Id: I82de74d7027bbc8a00a4d6671e968785280d5f6c
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/79498
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
index 57d2807..91fbe6f 100644
--- a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp
@@ -114,6 +114,31 @@
     },
     in, out);
 }
+
+void matrix_addition_qs16(const ITensor *input, ITensor *output, const Window &window, float beta)
+{
+    const int        fixed_point_position = input->info()->fixed_point_position();
+    const qint16x8_t beta_qs16            = vdupq_n_qs16(scvt_qs16_f32(beta, fixed_point_position));
+
+    Iterator in(input, window);
+    Iterator out(output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const auto in_ptr  = reinterpret_cast<const qint16_t *>(in.ptr());
+        const auto out_ptr = reinterpret_cast<qint16_t *>(out.ptr());
+
+        qint16x8x2_t       alpha_ab = vld2q_s16(out_ptr);
+        const qint16x8x2_t c        = vld2q_s16(in_ptr);
+
+        // Multiply matrix C by its weight and accumulate
+        alpha_ab.val[0] = vqmlaq_qs16(alpha_ab.val[0], c.val[0], beta_qs16, fixed_point_position);
+        alpha_ab.val[1] = vqmlaq_qs16(alpha_ab.val[1], c.val[1], beta_qs16, fixed_point_position);
+
+        vst2q_s16(out_ptr, alpha_ab);
+    },
+    in, out);
+}
 } // namespace
 
 NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel()
@@ -123,8 +148,8 @@
 
 void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
     ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0));
@@ -138,6 +163,9 @@
         case DataType::QS8:
             _func = &matrix_addition_qs8;
             break;
+        case DataType::QS16:
+            _func = &matrix_addition_qs16;
+            break;
         case DataType::F16:
 #ifdef ARM_COMPUTE_ENABLE_FP16
             _func = &matrix_addition_f16;