COMPMID-417: Port NEDirectConvolution 1x1 to QS16. Change-Id: Icae6a5091e836d0aca24375f43cca9e6d3a2090f Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81662 Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com> Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>

commit: f87cc7f6fef95f9b022725304118796a6a764a7c [log] [tgz]
author: Pablo Tello <pablo.tello@arm.com> Wed Jul 26 10:28:40 2017 +0100
committer: Anthony Barbier <anthony.barbier@arm.com> Mon Sep 17 14:16:42 2018 +0100
tree: 06a643c47c93ba1a64dcca1ae787214a6fbfff54
parent: 6c928343b0fa2bf60ffdfe21aea28b598d742ed4 [diff] [blame]
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index c879d3e..dd1066d 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl

@@ -624,6 +624,20 @@
     return vqrshlq_s16(res, fixed_point_position_s16);
 }
 
+// Widening QS16 multiply: per-lane a * b in 32 bits, rounded and shifted right by fixed_point_position (expected >= 1).
+inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Seed the accumulator with 1 << (fixed_point_position - 1) so the final right shift rounds instead of truncating
+    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
+    // Multiply-accumulate long (not vmull_s16, which would overwrite and discard the rounding constant)
+    tmp = vmlal_s16(tmp, a, b);
+
+    // Shift right by fixed_point_position (vqshlq_s32 with a negative per-lane shift count)
+    return vqshlq_s32(tmp, fixed_point_position_s32);
+}
+
 inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
commit	f87cc7f6fef95f9b022725304118796a6a764a7c	[log] [tgz]
author	Pablo Tello <pablo.tello@arm.com>	Wed Jul 26 10:28:40 2017 +0100
committer	Anthony Barbier <anthony.barbier@arm.com>	Mon Sep 17 14:16:42 2018 +0100
tree	06a643c47c93ba1a64dcca1ae787214a6fbfff54
parent	6c928343b0fa2bf60ffdfe21aea28b598d742ed4 [diff] [blame]