COMPMID-400 Add support for 16 bit fixed point arithmetic.

Change-Id: Iebfaef1b219d80d6362b7fd4b1357612b31e43cb
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/77749
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/arm_compute/core/FixedPoint.h b/arm_compute/core/FixedPoint.h
index 925b494..da304c6 100644
--- a/arm_compute/core/FixedPoint.h
+++ b/arm_compute/core/FixedPoint.h
@@ -33,12 +33,21 @@
 /** 8 bit fixed point scalar saturating shift left
  *
  * @param[in] a     First 8 bit fixed point input
- * @param[in] shift Shift amount
+ * @param[in] shift Shift amount (positive only values)
  *
  * @return The result of the 8 bit fixed point shift. The result is saturated in case of overflow
  */
 qint8_t sqshl_qs8(qint8_t a, int shift);
 
+/** 16 bit fixed point scalar saturating shift left
+ *
+ * @param[in] a     First 16 bit fixed point input
+ * @param[in] shift Shift amount (positive only values)
+ *
+ * @return The result of the 16 bit fixed point shift. The result is saturated in case of overflow
+ */
+qint16_t sqshl_qs16(qint16_t a, int shift);
+
 /** 8 bit fixed point scalar absolute value
  *
  * @param[in] a 8 bit fixed point input
@@ -47,6 +56,14 @@
  */
 qint8_t sabs_qs8(qint8_t a);
 
+/** 16 bit fixed point scalar absolute value
+ *
+ * @param[in] a 16 bit fixed point input
+ *
+ * @return The result of the 16 bit fixed point absolute value
+ */
+qint16_t sabs_qs16(qint16_t a);
+
 /** 8 bit fixed point scalar add
  *
  * @param[in] a First 8 bit fixed point input
@@ -56,6 +73,15 @@
  */
 qint8_t sadd_qs8(qint8_t a, qint8_t b);
 
+/** 16 bit fixed point scalar add
+ *
+ * @param[in] a First 16 bit fixed point input
+ * @param[in] b Second 16 bit fixed point input
+ *
+ * @return The result of the 16 bit fixed point addition
+ */
+qint16_t sadd_qs16(qint16_t a, qint16_t b);
+
 /** 8 bit fixed point scalar saturating add
  *
  * @param[in] a First 8 bit fixed point input
@@ -83,6 +109,15 @@
  */
 qint8_t ssub_qs8(qint8_t a, qint8_t b);
 
+/** 16 bit fixed point scalar subtraction
+ *
+ * @param[in] a First 16 bit fixed point input
+ * @param[in] b Second 16 bit fixed point input
+ *
+ * @return The result of the 16 bit fixed point subtraction
+ */
+qint16_t ssub_qs16(qint16_t a, qint16_t b);
+
 /** 8 bit fixed point scalar saturating subtraction
  *
  * @param[in] a First 8 bit fixed point input
@@ -92,6 +127,15 @@
  */
 qint8_t sqsub_qs8(qint8_t a, qint8_t b);
 
+/** 16 bit fixed point scalar saturating subtraction
+ *
+ * @param[in] a First 16 bit fixed point input
+ * @param[in] b Second 16 bit fixed point input
+ *
+ * @return The result of the 16 bit fixed point subtraction. The result is saturated in case of overflow
+ */
+qint16_t sqsub_qs16(qint16_t a, qint16_t b);
+
 /** 8 bit fixed point scalar multiply
  *
  * @param[in] a                    First 8 bit fixed point input
@@ -102,6 +146,16 @@
  */
 qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position);
 
+/** 16 bit fixed point scalar multiply
+ *
+ * @param[in] a                    First 16 bit fixed point input
+ * @param[in] b                    Second 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point multiplication.
+ */
+qint16_t smul_qs16(qint16_t a, qint16_t b, int fixed_point_position);
+
 /** 8 bit fixed point scalar saturating multiply
  *
  * @param[in] a                    First 8 bit fixed point input
@@ -112,6 +166,16 @@
  */
 qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position);
 
+/** 16 bit fixed point scalar saturating multiply
+ *
+ * @param[in] a                    First 16 bit fixed point input
+ * @param[in] b                    Second 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point multiplication. The result is saturated in case of overflow
+ */
+qint16_t sqmul_qs16(qint16_t a, qint16_t b, int fixed_point_position);
+
 /** 8 bit fixed point scalar multiply long
  *
  * @param[in] a                    First 8 bit fixed point input
@@ -122,6 +186,16 @@
  */
 qint16_t sqmull_qs8(qint8_t a, qint8_t b, int fixed_point_position);
 
+/** 16 bit fixed point scalar multiply long
+ *
+ * @param[in] a                    First 16 bit fixed point input
+ * @param[in] b                    Second 16 bit fixed point input
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point multiplication long. The result is saturated in case of overflow
+ */
+qint32_t sqmull_qs16(qint16_t a, qint16_t b, int fixed_point_position);
+
 /** 16 bit fixed point scalar saturating multiply
 *
 * @param[in] a                    First 16 bit fixed point input
@@ -141,6 +215,15 @@
 */
 qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position);
 
+/** 16 bit fixed point scalar inverse square root
+*
+* @param[in] a                    16 bit fixed point input
+* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+*
+* @return The result of the 16 bit fixed point inverse square root.
+*/
+qint16_t sinvsqrt_qs16(qint16_t a, int fixed_point_position);
+
 /** 8 bit fixed point scalar division
 *
 * @param[in] a                    First 8 bit fixed point input
@@ -151,6 +234,16 @@
 */
 qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position);
 
+/** 16 bit fixed point scalar division
+*
+* @param[in] a                    First 16 bit fixed point input
+* @param[in] b                    Second 16 bit fixed point input
+* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+*
+* @return The result of the 16 bit fixed point division.
+*/
+qint16_t sdiv_qs16(qint16_t a, qint16_t b, int fixed_point_position);
+
 /** 8 bit fixed point scalar exponential
 *
 * @param[in] a                    8 bit fixed point input
@@ -160,6 +253,15 @@
 */
 qint8_t sexp_qs8(qint8_t a, int fixed_point_position);
 
+/** 16 bit fixed point scalar exponential
+*
+* @param[in] a                    16 bit fixed point input
+* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+*
+* @return The result of the 16 bit fixed point exponential.
+*/
+qint16_t sexp_qs16(qint16_t a, int fixed_point_position);
+
 /** 8 bit fixed point scalar logarithm
 *
 * @param[in] a                    8 bit fixed point input
@@ -169,6 +271,15 @@
 */
 qint8_t slog_qs8(qint8_t a, int fixed_point_position);
 
+/** 16 bit fixed point scalar logarithm
+*
+* @param[in] a                    16 bit fixed point input
+* @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+*
+* @return The result of the 16 bit fixed point logarithm.
+*/
+qint16_t slog_qs16(qint16_t a, int fixed_point_position);
+
 /** Convert an 8 bit fixed point to float
  *
  * @param[in] a                    Input to convert
@@ -203,7 +314,7 @@
  *
  * @return The result of the conversion float -> 16 bit fixed point
  */
-qint8_t scvt_qs16_f32(float a, int fixed_point_position);
+qint16_t scvt_qs16_f32(float a, int fixed_point_position);
 
 /** Scalar saturating move and narrow.
  *
diff --git a/arm_compute/core/FixedPoint.inl b/arm_compute/core/FixedPoint.inl
index 4263a6f..fab91d6 100644
--- a/arm_compute/core/FixedPoint.inl
+++ b/arm_compute/core/FixedPoint.inl
@@ -46,13 +46,27 @@
 inline qint8_t sqshl_qs8(qint8_t a, int shift)
 {
     qint16_t tmp = static_cast<qint16_t>(a) << shift;
+
     // Saturate the result in case of overflow and cast to qint8_t
     return saturate_convert<qint16_t, qint8_t>(tmp);
 }
 
+inline qint16_t sqshl_qs16(qint16_t a, int shift)
+{
+    qint32_t tmp = static_cast<qint32_t>(a) << shift;
+
+    // Saturate the result in case of overflow and cast to qint16_t
+    return saturate_convert<qint32_t, qint16_t>(tmp);
+}
+
 inline qint8_t sabs_qs8(qint8_t a)
 {
-    return a & 0x7F;
+    return (a < 0) ? (a == std::numeric_limits<int8_t>::min()) ? std::numeric_limits<int8_t>::max() : -a : a;
+}
+
+inline qint16_t sabs_qs16(qint16_t a)
+{
+    return (a < 0) ? (a == std::numeric_limits<int16_t>::min()) ? std::numeric_limits<int16_t>::max() : -a : a;
 }
 
 inline qint8_t sadd_qs8(qint8_t a, qint8_t b)
@@ -60,6 +74,11 @@
     return a + b;
 }
 
+inline qint16_t sadd_qs16(qint16_t a, qint16_t b)
+{
+    return a + b;
+}
+
 inline qint8_t sqadd_qs8(qint8_t a, qint8_t b)
 {
     // We need to store the temporary result in qint16_t otherwise we cannot evaluate the overflow
@@ -83,6 +102,11 @@
     return a - b;
 }
 
+inline qint16_t ssub_qs16(qint16_t a, qint16_t b)
+{
+    return a - b;
+}
+
 inline qint8_t sqsub_qs8(qint8_t a, qint8_t b)
 {
     // We need to store the temporary result in uint16_t otherwise we cannot evaluate the overflow
@@ -92,6 +116,15 @@
     return saturate_convert<qint16_t, qint8_t>(tmp);
 }
 
+inline qint16_t sqsub_qs16(qint16_t a, qint16_t b)
+{
+    // We need to store the temporary result in qint32_t otherwise we cannot evaluate the overflow
+    qint32_t tmp = static_cast<qint32_t>(a) - static_cast<qint32_t>(b);
+
+    // Saturate the result in case of overflow and cast to qint16_t
+    return saturate_convert<qint32_t, qint16_t>(tmp);
+}
+
 inline qint8_t smul_qs8(qint8_t a, qint8_t b, int fixed_point_position)
 {
     const qint16_t round_up_const = (1 << (fixed_point_position - 1));
@@ -104,6 +137,18 @@
     return static_cast<qint8_t>(tmp >> fixed_point_position);
 }
 
+inline qint16_t smul_qs16(qint16_t a, qint16_t b, int fixed_point_position)
+{
+    const qint32_t round_up_const = (1 << (fixed_point_position - 1));
+
+    qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b);
+
+    // Rounding up
+    tmp += round_up_const;
+
+    return static_cast<qint16_t>(tmp >> fixed_point_position);
+}
+
 inline qint8_t sqmul_qs8(qint8_t a, qint8_t b, int fixed_point_position)
 {
     const qint16_t round_up_const = (1 << (fixed_point_position - 1));
@@ -140,16 +185,28 @@
     return tmp >> fixed_point_position;
 }
 
+inline qint32_t sqmull_qs16(qint16_t a, qint16_t b, int fixed_point_position)
+{
+    const qint32_t round_up_const = (1 << (fixed_point_position - 1));
+
+    qint32_t tmp = static_cast<qint32_t>(a) * static_cast<qint32_t>(b);
+
+    // Rounding up
+    tmp += round_up_const;
+
+    return tmp >> fixed_point_position;
+}
+
 inline qint8_t sinvsqrt_qs8(qint8_t a, int fixed_point_position)
 {
-    qint8_t shift = 8 - (fixed_point_position + (__builtin_clz(a) - 24));
+    const qint8_t shift = 8 - (fixed_point_position + (__builtin_clz(a) - 24));
 
-    qint8_t const_three = (3 << fixed_point_position);
-    qint8_t temp        = shift < 0 ? (a << -shift) : (a >> shift);
-    qint8_t x2          = temp;
+    const qint8_t const_three = (3 << fixed_point_position);
+    qint8_t       temp        = shift < 0 ? (a << -shift) : (a >> shift);
+    qint8_t       x2          = temp;
 
     // We need three iterations to find the result
-    for(int i = 0; i < 3; i++)
+    for(int i = 0; i < 3; ++i)
     {
         qint8_t three_minus_dx = ssub_qs8(const_three, smul_qs8(temp, smul_qs8(x2, x2, fixed_point_position), fixed_point_position));
         x2                     = (smul_qs8(x2, three_minus_dx, fixed_point_position) >> 1);
@@ -160,35 +217,84 @@
     return temp;
 }
 
+inline qint16_t sinvsqrt_qs16(qint16_t a, int fixed_point_position)
+{
+    const qint16_t shift = 16 - (fixed_point_position + (__builtin_clz(a) - 16));
+
+    const qint16_t const_three = (3 << fixed_point_position);
+    qint16_t       temp        = shift < 0 ? (a << -shift) : (a >> shift);
+    qint16_t       x2          = temp;
+
+    // We need three iterations to find the result
+    for(int i = 0; i < 3; ++i)
+    {
+        qint16_t three_minus_dx = ssub_qs16(const_three, smul_qs16(temp, smul_qs16(x2, x2, fixed_point_position), fixed_point_position));
+        x2                      = smul_qs16(x2, three_minus_dx, fixed_point_position) >> 1;
+    }
+
+    temp = shift < 0 ? (x2 << ((-shift) >> 1)) : (x2 >> (shift >> 1));
+
+    return temp;
+}
+
 inline qint8_t sdiv_qs8(qint8_t a, qint8_t b, int fixed_point_position)
 {
-    qint16_t temp = a << fixed_point_position;
-    return (qint8_t)(temp / b);
+    const qint16_t temp = a << fixed_point_position;
+    return static_cast<qint8_t>(temp / b);
+}
+
+inline qint16_t sdiv_qs16(qint16_t a, qint16_t b, int fixed_point_position)
+{
+    const qint32_t temp = a << fixed_point_position;
+    return static_cast<qint16_t>(temp / b);
 }
 
 inline qint8_t sqexp_qs8(qint8_t a, int fixed_point_position)
 {
     // Constants
-    qint8_t const_one = (1 << fixed_point_position);
-    qint8_t ln2       = ((0x58 >> (6 - fixed_point_position)) + 1) >> 1;
-    qint8_t inv_ln2   = (((0x38 >> (6 - fixed_point_position)) + 1) >> 1) | const_one;
-    qint8_t A         = ((0x7F >> (6 - fixed_point_position)) + 1) >> 1;
-    qint8_t B         = ((0x3F >> (6 - fixed_point_position)) + 1) >> 1;
-    qint8_t C         = ((0x16 >> (6 - fixed_point_position)) + 1) >> 1;
-    qint8_t D         = ((0x05 >> (6 - fixed_point_position)) + 1) >> 1;
+    const qint8_t const_one = (1 << fixed_point_position);
+    const qint8_t ln2       = ((0x58 >> (6 - fixed_point_position)) + 1) >> 1;
+    const qint8_t inv_ln2   = (((0x38 >> (6 - fixed_point_position)) + 1) >> 1) | const_one;
+    const qint8_t A         = ((0x7F >> (6 - fixed_point_position)) + 1) >> 1;
+    const qint8_t B         = ((0x3F >> (6 - fixed_point_position)) + 1) >> 1;
+    const qint8_t C         = ((0x16 >> (6 - fixed_point_position)) + 1) >> 1;
+    const qint8_t D         = ((0x05 >> (6 - fixed_point_position)) + 1) >> 1;
 
     // Polynomial expansion
-    int     dec_a = (sqmul_qs8(a, inv_ln2, fixed_point_position) >> fixed_point_position);
-    qint8_t alpha = sabs_qs8(sqsub_qs8(a, sqmul_qs8(ln2, sqshl_qs8(dec_a, fixed_point_position), fixed_point_position)));
-    qint8_t sum   = sqadd_qs8(sqmul_qs8(alpha, D, fixed_point_position), C);
-    sum           = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), B);
-    sum           = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), A);
-    sum           = sqmul_qs8(alpha, sum, fixed_point_position);
-    sum           = sqadd_qs8(sum, const_one);
+    const int     dec_a = (sqmul_qs8(a, inv_ln2, fixed_point_position) >> fixed_point_position);
+    const qint8_t alpha = sabs_qs8(sqsub_qs8(a, sqmul_qs8(ln2, sqshl_qs8(dec_a, fixed_point_position), fixed_point_position)));
+    qint8_t       sum   = sqadd_qs8(sqmul_qs8(alpha, D, fixed_point_position), C);
+    sum                 = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), B);
+    sum                 = sqadd_qs8(sqmul_qs8(alpha, sum, fixed_point_position), A);
+    sum                 = sqmul_qs8(alpha, sum, fixed_point_position);
+    sum                 = sqadd_qs8(sum, const_one);
 
     return (dec_a < 0) ? (sum >> -dec_a) : sqshl_qs8(sum, dec_a);
 }
 
+inline qint16_t sqexp_qs16(qint16_t a, int fixed_point_position)
+{
+    // Constants
+    const qint16_t const_one = (1 << fixed_point_position);
+    const qint16_t ln2       = ((0x58B9 >> (14 - fixed_point_position)) + 1) >> 1;
+    const qint16_t inv_ln2   = (((0x38AA >> (14 - fixed_point_position)) + 1) >> 1) | const_one;
+    const qint16_t A         = ((0x7FBA >> (14 - fixed_point_position)) + 1) >> 1;
+    const qint16_t B         = ((0x3FE9 >> (14 - fixed_point_position)) + 1) >> 1;
+    const qint16_t C         = ((0x1693 >> (14 - fixed_point_position)) + 1) >> 1;
+    const qint16_t D         = ((0x0592 >> (14 - fixed_point_position)) + 1) >> 1;
+
+    // Polynomial expansion
+    const int      dec_a = (sqmul_qs16(a, inv_ln2, fixed_point_position) >> fixed_point_position);
+    const qint16_t alpha = sabs_qs16(sqsub_qs16(a, sqmul_qs16(ln2, sqshl_qs16(dec_a, fixed_point_position), fixed_point_position)));
+    qint16_t       sum   = sqadd_qs16(sqmul_qs16(alpha, D, fixed_point_position), C);
+    sum                  = sqadd_qs16(sqmul_qs16(alpha, sum, fixed_point_position), B);
+    sum                  = sqadd_qs16(sqmul_qs16(alpha, sum, fixed_point_position), A);
+    sum                  = sqmul_qs16(alpha, sum, fixed_point_position);
+    sum                  = sqadd_qs16(sum, const_one);
+
+    return (dec_a < 0) ? (sum >> -dec_a) : sqshl_qs16(sum, dec_a);
+}
+
 inline qint8_t slog_qs8(qint8_t a, int fixed_point_position)
 {
     // Constants
@@ -214,14 +320,47 @@
     a = ssub_qs8(a, const_one);
 
     // Polynomial expansion
-    auto sum = sqadd_qs8(sqmul_qs8(a, D, fixed_point_position), C);
-    sum      = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), B);
-    sum      = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), A);
-    sum      = sqmul_qs8(a, sum, fixed_point_position);
+    qint8_t sum = sqadd_qs8(sqmul_qs8(a, D, fixed_point_position), C);
+    sum         = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), B);
+    sum         = sqadd_qs8(sqmul_qs8(a, sum, fixed_point_position), A);
+    sum         = sqmul_qs8(a, sum, fixed_point_position);
 
     return smul_qs8(sadd_qs8(sum, shift_val << fixed_point_position), ln2, fixed_point_position);
 }
 
+inline qint16_t slog_qs16(qint16_t a, int fixed_point_position)
+{
+    // Constants
+    qint16_t const_one = (1 << fixed_point_position);
+    qint16_t ln2       = (0x58B9 >> (7 - fixed_point_position));
+    qint16_t A         = (0x5C0F >> (7 - fixed_point_position - 1));
+    qint16_t B         = -(0x56AE >> (7 - fixed_point_position));
+    qint16_t C         = (0x2933 >> (7 - fixed_point_position));
+    qint16_t D         = -(0x0AA7 >> (7 - fixed_point_position));
+
+    if((const_one == a) || (a < 0))
+    {
+        return 0;
+    }
+    else if(a < const_one)
+    {
+        return -slog_qs16(sdiv_qs16(const_one, a, fixed_point_position), fixed_point_position);
+    }
+
+    // Remove even powers of 2
+    qint16_t shift_val = 31 - __builtin_clz(a >> fixed_point_position);
+    a >>= shift_val;
+    a = ssub_qs16(a, const_one);
+
+    // Polynomial expansion
+    qint16_t sum = sqadd_qs16(sqmul_qs16(a, D, fixed_point_position), C);
+    sum          = sqadd_qs16(sqmul_qs16(a, sum, fixed_point_position), B);
+    sum          = sqadd_qs16(sqmul_qs16(a, sum, fixed_point_position), A);
+    sum          = sqmul_qs16(a, sum, fixed_point_position);
+
+    return smul_qs16(sadd_qs16(sum, shift_val << fixed_point_position), ln2, fixed_point_position);
+}
+
 inline float scvt_f32_qs8(qint8_t a, int fixed_point_position)
 {
     return static_cast<float>(a) / (1 << fixed_point_position);
@@ -230,7 +369,7 @@
 inline qint8_t scvt_qs8_f32(float a, int fixed_point_position)
 {
     // round_nearest_integer(a * 2^(fixed_point_position))
-    return static_cast<qint8_t>(static_cast<float>(a) * (1 << fixed_point_position) + 0.5f);
+    return static_cast<qint8_t>(a * (1 << fixed_point_position) + 0.5f);
 }
 
 inline float scvt_f32_qs16(qint16_t a, int fixed_point_position)
@@ -238,10 +377,10 @@
     return static_cast<float>(a) / (1 << fixed_point_position);
 }
 
-inline qint8_t scvt_qs16_f32(float a, int fixed_point_position)
+inline qint16_t scvt_qs16_f32(float a, int fixed_point_position)
 {
     // round_nearest_integer(a * 2^(fixed_point_position))
-    return static_cast<qint16_t>(static_cast<float>(a) * (1 << fixed_point_position) + 0.5f);
+    return static_cast<qint16_t>(a * (1 << fixed_point_position) + 0.5f);
 }
 
 inline qint8_t sqmovn_qs16(qint16_t a)
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 201c5b5..660464e 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -46,6 +46,7 @@
 using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */
 using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */
 using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
+using qint32x4_t   = int32x4_t;   /**< 32 bit fixed point vector with 4 elements */
 
 /** Get the lower half of a 16 elements vector
  *
@@ -55,6 +56,14 @@
  */
 qint8x8_t vget_low_qs8(qint8x16_t a);
 
+/** Get the lower half of a 16 elements vector
+ *
+ * @param[in] a vector of 8 elements
+ *
+ * @return 16 bit fixed point vector (4 elements)
+ */
+qint16x4_t vget_low_qs16(qint16x8_t a);
+
 /** Get the higher half of a 16 elements vector
  *
  * @param[in] a vector of 16 elements
@@ -63,6 +72,14 @@
  */
 qint8x8_t vget_high_qs8(qint8x16_t a);
 
+/** Get the higher half of a 16 elements vector
+ *
+ * @param[in] a vector of 8 elements
+ *
+ * @return 16 bit fixed point vector (4 elements)
+ */
+qint16x4_t vget_high_qs16(qint16x8_t a);
+
 /** Load a single 8 bit fixed point vector from memory (8 elements)
  *
  * @param[in] addr Memory address of the 8 bit fixed point vector to load
@@ -71,14 +88,6 @@
  */
 qint8x8_t vld1_qs8(const qint8_t *addr);
 
-/** Load a single 8 bit fixed point vector from memory (16 elements)
- *
- * @param[in] addr Memory address of the 8 bit fixed point vector to load
- *
- * @return 8 bit fixed point vector (16 elements)
- */
-qint8x16_t vld1q_qs8(const qint8_t *addr);
-
 /** Load a single 16 bit fixed point vector from memory (4 elements)
  *
  * @param[in] addr Memory address of the 16 bit fixed point vector to load
@@ -87,6 +96,14 @@
  */
 qint16x4_t vld1_qs16(const qint16_t *addr);
 
+/** Load a single 8 bit fixed point vector from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_qs8(const qint8_t *addr);
+
 /** Load a single 16 bit fixed point vector from memory (8 elements)
  *
  * @param[in] addr Memory address of the 16 bit fixed point vector to load
@@ -103,6 +120,14 @@
  */
 qint8x8_t vld1_dup_qs8(const qint8_t *addr);
 
+/** Load all lanes of 16 bit fixed point vector with same value from memory (4 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point scalar value to load
+ *
+ * @return 16 bit fixed point vector (4 elements)
+ */
+qint16x4_t vld1_dup_qs16(const qint16_t *addr);
+
 /** Load all lanes of 8 bit fixed point vector with same value from memory (16 elements)
  *
  * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
@@ -111,6 +136,14 @@
  */
 qint8x16_t vld1q_dup_qs8(const qint8_t *addr);
 
+/** Load all lanes of 16 bit fixed point vector with same value from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point scalar value to load
+ *
+ * @return 16 bit fixed point vector (8 elements)
+ */
+qint16x8_t vld1q_dup_qs16(const qint16_t *addr);
+
 /** Store a single 8 bit fixed point vector to memory (8 elements)
  *
  * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
@@ -119,14 +152,6 @@
  */
 void vst1_qs8(qint8_t *addr, qint8x8_t b);
 
-/** Store a single 8 bit fixed point vector to memory (16 elements)
- *
- * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
- * @param[in] b    8 bit fixed point vector to store
- *
- */
-void vst1q_qs8(qint8_t *addr, qint8x16_t b);
-
 /** Store a single 16 bit fixed point vector to memory (4 elements)
  *
  * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
@@ -137,10 +162,18 @@
 
 /** Store a single 8 bit fixed point vector to memory (16 elements)
  *
- * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
- * @param[in] b    16 bit fixed point vector to store
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b    8 bit fixed point vector to store
  *
  */
+void vst1q_qs8(qint8_t *addr, qint8x16_t b);
+
+/** Store a single 16 bit fixed point vector to memory (8 elements)
+*
+* @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+* @param[in] b    16 bit fixed point vector to store
+*
+*/
 void vst1q_qs16(qint16_t *addr, qint16x8_t b);
 
 /** 16 bit fixed point vector saturating narrow (8 elements)
@@ -151,6 +184,14 @@
  */
 qint8x8_t vqmovn_q16(qint16x8_t a);
 
+/** 32 bit fixed point vector saturating narrow (4 elements)
+ *
+ * @param[in] a 32 bit fixed point vector to convert
+ *
+ * @return 16 bit fixed point vector
+ */
+qint16x4_t vqmovn_q32(qint32x4_t a);
+
 /** 8 bit fixed point vector duplicate (8 elements)
  *
  * @param[in] a 8 bit fixed point to duplicate
@@ -159,6 +200,14 @@
  */
 qint8x8_t vdup_n_qs8(qint8_t a);
 
+/** 16 bit fixed point vector duplicate (4 elements)
+ *
+ * @param[in] a 16 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint16x4_t vdup_n_qs16(qint16_t a);
+
 /** 8 bit fixed point vector duplicate (16 elements)
  *
  * @param[in] a 8 bit fixed point to duplicate
@@ -192,6 +241,14 @@
  */
 qint8x8_t vabs_qs8(qint8x8_t a);
 
+/** Absolute value of 16 bit fixed point vector (4 elements)
+ *
+ * @param[in] a 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector absolute value
+ */
+qint16x4_t vabs_qs16(qint16x4_t a);
+
 /** Absolute value of 8 bit fixed point vector (16 elements)
  *
  * @param[in] a 8 bit fixed point input vector
@@ -200,6 +257,14 @@
  */
 qint8x16_t vabsq_qs8(qint8x16_t a);
 
+/** Absolute value of 16 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector absolute value
+ */
+qint16x8_t vabsq_qs16(qint16x8_t a);
+
 /** Saturating absolute value of 8 bit fixed point vector (8 elements)
  *
  * @param[in] a 8 bit fixed point input vector
@@ -208,6 +273,14 @@
  */
 qint8x8_t vqabs_qs8(qint8x8_t a);
 
+/** Saturating absolute value of 16 bit fixed point vector (4 elements)
+ *
+ * @param[in] a 4 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector absolute value
+ */
+qint16x4_t vqabs_qs16(qint16x4_t a);
+
 /** Saturating absolute value of 8 bit fixed point vector (16 elements)
  *
  * @param[in] a 8 bit fixed point input vector
@@ -216,6 +289,14 @@
  */
 qint8x16_t vqabsq_qs8(qint8x16_t a);
 
+/** Saturating absolute value of 16 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector absolute value
+ */
+qint16x8_t vqabsq_qs16(qint16x8_t a);
+
 /** 8 bit fixed point vector max (8 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -225,6 +306,15 @@
  */
 qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b);
 
+/** 16 bit fixed point vector max (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector max operation
+ */
+qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b);
+
 /** 8 bit fixed point vector max (16 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -234,6 +324,15 @@
  */
 qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b);
 
+/** 16 bit fixed point vector max (8 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector max operation
+ */
+qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b);
+
 /** 8 bit fixed point vector pairwise max (8 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -243,6 +342,15 @@
  */
 qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b);
 
+/** 16 bit fixed point vector pairwise max (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector pairwise max operation
+ */
+qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b);
+
 /** 8 bit fixed point vector min (8 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -252,6 +360,15 @@
  */
 qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b);
 
+/** 16 bit fixed point vector min (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector max operation
+ */
+qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b);
+
 /** 8 bit fixed point vector min (16 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -261,6 +378,15 @@
  */
 qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b);
 
+/** 16 bit fixed point vector min (8 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector min operation
+ */
+qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b);
+
 /** 8 bit fixed point vector pairwise min (8 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -270,6 +396,15 @@
  */
 qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b);
 
+/** 16 bit fixed point vector pairwise min (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector pairwise min operation
+ */
+qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b);
+
 /** 8 bit fixed point vector add (8 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -279,6 +414,15 @@
  */
 qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b);
 
+/** 16 bit fixed point vector add (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition
+ */
+qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b);
+
 /** 8 bit fixed point vector add (16 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -288,6 +432,15 @@
  */
 qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b);
 
+/** 16 bit fixed point vector add (8 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition
+ */
+qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b);
+
 /** 8 bit fixed point vector saturating add (8 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -297,15 +450,6 @@
  */
 qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b);
 
-/** 8 bit fixed point vector saturating add (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
- */
-qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b);
-
 /** 16 bit fixed point vector saturating add (4 elements)
  *
  * @param[in] a First 16 bit fixed point input vector
@@ -315,6 +459,15 @@
  */
 qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b);
 
+/** 8 bit fixed point vector saturating add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b);
+
 /** 16 bit fixed point vector saturating add (8 elements)
  *
  * @param[in] a First 16 bit fixed point input vector
@@ -341,6 +494,15 @@
  */
 qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b);
 
+/** 16 bit fixed point vector subtraction (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector subtraction
+ */
+qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b);
+
 /** 8 bit fixed point vector subtraction (16 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -350,6 +512,15 @@
  */
 qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b);
 
+/** 16 bit fixed point vector subtraction (8 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector subtraction
+ */
+qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b);
+
 /** 8 bit fixed point vector saturating subtraction (8 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -359,6 +530,15 @@
  */
 qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b);
 
+/** 16 bit fixed point vector saturating subtraction (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector subtraction. The result is saturated in case of overflow
+ */
+qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b);
+
 /** 8 bit fixed point vector saturating subtraction (16 elements)
  *
  * @param[in] a First 8 bit fixed point input vector
@@ -368,6 +548,15 @@
  */
 qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b);
 
+/** 16 bit fixed point vector saturating subtraction (8 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector subtraction. The result is saturated in case of overflow
+ */
+qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b);
+
 /** 8 bit fixed point vector multiply (8 elements)
  *
  * @param[in] a                    First 8 bit fixed point input vector
@@ -378,6 +567,16 @@
  */
 qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
 
+/** 16 bit fixed point vector multiply (4 elements)
+ *
+ * @param[in] a                    First 16 bit fixed point input vector
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point vector multiplication.
+ */
+qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
+
 /** 8 bit fixed point vector multiply (16 elements)
  *
  * @param[in] a                    First 8 bit fixed point input vector
@@ -388,6 +587,16 @@
  */
 qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
 
+/** 16 bit fixed point vector multiply (8 elements)
+ *
+ * @param[in] a                    First 16 bit fixed point input vector
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point vector multiplication.
+ */
+qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
+
 /** 8 bit fixed point vector saturating multiply (8 elements)
  *
  * @param[in] a                    First 8 bit fixed point input vector
@@ -398,6 +607,16 @@
  */
 qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
 
+/** 16 bit fixed point vector saturating multiply (4 elements)
+ *
+ * @param[in] a                    First 16 bit fixed point input vector
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point vector multiplication. The result is saturated in case of overflow
+ */
+qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
+
 /** 8 bit fixed point vector saturating multiply (16 elements)
  *
  * @param[in] a                    First 8 bit fixed point input vector
@@ -408,6 +627,16 @@
  */
 qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
 
+/** 16 bit fixed point vector saturating multiply (8 elements)
+ *
+ * @param[in] a                    First 16 bit fixed point input vector
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point vector multiplication. The result is saturated in case of overflow
+ */
+qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
+
 /** 8 bit fixed point vector long multiply (8 elements)
  *
  * @param[in] a                    First 8 bit fixed point input vector
@@ -429,6 +658,17 @@
  */
 qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
 
+/** 16 bit fixed point vector multiply-accumulate (4 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] c                    Third 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point vector multiply-accumulate
+ */
+qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
+
 /** 8 bit fixed point vector multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
  *
  * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
@@ -440,6 +680,17 @@
  */
 qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
 
+/** 16 bit fixed point vector multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] c                    Third 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point vector multiply-accumulate
+ */
+qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position);
+
 /** 8 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
  *
  * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
@@ -451,6 +702,17 @@
  */
 qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
 
+/** 16 bit fixed point vector saturating multiply-accumulate (4 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] c                    Third 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
+ */
+qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
+
 /** 8 bit fixed point vector saturating multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
  *
  * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
@@ -462,6 +724,17 @@
  */
 qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
 
+/** 16 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] c                    Third 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point vector multiply-accumulate.The result is saturated in case of overflow
+ */
+qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position);
+
 /** 8 bit fixed point vector multiply-accumulate long (8 elements).
  *  This operation performs the product between @p b and @p c and add the result to the 16 bit fixed point vector @p a (a + b * c). 8 elements
  *
@@ -474,6 +747,18 @@
  */
 qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
 
+/** 16 bit fixed point vector multiply-accumulate long (4 elements).
+ *  This operation performs the product between @p b and @p c and add the result to the 32 bit fixed point vector @p a (a + b * c). 4 elements
+ *
+ * @param[in] a                    First 32 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] c                    Third 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point vector multiply-accumulate long
+ */
+qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
+
 /** 8 bit fixed point vector saturating multiply-accumulate long (8 elements). The saturation is performed on the 16 bit fixed point output vector.
  *  This operation performs the product between @p b and @p c and add the result to the 16 bit fixed point vector @p a (a + b * c). 8 elements
  *
@@ -486,6 +771,18 @@
  */
 qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
 
+/** 16 bit fixed point vector saturating multiply-accumulate long (4 elements). The saturation is performed on the 16 bit fixed point output vector.
+ *  This operation performs the product between @p b and @p c and add the result to the 32 bit fixed point vector @p a (a + b * c). 4 elements
+ *
+ * @param[in] a                    First 32 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] c                    Third 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit fixed point vector multiply-accumulate long
+ */
+qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
+
 /** Convert a float vector with 4x2 elements to 8 bit fixed point vector with 8 elements
  *
  * @param[in] a                    Float input vector
@@ -493,7 +790,16 @@
  *
  * @return The result of the conversion float -> 8 bit fixed point
  */
-qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position);
+qint8x8_t vcvt_qs8_f32(const float32x4x2_t a, int fixed_point_position);
+
+/** Convert a float vector with 4 elements to 16 bit fixed point vector with 4 elements
+ *
+ * @param[in] a                    Float input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 16 bit fixed point
+ */
+qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position);
 
 /** Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements
  *
@@ -504,6 +810,15 @@
  */
 qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
 
+/** Convert a float vector with 4x2 elements to 16 bit fixed point vector with 8 elements
+ *
+ * @param[in] a                    Float input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 16 bit fixed point
+ */
+qint16x8_t vcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position);
+
 /** Convert a 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements
  *
  * @param[in] a                    8 bit fixed point input vector
@@ -513,6 +828,15 @@
  */
 float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position);
 
+/** Convert a 16 bit fixed point vector with 4 elements to a float vector with 4 elements
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 16 bit fixed point -> float32x2
+ */
+float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position);
+
 /** Convert a 8 bit fixed point vector with 16 elements to a float vector with 4x4 elements
  *
  * @param[in] a                    8 bit fixed point input vector
@@ -522,6 +846,15 @@
  */
 float32x4x4_t vcvtq_qs8_f32(qint8x16_t a, int fixed_point_position);
 
+/** Convert a 16 bit fixed point vector with 8 elements to a float vector with 4x2 elements
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 16 bit fixed point -> float32x4x2
+ */
+float32x4x2_t vcvtq_qs16_f32(qint16x8_t a, int fixed_point_position);
+
 /** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -531,6 +864,15 @@
  */
 qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position);
 
+/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (4 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit reciprocal (1/a).
+ */
+qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position);
+
 /** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (16 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -540,6 +882,15 @@
  */
 qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position);
 
+/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit reciprocal (1/a).
+ */
+qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position);
+
 /** Division fixed point 8bit (8 elements)
  *
  * @param[in] a                    First 8bit fixed point input vector
@@ -550,6 +901,16 @@
  */
 qint8x8_t vdiv_qs8(qint8x8_t a, int8x8_t b, int fixed_point_position);
 
+/** Division fixed point 16 bit (4 elements)
+ *
+ * @param[in] a                    First 16 bit fixed point input vector
+ * @param[in] b                    Second  16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The quotient and remainder number in fixed point format.
+ */
+qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
+
 /** Division fixed point 8bit (16 elements)
  *
  * @param[in] a                    First 8bit fixed point input vector
@@ -558,7 +919,17 @@
  *
  * @return The quotient and remainder number in 8bit fixed point format.
  */
-qint8x16_t vdivq_qs8(qint8x16_t a, int8x16_t b, int fixed_point_position);
+qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** Division fixed point 16 bit (8 elements)
+ *
+ * @param[in] a                    First 16 bit fixed point input vector
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The quotient and remainder number in 16 bit fixed point format.
+ */
+qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
 
 /** Perform a 4th degree polynomial approximation. (8 elements)
  *
@@ -570,6 +941,16 @@
 template <bool islog>
 qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position);
 
+/** Perform a 4th degree polynomial approximation. (4 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit taylor approximation.
+ */
+template <bool islog>
+qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position);
+
 /** Perform a 4th degree polynomial approximation. (16 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -580,6 +961,16 @@
 template <bool islog>
 qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position);
 
+/** Perform a 4th degree polynomial approximation. (8 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit taylor approximation.
+ */
+template <bool islog>
+qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position);
+
 /** Calculate saturating exponential fixed point 8bit (8 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -589,6 +980,15 @@
  */
 qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position);
 
+/** Calculate saturating exponential fixed point 16 bit (4 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit saturating exponential
+ */
+qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position);
+
 /** Calculate saturating exponential fixed point 8bit (16 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -598,7 +998,16 @@
  */
 qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position);
 
-/** Calculate logarithm fixed point 16bit (8 elements)
+/** Calculate saturating exponential fixed point 16 bit (8 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit saturating exponential
+ */
+qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position);
+
+/** Calculate logarithm fixed point 8 bit (8 elements)
  *
  * @param[in] a                    8bit fixed point input vector
  * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
@@ -607,6 +1016,15 @@
  */
 qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position);
 
+/** Calculate logarithm fixed point 16 bit (4 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit logarithm.
+ */
+qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position);
+
 /** Calculate logarithm fixed point 16bit (16 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -616,6 +1034,15 @@
  */
 qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position);
 
+/** Calculate logarithm fixed point 16 bit (8 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit logarithm.
+ */
+qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position);
+
 /** Calculate inverse square root for fixed point 8bit using Newton-Raphosn method (8 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -625,6 +1052,15 @@
  */
 qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
 
+/** Calculate inverse square root for fixed point 16 bit using Newton-Raphosn method (4 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit inverse sqrt.
+ */
+qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position);
+
 /** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphosn method (8 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -634,6 +1070,15 @@
  */
 qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
 
+/** Calculate saturating inverse square root for fixed point 16 bit using Newton-Raphosn method (4 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit inverse sqrt.
+ */
+qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position);
+
 /** Calculate inverse square root for fixed point 8bit using Newton-Raphosn method (16 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -643,6 +1088,15 @@
  */
 qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
 
+/** Calculate inverse square root for fixed point 8bit using Newton-Raphosn method (8 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit inverse sqrt.
+ */
+qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position);
+
 /** Calculate saturating inverse square root for fixed point 8bit using Newton-Raphosn method (16 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -652,6 +1106,15 @@
  */
 qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
 
+/** Calculate saturating inverse square root for fixed point 16 bit using Newton-Raphosn method (8 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16 bit inverse sqrt.
+ */
+qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position);
+
 /** Calculate hyperbolic tangent for fixed point 8bit (8 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -661,6 +1124,15 @@
  */
 qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position);
 
+/** Calculate hyperbolic tangent for fixed point 16 bit (4 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint16x4_t vtanh_qs16(qint16x4_t a, int fixed_point_position);
+
 /** Calculate hyperbolic tangent for fixed point 8bit (16 elements)
  *
  * @param[in] a                    8bit fixed point input vector
@@ -690,6 +1162,15 @@
  * @return The lane-by-lane maximum -> float32x4x2
  */
 float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b);
+
+/** Calculate hyperbolic tangent for fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint16x8_t vtanhq_qs16(qint16x8_t a, int fixed_point_position);
 }
 #include "arm_compute/core/NEON/NEFixedPoint.inl"
 #endif /* __ARM_COMPUTE_NEFIXEDPOINT_H__ */
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index b57fd3e..4f7f44a 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -26,7 +26,7 @@
 {
 /**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
  *  Format is in Q0.7 for all elements */
-const std::array<qint8x8_t, 4> exp_tab_qs8 =
+static const std::array<qint8x8_t, 4> exp_tab_qs8 =
 {
     {
         vdup_n_s8(0x7F), // 0.9978546
@@ -36,9 +36,21 @@
     }
 };
 
+/**< Exponent polynomial coefficients for 16 bit fixed point (4 elements)
+ *  Format is in Q0.15 for all elements */
+static const std::array<qint16x4_t, 4> exp_tab_qs16 =
+{
+    {
+        vdup_n_s16(0x7FBA), // 0.9978546
+        vdup_n_s16(0x3FE9), // 0.4994721
+        vdup_n_s16(0x1693), // 0.1763723
+        vdup_n_s16(0x0592), // 0.0435108
+    }
+};
+
 /**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
  * Format is in Q0.7 for all elements */
-const std::array<qint8x16_t, 4> exp_tabq_qs8 =
+static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
 {
     {
         vdupq_n_s8(0x7F), // 0.9978546
@@ -48,9 +60,21 @@
     }
 };
 
+/**< Exponent polynomial coefficients for 16 bit fixed point (8 elements)
+ * Format is in Q0.15 for all elements */
+static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
+{
+    {
+        vdupq_n_s16(0x7FBA), // 0.9978546
+        vdupq_n_s16(0x3FE9), // 0.4994721
+        vdupq_n_s16(0x1693), // 0.1763723
+        vdupq_n_s16(0x0592), // 0.0435108
+    }
+};
+
 /**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
  * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
-const std::array<qint8x8_t, 4> log_tab_qs8 =
+static const std::array<qint8x8_t, 4> log_tab_qs8 =
 {
     {
         vdup_n_s8(0x5C),  // 1.4384189
@@ -60,9 +84,21 @@
     }
 };
 
+/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
+ * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
+static const std::array<qint16x4_t, 4> log_tab_qs16 =
+{
+    {
+        vdup_n_s16(0x5C0F),  // 1.4384189
+        vdup_n_s16(-0x56AE), // -0.6771900
+        vdup_n_s16(0x2933),  // 0.3218538
+        vdup_n_s16(-0x0AA7), // -0.0832229
+    }
+};
+
 /**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
  * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
-const std::array<qint8x16_t, 4> log_tabq_qs8 =
+static const std::array<qint8x16_t, 4> log_tabq_qs8 =
 {
     {
         vdupq_n_s8(0x5C),  // 1.4384189
@@ -72,31 +108,53 @@
     }
 };
 
+/**< Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
+ * Format is in Q0.15 for all elements except the first one which is in Q1.14 */
+static const std::array<qint16x8_t, 4> log_tabq_qs16 =
+{
+    {
+        vdupq_n_s16(0x5C0F),  // 1.4384189
+        vdupq_n_s16(-0x56AE), // -0.6771900
+        vdupq_n_s16(0x2933),  // 0.3218538
+        vdupq_n_s16(-0x0AA7), // -0.0832229
+    }
+};
+
 inline qint8x8_t vget_low_qs8(qint8x16_t a)
 {
     return vget_low_s8(a);
 }
 
+inline qint16x4_t vget_low_qs16(qint16x8_t a)
+{
+    return vget_low_s16(a);
+}
+
 inline qint8x8_t vget_high_qs8(qint8x16_t a)
 {
     return vget_high_s8(a);
 }
 
+inline qint16x4_t vget_high_qs16(qint16x8_t a)
+{
+    return vget_high_s16(a);
+}
+
 inline qint8x8_t vld1_qs8(const qint8_t *addr)
 {
     return vld1_s8(addr);
 }
 
-inline qint8x16_t vld1q_qs8(const qint8_t *addr)
-{
-    return vld1q_s8(addr);
-}
-
 inline qint16x4_t vld1_qs16(const qint16_t *addr)
 {
     return vld1_s16(addr);
 }
 
+inline qint8x16_t vld1q_qs8(const qint8_t *addr)
+{
+    return vld1q_s8(addr);
+}
+
 inline qint16x8_t vld1q_qs16(const qint16_t *addr)
 {
     return vld1q_s16(addr);
@@ -107,26 +165,36 @@
     return vld1_dup_s8(addr);
 }
 
+inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
+{
+    return vld1_dup_s16(addr);
+}
+
 inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
 {
     return vld1q_dup_s8(addr);
 }
 
+inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
+{
+    return vld1q_dup_s16(addr);
+}
+
 inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
 {
     vst1_s8(addr, b);
 }
 
-inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
-{
-    vst1q_s8(addr, b);
-}
-
 inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
 {
     vst1_s16(addr, b);
 }
 
+inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
+{
+    vst1q_s8(addr, b);
+}
+
 inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
 {
     vst1q_s16(addr, b);
@@ -137,11 +205,21 @@
     return vqmovn_s16(a);
 }
 
+inline qint16x4_t vqmovn_qs32(qint32x4_t a)
+{
+    return vqmovn_s32(a);
+}
+
 inline qint8x8_t vdup_n_qs8(qint8_t a)
 {
     return vdup_n_s8(a);
 }
 
+inline qint16x4_t vdup_n_qs16(qint16_t a)
+{
+    return vdup_n_s16(a);
+}
+
 inline qint8x16_t vdupq_n_qs8(qint8_t a)
 {
     return vdupq_n_s8(a);
@@ -166,31 +244,61 @@
     return vdupq_n_s16(a);
 }
 
+inline qint32x4_t vdupq_n_qs32(qint32_t a)
+{
+    return vdupq_n_s32(a);
+}
+
 inline qint8x8_t vabs_qs8(qint8x8_t a)
 {
     return vabs_s8(a);
 }
 
+inline qint16x4_t vabs_qs16(qint16x4_t a)
+{
+    return vabs_s16(a);
+}
+
 inline qint8x16_t vabsq_qs8(qint8x16_t a)
 {
     return vabsq_s8(a);
 }
 
+inline qint16x8_t vabsq_qs16(qint16x8_t a)
+{
+    return vabsq_s16(a);
+}
+
 inline qint8x8_t vqabs_qs8(qint8x8_t a)
 {
     return vqabs_s8(a);
 }
 
+inline qint16x4_t vqabs_qs16(qint16x4_t a)
+{
+    return vqabs_s16(a);
+}
+
 inline qint8x16_t vqabsq_qs8(qint8x16_t a)
 {
     return vqabsq_s8(a);
 }
 
+inline qint16x8_t vqabsq_qs16(qint16x8_t a)
+{
+    return vqabsq_s16(a);
+}
+
 inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
 {
     return vmax_s8(a, b);
 }
 
+inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
+{
+    return vmax_s16(a, b);
+}
+
 inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
 {
     return vmaxq_s8(a, b);
@@ -201,11 +309,26 @@
     return vpmax_s8(a, b);
 }
 
+inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
+{
+    return vpmax_s16(a, b);
+}
+
+inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
+{
+    return vmaxq_s16(a, b);
+}
+
 inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
 {
     return vmin_s8(a, b);
 }
 
+inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
+{
+    return vmin_s16(a, b);
+}
+
 inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
 {
     return vminq_s8(a, b);
@@ -216,31 +339,51 @@
     return vpmin_s8(a, b);
 }
 
+inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
+{
+    return vpmin_s16(a, b);
+}
+
+inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
+{
+    return vminq_s16(a, b);
+}
+
 inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
 {
     return vadd_s8(a, b);
 }
 
+inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
+{
+    return vadd_s16(a, b);
+}
+
 inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
 {
     return vaddq_s8(a, b);
 }
 
+inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
+{
+    return vaddq_s16(a, b);
+}
+
 inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
 {
     return vqadd_s8(a, b);
 }
 
-inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
-{
-    return vqaddq_s8(a, b);
-}
-
 inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
 {
     return vqadd_s16(a, b);
 }
 
+inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
+{
+    return vqaddq_s8(a, b);
+}
+
 inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
 {
     return vqaddq_s16(a, b);
@@ -256,21 +399,41 @@
     return vsub_s8(a, b);
 }
 
+inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
+{
+    return vsub_s16(a, b);
+}
+
 inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
 {
     return vsubq_s8(a, b);
 }
 
+inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
+{
+    return vsubq_s16(a, b);
+}
+
 inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
 {
     return vqsub_s8(a, b);
 }
 
+inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
+{
+    return vqsub_s16(a, b);
+}
+
 inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
 {
     return vqsubq_s8(a, b);
 }
 
+inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
+{
+    return vqsubq_s16(a, b);
+}
+
 inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
@@ -288,6 +451,23 @@
     return vmovn_s16(res);
 }
 
+inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary result with a constant used to round up the result
+    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    res = vmlal_s16(res, a, b);
+
+    // Shift right by fixed_point_position
+    res = vshlq_s32(res, fixed_point_position_s32);
+
+    // Convert back to qint16
+    return vmovn_s32(res);
+}
+
 inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
@@ -308,6 +488,26 @@
     return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
 }
 
+inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
+    qint32x4_t res1 = res0;
+
+    // Vector multiply-accumulate long
+    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
+    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
+
+    // Shift right by fixed_point_position
+    res0 = vshlq_s32(res0, fixed_point_position_s32);
+    res1 = vshlq_s32(res1, fixed_point_position_s32);
+
+    // Convert back to qint16
+    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
+}
+
 inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
@@ -325,6 +525,23 @@
     return vqmovn_s16(res);
 }
 
+inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary result with a constant used to round up the result
+    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    res = vmlal_s16(res, a, b);
+
+    // Shift right by fixed_point_position
+    res = vqshlq_s32(res, fixed_point_position_s32);
+
+    // Convert back to qint16 and saturate
+    return vqmovn_s32(res);
+}
+
 inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
@@ -345,6 +562,26 @@
     return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
 }
 
+inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
+    qint32x4_t res1 = res0;
+
+    // Vector multiply-accumulate long
+    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
+    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
+
+    // Shift right by fixed_point_position
+    res0 = vqshlq_s32(res0, fixed_point_position_s32);
+    res1 = vqshlq_s32(res1, fixed_point_position_s32);
+
+    // Convert back to qint16 and saturate
+    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
+}
+
 inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
@@ -371,6 +608,23 @@
     return vadd_s8(a, vmovn_s16(tmp));
 }
 
+inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s16(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vshlq_s32(tmp, fixed_point_position_s32);
+
+    // Convert back to qint16 and accumulate
+    return vadd_s16(a, vmovn_s32(tmp));
+}
+
 inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
@@ -391,6 +645,26 @@
     return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
 }
 
+inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
+    qint32x4_t tmp1 = tmp0;
+
+    // Vector multiply-accumulate long
+    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
+    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
+
+    // Shift right by fixed_point_position
+    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
+    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);
+
+    // Convert back to qint16 and accumulate
+    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
+}
+
 inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
@@ -408,6 +682,23 @@
     return vqadd_s8(a, vqmovn_s16(tmp));
 }
 
+inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s16(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vqshlq_s32(tmp, fixed_point_position_s32);
+
+    // Convert back to qint8 and accumulate
+    return vqadd_s16(a, vqmovn_s32(tmp));
+}
+
 inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
@@ -429,6 +720,27 @@
     return vqaddq_s8(a, res);
 }
 
+inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
+    qint32x4_t tmp1 = tmp0;
+
+    // Vector multiply-accumulate long
+    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
+    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
+
+    // Shift right by fixed_point_position
+    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
+    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);
+
+    // Convert back to qint16 and accumulate
+    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
+    return vqaddq_s16(a, res);
+}
+
 inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
@@ -446,6 +758,23 @@
     return vaddq_s16(a, tmp);
 }
 
+inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s16(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vshlq_s32(tmp, fixed_point_position_s32);
+
+    // Accumulate
+    return vaddq_s32(a, tmp);
+}
+
 inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
@@ -463,6 +792,23 @@
     return vqaddq_s16(a, tmp);
 }
 
+inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s16(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vqshlq_s32(tmp, fixed_point_position_s32);
+
+    // Accumulate
+    return vqaddq_s32(a, tmp);
+}
+
 inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
 {
     const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
@@ -491,6 +837,19 @@
     return vqmovn_s16(res_s16);
 }
 
+inline qint16x4_t vcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
+
+    float32x4_t res_f32 = vdupq_n_f32(0.5f);
+
+    res_f32 = vmlaq_f32(res_f32, a, pow2);
+
+    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
+
+    return vqmovn_s32(res_s32);
+}
+
 inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
 {
     const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
@@ -531,6 +890,32 @@
     return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
 }
 
+inline qint16x8_t vcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
+
+    float32x4x2_t res_f32 =
+    {
+        {
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f)
+        }
+    };
+
+    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
+    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
+
+    const int32x4x2_t res_s32 =
+    {
+        {
+            vcvtq_s32_f32(res_f32.val[0]),
+            vcvtq_s32_f32(res_f32.val[1])
+        }
+    };
+
+    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
+}
+
 inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
 {
     const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
@@ -540,8 +925,8 @@
     const int32x4x2_t res_s32 =
     {
         {
-            vmovl_s16(vget_low_s16(res_s16)),
-            vmovl_s16(vget_high_s16(res_s16))
+            vmovl_s16(vget_low_qs16(res_s16)),
+            vmovl_s16(vget_high_qs16(res_s16))
         }
     };
 
@@ -559,6 +944,14 @@
     return res_f32;
 }
 
+inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
+{
+    const float32x4_t pow2    = vdupq_n_f32(1.0f / (1 << fixed_point_position));
+    const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
+
+    return vmulq_f32(res_f32, pow2);
+}
+
 inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
 {
     const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
@@ -574,10 +967,10 @@
     const int32x4x4_t res_s32 =
     {
         {
-            vmovl_s16(vget_low_s16(res_s16.val[0])),
-            vmovl_s16(vget_high_s16(res_s16.val[0])),
-            vmovl_s16(vget_low_s16(res_s16.val[1])),
-            vmovl_s16(vget_high_s16(res_s16.val[1])),
+            vmovl_s16(vget_low_qs16(res_s16.val[0])),
+            vmovl_s16(vget_high_qs16(res_s16.val[0])),
+            vmovl_s16(vget_low_qs16(res_s16.val[1])),
+            vmovl_s16(vget_high_qs16(res_s16.val[1])),
         }
     };
 
@@ -599,18 +992,44 @@
     return res_f32;
 }
 
+inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
+
+    const int32x4x2_t res_s32 =
+    {
+        {
+            vmovl_s16(vget_low_qs16(a)),
+            vmovl_s16(vget_high_qs16(a))
+        }
+    };
+
+    float32x4x2_t res_f32 =
+    {
+        {
+            vcvtq_f32_s32(res_s32.val[0]),
+            vcvtq_f32_s32(res_s32.val[1])
+        }
+    };
+
+    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
+    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
+
+    return res_f32;
+}
+
 inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
 {
     // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
-    const qint8x8_t const_48_over_17       = vdup_n_s8(0x7A >> (5 - fixed_point_position));    // 2.823
-    const qint8x8_t const_minus_32_over_17 = vdup_n_s8(-(0x3C >> (5 - fixed_point_position))); // -1.8823
-    const qint8x8_t const_one              = vdup_n_s8(1 << fixed_point_position);
+    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
+    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
+    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
 
     // Find shift value
     const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
     const qint8x8_t temp        = vshl_s8(a, shift_value);
 
-    qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_minus_32_over_17, fixed_point_position));
+    qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));
 
     uint8x8_t set_one = vcgt_s8(x, const_one);
     x                 = vbsl_s8(set_one, const_one, x);
@@ -623,18 +1042,44 @@
     return vshl_s8(x, shift_value);
 }
 
+inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
+    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
+    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
+    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);
+
+    // Find shift value
+    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(8), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
+    const qint16x4_t temp        = vshl_s16(a, shift_value);
+
+    qint16x4_t x = vadd_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
+
+    uint16x4_t set_one = vcgt_s16(x, const_one);
+    x                  = vbsl_s16(set_one, const_one, x);
+
+    // Use five iterations of Newton-Raphson  method to get the result
+    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vadd_s16(x, vmul_qs16(x, vsub_s16(const_one, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vshl_s16(x, shift_value);
+}
+
 inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
 {
     // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
-    const qint8x16_t const_48_over_17       = vdupq_n_s8(0x7A >> (5 - fixed_point_position));   // 2.823
-    const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823
-    const qint8x16_t const_one              = vdupq_n_s8(1 << fixed_point_position);
+    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
+    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823
+    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
 
     // Find shift value
     const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
     const qint8x16_t temp        = vshlq_s8(a, shift_value);
 
-    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
+    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));
 
     // Set initial guess to one if x > 1
     uint8x16_t set_one = vcgtq_s8(x, const_one);
@@ -648,18 +1093,45 @@
     return vshlq_s8(x, shift_value);
 }
 
+inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
+    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
+    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
+    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);
+
+    // Find shift value
+    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
+    const qint16x8_t temp        = vshlq_s16(a, shift_value);
+
+    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));
+
+    // Set initial guess to one if x > 1
+    uint16x8_t set_one = vcgtq_s16(x, const_one);
+    x                  = vbslq_s16(set_one, const_one, x);
+
+    // Use five iterations of Newton-Raphson  method to get the result
+    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vaddq_s16(x, vmulq_qs16(x, vsubq_s16(const_one, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vshlq_s16(x, shift_value);
+}
+
 inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
 {
     // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
-    const qint8x16_t const_48_over_17       = vdupq_n_s8(0x7A >> (5 - fixed_point_position));   // 2.823
-    const qint8x16_t const_minus_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823
-    const qint8x16_t const_one              = vdupq_n_s8(1 << fixed_point_position);
+    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
+    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // -1.8823
+    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
 
     // Find shift value
     const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
     const qint8x16_t temp        = vqshlq_s8(a, shift_value);
 
-    qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_minus_32_over_17, fixed_point_position));
+    qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));
 
     // Set initial guess to one if x > 1
     uint8x16_t set_one = vcgtq_s8(x, const_one);
@@ -673,18 +1145,55 @@
     return vqshlq_s8(x, shift_value);
 }
 
+inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
+    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A56 >> (13 - fixed_point_position)); // 2.823
+    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
+    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);
+
+    // Find shift value
+    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
+    const qint16x8_t temp        = vqshlq_s16(a, shift_value);
+
+    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));
+
+    // Set initial guess to one if x > 1
+    uint16x8_t set_one = vcgtq_s16(x, const_one);
+    x                  = vbslq_s16(set_one, const_one, x);
+
+    // Use five iterations of Newton-Raphson  method to get the result
+    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqaddq_s16(x, vqmulq_qs16(x, vqsubq_s16(const_one, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vqshlq_s16(x, shift_value);
+}
+
 inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
 {
     return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
 }
 
+inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
+{
+    return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
+}
+
 inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
 {
     return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
 }
 
+inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
+{
+    return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
+}
+
 template <bool   islog>
-inline qint8x8_t vtaylor_poly_qs8(int8x8_t a, int fixed_point_position)
+inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
 {
     const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
     const qint8x8_t const_one   = vdup_n_s8(1);
@@ -699,8 +1208,24 @@
     return res;
 }
 
+template <bool    islog>
+inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
+{
+    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
+    const qint16x4_t const_one   = vdup_n_s16(1);
+    const qint16x4_t A           = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
+    const qint16x4_t B           = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
+    const qint16x4_t C           = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
+    const qint16x4_t D           = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
+    const qint16x4_t x1          = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
+    const qint16x4_t x2          = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
+    const qint16x4_t x3          = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
+    const qint16x4_t res         = vmul_qs16(a, x3, fixed_point_position);
+    return res;
+}
+
 template <bool   islog>
-inline qint8x8_t vqtaylor_poly_qs8(int8x8_t a, int fixed_point_position)
+inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
 {
     const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
     const qint8x8_t const_one   = vdup_n_s8(1);
@@ -716,6 +1241,22 @@
 }
 
 template <bool    islog>
+inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
+{
+    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
+    const qint16x4_t const_one   = vdup_n_s16(1);
+    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
+    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
+    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
+    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
+    const qint16x4_t x1          = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
+    const qint16x4_t x2          = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
+    const qint16x4_t x3          = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
+    const qint16x4_t res         = vqmul_qs16(a, x3, fixed_point_position);
+    return res;
+}
+
+template <bool    islog>
 inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
 {
     const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
@@ -732,6 +1273,22 @@
 }
 
 template <bool    islog>
+inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
+{
+    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
+    const qint16x8_t const_one   = vdupq_n_s16(1);
+    const qint16x8_t A           = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
+    const qint16x8_t B           = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
+    const qint16x8_t C           = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
+    const qint16x8_t D           = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
+    const qint16x8_t x1          = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
+    const qint16x8_t x2          = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
+    const qint16x8_t x3          = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
+    const qint16x8_t res         = vmulq_qs16(a, x3, fixed_point_position);
+    return res;
+}
+
+template <bool    islog>
 inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
 {
     const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
@@ -747,6 +1304,22 @@
     return res;
 }
 
+template <bool    islog>
+inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
+{
+    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
+    const qint16x8_t const_one   = vdupq_n_s16(1);
+    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
+    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
+    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
+    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
+    const qint16x8_t x1          = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
+    const qint16x8_t x2          = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
+    const qint16x8_t x3          = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
+    const qint16x8_t res         = vqmulq_qs16(a, x3, fixed_point_position);
+    return res;
+}
+
 inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
 {
     const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
@@ -773,6 +1346,32 @@
     return poly;
 }
 
+inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
+{
+    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
+    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
+    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2)
+    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)
+
+    // Perform range reduction [-log(2),log(2)]
+    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)
+
+    // get decimal part from m
+    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));
+
+    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
+    alpha            = vqabs_qs16(vqsub_s16(a, alpha));
+
+    // Polynomial Approximation
+    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
+    poly            = vqadd_s16(poly, const_one);
+
+    // Reconstruct
+    poly = vqshl_s16(poly, dec_m);
+
+    return poly;
+}
+
 inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
 {
     const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
@@ -799,6 +1398,32 @@
     return poly;
 }
 
+inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
+{
+    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
+    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
+    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                       // ln(2)
+    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)
+
+    // Perform range reduction [-log(2),log(2)]
+    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)
+
+    // get decimal part from m
+    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));
+
+    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
+    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));
+
+    // Polynomial Approximation
+    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
+    poly            = vqaddq_s16(poly, const_one);
+
+    // Reconstruct
+    poly = vqshlq_s16(poly, dec_m);
+
+    return poly;
+}
+
 inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
 {
     const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
@@ -838,6 +1463,45 @@
     return poly;
 }
 
+inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
+{
+    const qint16x4_t const_one         = vdup_n_s16(1 << fixed_point_position);
+    const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
+    const qint16x4_t const_ln2         = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
+
+    // If 0 < a < 1, calculate log(1/x)
+    uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
+    qint16x4_t recip           = vdup_n_s16(0);
+    recip                      = vbsl_s16(calc_reciprocal, recip, a);
+
+    // Calculate reciprocal
+    recip = vrecip_qs16(recip, fixed_point_position);
+    a     = vbsl_s16(calc_reciprocal, recip, a);
+
+    // Get decimal part of a
+    qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
+    qint16x4_t dec_a       = vshl_s16(a, shift_value); // a >> fixed_point_position
+
+    // Get exponent of 2^n which is equal or less than dec_a
+    shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
+
+    // Get x to range (1, 2]
+    const qint16x4_t shift_value_neg = vneg_s16(shift_value);
+    const qint16x4_t temp            = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
+    const qint16x4_t sum             = vmul_s16(shift_value, const_one);
+
+    // Polynomial Approximation
+    qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
+
+    // Reconstruct
+    poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
+
+    // Set negative value for 0 < a < 1
+    poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
+
+    return poly;
+}
+
 inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
 {
     const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
@@ -877,6 +1541,45 @@
     return poly;
 }
 
+inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
+{
+    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
+    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
+    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
+
+    // If 0 < a < 1, calculate log(1/x)
+    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
+    qint16x8_t recip           = vdupq_n_s16(0);
+    recip                      = vbslq_s16(calc_reciprocal, a, recip);
+
+    // Calculate reciprocal
+    recip = vqrecipq_qs16(recip, fixed_point_position);
+    a     = vbslq_s16(calc_reciprocal, recip, a);
+
+    // Get decimal part of a
+    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
+    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position
+
+    // Get exponent of 2^n which is equal or less than dec_a
+    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));
+
+    // Get x to range (1, 2]
+    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
+    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
+    const qint16x8_t sum             = vmulq_s16(shift_value, const_one);
+
+    // Polynomial Approximation
+    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);
+
+    // Reconstruct
+    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);
+
+    // Set negative value for 0 < a < 1
+    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);
+
+    return poly;
+}
+
 inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
 {
     const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
@@ -904,17 +1607,46 @@
     return vshl_s8(x, shift_value2);
 }
 
+inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
+{
+    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
+    qint16x4_t temp         = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
+    uint16x4_t temp_ltz     = vclt_s16(temp, vdup_n_qs16(0));
+    temp                    = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
+    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));
+
+    temp = vshl_s16(a, shift_value);
+
+    // Initial guess
+    qint16x4_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After five iterations we have the result for 8 bit
+    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshl_s16(x, shift_value2);
+}
+
 inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
 {
     const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
 
     // Find shift value. Number must be in (0.5, 2) range.
-    qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+    qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
 
     // Add one when the shift value is negative in order to get the correct result when we shift right with 1
-    qint8x8_t temp         = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
+    qint8x8_t temp         = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
     uint8x8_t temp_ltz     = vclt_s8(temp, vdup_n_qs8(0));
-    temp                   = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
+    temp                   = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
     qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
 
     temp = vshl_s8(a, shift_value);
@@ -931,6 +1663,35 @@
     return vshl_s8(x, shift_value2);
 }
 
+inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
+{
+    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint16x4_t shift_value = vneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
+    qint16x4_t temp         = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
+    uint16x4_t temp_ltz     = vclt_s16(temp, vdup_n_qs16(0));
+    temp                    = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
+    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));
+
+    temp = vshl_s16(a, shift_value);
+
+    // Initial guess
+    qint16x4_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After five iterations we have the result for 16 bit
+    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vqshl_s16(x, shift_value2);
+}
+
 inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
 {
     const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
@@ -958,17 +1719,46 @@
     return vshlq_s8(x, shift_value2);
 }
 
+inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
+{
+    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
+    qint16x8_t temp         = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
+    uint16x8_t temp_ltz     = vcltq_s16(temp, vdupq_n_qs16(0));
+    temp                    = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
+    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));
+
+    temp = vshlq_s16(a, shift_value);
+
+    // Initial guess
+    qint16x8_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After five iterations we have the result for 16 bit
+    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshlq_s16(x, shift_value2);
+}
+
 inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
 {
     const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
 
     // Find shift value. Number must be in (0.5, 2) range.
-    qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+    qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
 
     // Add one when the shift value is negative in order to get the correct result when we shift right with 1
-    qint8x16_t temp         = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
+    qint8x16_t temp         = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
     uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
-    temp                    = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
+    temp                    = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
     qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
 
     temp = vshlq_s8(a, shift_value);
@@ -982,7 +1772,36 @@
     x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
     x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
 
-    return vshlq_s8(x, shift_value2);
+    return vqshlq_s8(x, shift_value2);
+}
+
+inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
+{
+    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint16x8_t shift_value = vnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
+    qint16x8_t temp         = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
+    uint16x8_t temp_ltz     = vcltq_s16(temp, vdupq_n_qs16(0));
+    temp                    = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
+    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));
+
+    temp = vqshlq_s16(a, shift_value);
+
+    // Initial guess
+    qint16x8_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After five iterations we have the result for 16 bit
+    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vqshlq_s16(x, shift_value2);
 }
 
 inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position)
@@ -998,6 +1817,19 @@
     return tanh;
 }
 
+inline qint16x4_t vtanh_qs16(qint16x4_t a, int fixed_point_position)
+{
+    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
+    const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
+
+    qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
+    qint16x4_t num   = vqsub_qs16(exp2x, const_one);
+    qint16x4_t den   = vqadd_qs16(exp2x, const_one);
+    qint16x4_t tanh  = vqmul_qs16(num, vrecip_qs16(den, fixed_point_position), fixed_point_position);
+
+    return tanh;
+}
+
 inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position)
 {
     const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
@@ -1027,4 +1859,17 @@
     };
     return res;
 }
+
+inline qint16x8_t vtanhq_qs16(qint16x8_t a, int fixed_point_position)
+{
+    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
+    const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
+
+    qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
+    qint16x8_t num   = vqsubq_qs16(exp2x, const_one);
+    qint16x8_t den   = vqaddq_qs16(exp2x, const_one);
+    qint16x8_t tanh  = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
+
+    return tanh;
+}
 }
diff --git a/tests/Utils.h b/tests/Utils.h
index a6181b3..25023d4 100644
--- a/tests/Utils.h
+++ b/tests/Utils.h
@@ -499,6 +499,7 @@
             *reinterpret_cast<uint16_t *>(ptr) = value;
             break;
         case DataType::S16:
+        case DataType::QS16:
             *reinterpret_cast<int16_t *>(ptr) = value;
             break;
         case DataType::U32:
diff --git a/tests/validation/FixedPoint.h b/tests/validation/FixedPoint.h
index 380bad0..fa99ff8 100644
--- a/tests/validation/FixedPoint.h
+++ b/tests/validation/FixedPoint.h
@@ -756,8 +756,9 @@
         fixed_point<T>       a          = shift < 0 ? shift_left(x, -shift) : shift_right(x, shift);
         const fixed_point<T> x_half     = shift_right(a, 1);
 
-        // We need three iterations to find the result
-        for(int i = 0; i < 3; ++i)
+        // We need three iterations to find the result for QS8 and five for QS16
+        constexpr int num_iterations = std::is_same<T, int8_t>::value ? 3 : 5;
+        for(int i = 0; i < num_iterations; ++i)
         {
             a = mul(a, sub(three_half, mul(x_half, mul(a, a))));
         }
diff --git a/tests/validation/NEON/Fixedpoint/Exp_QS16.cpp b/tests/validation/NEON/Fixedpoint/Exp_QS16.cpp
new file mode 100644
index 0000000..e6d7d86
--- /dev/null
+++ b/tests/validation/NEON/Fixedpoint/Exp_QS16.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/ReferenceCPP.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance = 1.0f; /**< Tolerance value for comparing reference's output against implementation's output */
+
+/** Compute Neon exponential function for signed 16 bit fixed point.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_exp_qs16(const TensorShape &shape, int fixed_point_position)
+{
+    // Create tensors
+    Tensor src = create_tensor(shape, DataType::QS16, 1, fixed_point_position);
+    Tensor dst = create_tensor(shape, DataType::QS16, 1, fixed_point_position);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    Window                 window                            = calculate_max_window(*src.info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(src.info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(dst.info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(window, input_access, output_access);
+    output_access.set_valid_region(window, src.info()->valid_region());
+
+    // Allocate tensors
+    src.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    BOOST_TEST(!src.info()->is_resizable());
+    BOOST_TEST(!dst.info()->is_resizable());
+
+    // Fill tensors. Keep the range between [-1.0, 1.0) so the result won't
+    // overflow.
+    std::uniform_int_distribution<> distribution(-(1 << (fixed_point_position - 1)), (1 << (fixed_point_position - 1)));
+    library->fill(NEAccessor(src), distribution, 0);
+
+    Iterator input(&src, window);
+    Iterator output(&dst, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        qint16x8_t in = vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr()));
+        // Use saturated exp
+        vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqexpq_qs16(in, fixed_point_position));
+    },
+    input, output);
+
+    return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(FixedPoint)
+BOOST_AUTO_TEST_SUITE(QS16)
+BOOST_AUTO_TEST_SUITE(Exp)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunSmall, Small1DShape() * boost::unit_test::data::xrange(1, 15), shape, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_exp_qs16(shape, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_fixed_point_operation(shape, DataType::QS16, DataType::QS16, FixedPointOp::EXP, fixed_point_position);
+
+    // Validate output
+    validate(NEAccessor(dst), ref_dst, tolerance, 0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Fixedpoint/Exp_QS8.cpp b/tests/validation/NEON/Fixedpoint/Exp_QS8.cpp
index 086314f..f8fc0c2 100644
--- a/tests/validation/NEON/Fixedpoint/Exp_QS8.cpp
+++ b/tests/validation/NEON/Fixedpoint/Exp_QS8.cpp
@@ -78,9 +78,9 @@
     BOOST_TEST(!src.info()->is_resizable());
     BOOST_TEST(!dst.info()->is_resizable());
 
-    // Fill tensors. Keep the range between (1, (1 << (fixed_point_position - 1))) so the result won't
+    // Fill tensors. Keep the range between [-1.0, 1.0) so the result won't
     // overflow. E.g. e^7 = 1096, which cannot be represented in QS8
-    std::uniform_int_distribution<> distribution(1, (1 << (fixed_point_position - 1)));
+    std::uniform_int_distribution<> distribution(-(1 << (fixed_point_position - 1)), (1 << (fixed_point_position - 1)));
     library->fill(NEAccessor(src), distribution, 0);
 
     Iterator input(&src, window);
diff --git a/tests/validation/NEON/Fixedpoint/Invsqrt_QS16.cpp b/tests/validation/NEON/Fixedpoint/Invsqrt_QS16.cpp
new file mode 100644
index 0000000..9211ccf
--- /dev/null
+++ b/tests/validation/NEON/Fixedpoint/Invsqrt_QS16.cpp
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/ReferenceCPP.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance = 5.0f; /**< Tolerance value for comparing reference's output against implementation's output */
+
+/** Compute Neon inverse square root function for signed 16 bit fixed point.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_invsqrt_qs16(const TensorShape &shape, int fixed_point_position)
+{
+    // Create tensors
+    Tensor src = create_tensor(shape, DataType::QS16, 1, fixed_point_position);
+    Tensor dst = create_tensor(shape, DataType::QS16, 1, fixed_point_position);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    Window                 window                            = calculate_max_window(*src.info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(src.info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(dst.info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(window, input_access, output_access);
+    output_access.set_valid_region(window, src.info()->valid_region());
+
+    // Allocate tensors
+    src.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    BOOST_TEST(!src.info()->is_resizable());
+    BOOST_TEST(!dst.info()->is_resizable());
+
+    // Fill tensors. Keep the range between [1, 0x7FFF)
+    std::uniform_int_distribution<> distribution(1, 0x7FFF);
+    library->fill(NEAccessor(src), distribution, 0);
+
+    Iterator input(&src, window);
+    Iterator output(&dst, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        qint16x8_t in = vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr()));
+        vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqinvsqrtq_qs16(in, fixed_point_position));
+    },
+    input, output);
+
+    return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(FixedPoint)
+BOOST_AUTO_TEST_SUITE(QS16)
+BOOST_AUTO_TEST_SUITE(Invsqrt)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunSmall, Small1DShape() * boost::unit_test::data::xrange(1, 14), shape, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_invsqrt_qs16(shape, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_fixed_point_operation(shape, DataType::QS16, DataType::QS16, FixedPointOp::INV_SQRT, fixed_point_position);
+
+    // Validate output
+    validate(NEAccessor(dst), ref_dst, tolerance, 0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Fixedpoint/Invsqrt_QS8.cpp b/tests/validation/NEON/Fixedpoint/Invsqrt_QS8.cpp
index 3308f7d..ab63cbe 100644
--- a/tests/validation/NEON/Fixedpoint/Invsqrt_QS8.cpp
+++ b/tests/validation/NEON/Fixedpoint/Invsqrt_QS8.cpp
@@ -49,7 +49,7 @@
 
 namespace
 {
-const float tolerance = 3; /**< Tolerance value for comparing reference's output against implementation's output */
+const float tolerance = 4.0f; /**< Tolerance value for comparing reference's output against implementation's output */
 
 /** Compute Neon inverse square root function for signed 8bit fixed point.
  *
@@ -78,9 +78,8 @@
     BOOST_TEST(!src.info()->is_resizable());
     BOOST_TEST(!dst.info()->is_resizable());
 
-    // Fill tensors. Keep the range between (32, 127) so the result won't
-    // overflow. E.g. for Q2.5 invsqrt(0.001) = 31.6, which cannot be represented.
-    std::uniform_int_distribution<> distribution(32, 127);
+    // Fill tensors. Keep the range between [1, 127).
+    std::uniform_int_distribution<> distribution(1, 127);
     library->fill(NEAccessor(src), distribution, 0);
 
     Iterator input(&src, window);
@@ -89,7 +88,7 @@
     execute_window_loop(window, [&](const Coordinates & id)
     {
         qint8x16_t in = vld1q_s8(reinterpret_cast<const qint8_t *>(input.ptr()));
-        vst1q_s8(reinterpret_cast<qint8_t *>(output.ptr()), vinvsqrtq_qs8(in, fixed_point_position));
+        vst1q_s8(reinterpret_cast<qint8_t *>(output.ptr()), vqinvsqrtq_qs8(in, fixed_point_position));
     },
     input, output);
 
diff --git a/tests/validation/NEON/Fixedpoint/Log_QS16.cpp b/tests/validation/NEON/Fixedpoint/Log_QS16.cpp
new file mode 100644
index 0000000..c23d127
--- /dev/null
+++ b/tests/validation/NEON/Fixedpoint/Log_QS16.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/ReferenceCPP.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance = 7.0f; /**< Tolerance value for comparing reference's output against implementation's output */
+
+/** Compute Neon logarithm function for signed 16 bit fixed point.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_log_qs16(const TensorShape &shape, int fixed_point_position)
+{
+    // Create tensors
+    Tensor src = create_tensor(shape, DataType::QS16, 1, fixed_point_position);
+    Tensor dst = create_tensor(shape, DataType::QS16, 1, fixed_point_position);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    Window                 window                            = calculate_max_window(*src.info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(src.info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(dst.info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(window, input_access, output_access);
+    output_access.set_valid_region(window, src.info()->valid_region());
+
+    // Allocate tensors
+    src.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    BOOST_TEST(!src.info()->is_resizable());
+    BOOST_TEST(!dst.info()->is_resizable());
+
+    // Fill tensors. Keep the range between [(1 << (fixed_point_position - 1), 0x3FFF) so the result won't
+    // overflow.
+    std::uniform_int_distribution<> distribution((1 << (fixed_point_position - 1)), 0x3FFF);
+    library->fill(NEAccessor(src), distribution, 0);
+
+    Iterator input(&src, window);
+    Iterator output(&dst, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        qint16x8_t in = vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr()));
+        vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vlogq_qs16(in, fixed_point_position));
+    },
+    input, output);
+
+    return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(FixedPoint)
+BOOST_AUTO_TEST_SUITE(QS16)
+BOOST_AUTO_TEST_SUITE(Log)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunSmall, Small1DShape() * boost::unit_test::data::xrange(4, 14), shape, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_log_qs16(shape, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_fixed_point_operation(shape, DataType::QS16, DataType::QS16, FixedPointOp::LOG, fixed_point_position);
+
+    // Validate output
+    validate(NEAccessor(dst), ref_dst, tolerance, 0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Fixedpoint/Log_QS8.cpp b/tests/validation/NEON/Fixedpoint/Log_QS8.cpp
index 7b734c1..6789ec7 100644
--- a/tests/validation/NEON/Fixedpoint/Log_QS8.cpp
+++ b/tests/validation/NEON/Fixedpoint/Log_QS8.cpp
@@ -78,9 +78,9 @@
     BOOST_TEST(!src.info()->is_resizable());
     BOOST_TEST(!dst.info()->is_resizable());
 
-    // Fill tensors. Keep the range between ((1 << (fixed_point_position - 1), 63) so the result won't
+    // Fill tensors. Keep the range between [(1 << (fixed_point_position - 1), 63) so the result won't
     // overflow. E.g. for Q2.5 ln(0.001) = -6.9, which cannot be represented.
-    std::uniform_int_distribution<> distribution((1 << (fixed_point_position - 1)), 63);
+    std::uniform_int_distribution<> distribution((1 << (fixed_point_position - 1)), 0x3F);
     library->fill(NEAccessor(src), distribution, 0);
 
     Iterator input(&src, window);
diff --git a/tests/validation/NEON/Fixedpoint/Reciprocal_QS16.cpp b/tests/validation/NEON/Fixedpoint/Reciprocal_QS16.cpp
new file mode 100644
index 0000000..c66cf0e
--- /dev/null
+++ b/tests/validation/NEON/Fixedpoint/Reciprocal_QS16.cpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "Globals.h"
+#include "NEON/Helper.h"
+#include "NEON/NEAccessor.h"
+#include "TensorLibrary.h"
+#include "TypePrinter.h"
+#include "Utils.h"
+#include "validation/Datasets.h"
+#include "validation/ReferenceCPP.h"
+#include "validation/Validation.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/NEON/NEFixedPoint.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "arm_compute/runtime/TensorAllocator.h"
+
+#include "boost_wrapper.h"
+
+#include <random>
+#include <string>
+
+using namespace arm_compute;
+using namespace arm_compute::test;
+using namespace arm_compute::test::neon;
+using namespace arm_compute::test::validation;
+
+namespace
+{
+const float tolerance = 9.0f; /**< Tolerance value for comparing reference's output against implementation's output. */
+
+/** Compute Neon reciprocal function for signed 16 bit fixed point.
+ *
+ * @param[in] shape Shape of the input and output tensors.
+ *
+ * @return Computed output tensor.
+ */
+Tensor compute_reciprocal_qs16(const TensorShape &shape, int fixed_point_position)
+{
+    // Create tensors
+    Tensor src = create_tensor(shape, DataType::QS16, 1, fixed_point_position);
+    Tensor dst = create_tensor(shape, DataType::QS16, 1, fixed_point_position);
+
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    Window                 window                            = calculate_max_window(*src.info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal input_access(src.info(), 0, num_elems_processed_per_iteration);
+    AccessWindowHorizontal output_access(dst.info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(window, input_access, output_access);
+    output_access.set_valid_region(window, src.info()->valid_region());
+
+    // Allocate tensors
+    src.allocator()->allocate();
+    dst.allocator()->allocate();
+
+    BOOST_TEST(!src.info()->is_resizable());
+    BOOST_TEST(!dst.info()->is_resizable());
+
+    // Fill tensors. Keep the range between [15, 0x7FFF) so the result won't
+    // overflow.
+    std::uniform_int_distribution<> distribution(15, 0x7FFF);
+    library->fill(NEAccessor(src), distribution, 0);
+
+    Iterator input(&src, window);
+    Iterator output(&dst, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        qint16x8_t in = vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr()));
+        vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), vqrecipq_qs16(in, fixed_point_position));
+    },
+    input, output);
+
+    return dst;
+}
+} // namespace
+
+#ifndef DOXYGEN_SKIP_THIS
+BOOST_AUTO_TEST_SUITE(NEON)
+BOOST_AUTO_TEST_SUITE(FixedPoint)
+BOOST_AUTO_TEST_SUITE(QS16)
+BOOST_AUTO_TEST_SUITE(Reciprocal)
+
+BOOST_TEST_DECORATOR(*boost::unit_test::label("precommit") * boost::unit_test::label("nightly"))
+BOOST_DATA_TEST_CASE(RunSmall, Small1DShape() * boost::unit_test::data::xrange(1, 14), shape, fixed_point_position)
+{
+    // Compute function
+    Tensor dst = compute_reciprocal_qs16(shape, fixed_point_position);
+
+    // Compute reference
+    RawTensor ref_dst = Reference::compute_reference_fixed_point_operation(shape, DataType::QS16, DataType::QS16, FixedPointOp::RECIPROCAL, fixed_point_position);
+
+    // Validate output
+    validate(NEAccessor(dst), ref_dst, tolerance, 0);
+}
+
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+BOOST_AUTO_TEST_SUITE_END()
+#endif
diff --git a/tests/validation/NEON/Fixedpoint/Reciprocal_QS8.cpp b/tests/validation/NEON/Fixedpoint/Reciprocal_QS8.cpp
index 4c1c782..f1f130a 100644
--- a/tests/validation/NEON/Fixedpoint/Reciprocal_QS8.cpp
+++ b/tests/validation/NEON/Fixedpoint/Reciprocal_QS8.cpp
@@ -78,9 +78,9 @@
     BOOST_TEST(!src.info()->is_resizable());
     BOOST_TEST(!dst.info()->is_resizable());
 
-    // Fill tensors. Keep the range between (15, 100) so the result won't
+    // Fill tensors. Keep the range between [15, 100) so the result won't
     // overflow. E.g. for Q2.5 reciprocal(0.001) = 1000, which cannot be represented.
-    std::uniform_int_distribution<> distribution(15, 100);
+    std::uniform_int_distribution<> distribution(15, 0x7F);
     library->fill(NEAccessor(src), distribution, 0);
 
     Iterator input(&src, window);
diff --git a/tests/validation/Reference.cpp b/tests/validation/Reference.cpp
index be6ddba..761bc20 100644
--- a/tests/validation/Reference.cpp
+++ b/tests/validation/Reference.cpp
@@ -657,20 +657,20 @@
     switch(op)
     {
         case(FixedPointOp::INV_SQRT):
-            min = 32;
-            max = 127;
+            min = 1;
+            max = (dt_in == DataType::QS8) ? 0x7F : 0x7FFF;
             break;
         case(FixedPointOp::LOG):
             min = (1 << (fixed_point_position - 1));
-            max = 63;
+            max = (dt_in == DataType::QS8) ? 0x3F : 0x3FFF;
             break;
         case(FixedPointOp::EXP):
-            min = 1;
+            min = -(1 << (fixed_point_position - 1));
             max = (1 << (fixed_point_position - 1));
             break;
         case(FixedPointOp::RECIPROCAL):
             min = 15;
-            max = 100;
+            max = (dt_in == DataType::QS8) ? 0x7F : 0x7FFF;
             break;
         default:
             ARM_COMPUTE_ERROR("Fixed point operation not supported");
diff --git a/tests/validation/TensorFactory.h b/tests/validation/TensorFactory.h
index 610425b..d842bad 100644
--- a/tests/validation/TensorFactory.h
+++ b/tests/validation/TensorFactory.h
@@ -83,6 +83,7 @@
                 v                    = Tensor<uint16_t>(shape, dt, fixed_point_position, reinterpret_cast<value_type_u16 *>(data));
                 break;
             case DataType::S16:
+            case DataType::QS16:
                 using value_type_s16 = typename match_const<R, int16_t>::type;
                 v                    = Tensor<int16_t>(shape, dt, fixed_point_position, reinterpret_cast<value_type_s16 *>(data));
                 break;
diff --git a/tests/validation/Validation.cpp b/tests/validation/Validation.cpp
index 8aada0c..0092e4d 100644
--- a/tests/validation/Validation.cpp
+++ b/tests/validation/Validation.cpp
@@ -78,6 +78,8 @@
             return *reinterpret_cast<const uint16_t *>(ptr);
         case DataType::S16:
             return *reinterpret_cast<const int16_t *>(ptr);
+        case DataType::QS16:
+            return *reinterpret_cast<const qint16_t *>(ptr);
         case DataType::U32:
             return *reinterpret_cast<const uint32_t *>(ptr);
         case DataType::S32: