COMPMID-970: Remove QS8/QS16 support

Removed all code related to the QS8/QS16 fixed point formats.
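
For context, the removed QS8/QS16 formats stored a real number as an integer
scaled by 2^fixed_point_position. A minimal scalar sketch of that
representation (illustrative only; these helpers are hypothetical and are not
the removed library API):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Hypothetical helper: encode a float as Qn 8 bit fixed point, i.e.
    // round(x * 2^n) saturated to the int8_t range.
    int8_t float_to_qs8(float x, int fixed_point_position)
    {
        const float scaled = std::round(x * static_cast<float>(1 << fixed_point_position));
        return static_cast<int8_t>(std::max(-128.0f, std::min(127.0f, scaled)));
    }

    // Hypothetical helper: decode a Qn 8 bit fixed point value back to float.
    float qs8_to_float(int8_t q, int fixed_point_position)
    {
        return static_cast<float>(q) / static_cast<float>(1 << fixed_point_position);
    }

For example, with fixed_point_position = 5 (Q2.5), 1.5f is stored as 48 (0x30).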

Change-Id: I487acf138dace3b0450e0d72ca7071eaec254566
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137678
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
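
The removed intrinsics layered fixed point semantics on top of the plain
integer NEON operations: widening multiplies had to be rescaled back by
fixed_point_position and narrowed with saturation. A scalar sketch of what a
saturating Qn multiply such as vqmul_qs8 computes per lane (an assumption
drawn from the doc comments below, not the removed implementation):

    #include <cstdint>

    // Saturating Qn multiply: widen, multiply, round, shift back by n,
    // saturate to the int8_t range. Assumes fixed_point_position >= 1.
    int8_t qs8_mul_sat(int8_t a, int8_t b, int fixed_point_position)
    {
        const int32_t rounding = 1 << (fixed_point_position - 1);
        int32_t res = (static_cast<int32_t>(a) * static_cast<int32_t>(b) + rounding)
                      >> fixed_point_position;
        res = res > 127 ? 127 : (res < -128 ? -128 : res);
        return static_cast<int8_t>(res);
    }
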
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 504ec6c..ce64a8e 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -24,1194 +24,10 @@
 #ifndef __ARM_COMPUTE_NEFIXEDPOINT_H__
 #define __ARM_COMPUTE_NEFIXEDPOINT_H__
 
-#include "arm_compute/core/FixedPoint.h"
-
 #include <arm_neon.h>
 
 namespace arm_compute
 {
-using qint8x8_t    = int8x8_t;    /**< 8 bit fixed point vector with 8 elements */
-using qint8x8x2_t  = int8x8x2_t;  /**< 8 bit fixed point vector with 16 elements */
-using qint8x8x3_t  = int8x8x3_t;  /**< 8 bit fixed point vector with 24 elements */
-using qint8x8x4_t  = int8x8x4_t;  /**< 8 bit fixed point vector with 32 elements */
-using qint8x16_t   = int8x16_t;   /**< 8 bit fixed point vector with 16 elements */
-using qint8x16x2_t = int8x16x2_t; /**< 8 bit fixed point vector with 32 elements */
-using qint8x16x3_t = int8x16x3_t; /**< 8 bit fixed point vector with 48 elements */
-using qint8x16x4_t = int8x16x4_t; /**< 8 bit fixed point vector with 64 elements */
-using qint16x4_t   = int16x4_t;   /**< 16 bit fixed point vector with 4 elements */
-using qint16x4x2_t = int16x4x2_t; /**< 16 bit fixed point vector with 8 elements */
-using qint16x4x3_t = int16x4x3_t; /**< 16 bit fixed point vector with 12 elements */
-using qint16x4x4_t = int16x4x4_t; /**< 16 bit fixed point vector with 16 elements */
-using qint16x8_t   = int16x8_t;   /**< 16 bit fixed point vector with 8 elements */
-using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */
-using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */
-using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
-using qint32x2_t   = int32x2_t;   /**< 32 bit fixed point vector with 2 elements */
-using qint32x4_t   = int32x4_t;   /**< 32 bit fixed point vector with 4 elements */
-using qint32x4x2_t = int32x4x2_t; /**< 32 bit fixed point vector with 8 elements */
-
-/** Get the lower half of a 16 elements vector
- *
- * @param[in] a vector of 16 elements
- *
- * @return 8 bit fixed point vector (8 elements)
- */
-qint8x8_t vget_low_qs8(qint8x16_t a);
-
-/** Get the lower half of an 8 elements vector
- *
- * @param[in] a vector of 8 elements
- *
- * @return 16 bit fixed point vector (4 elements)
- */
-qint16x4_t vget_low_qs16(qint16x8_t a);
-
-/** Get the higher half of a 16 elements vector
- *
- * @param[in] a vector of 16 elements
- *
- * @return 8 bit fixed point vector (8 elements)
- */
-qint8x8_t vget_high_qs8(qint8x16_t a);
-
-/** Get the higher half of an 8 elements vector
- *
- * @param[in] a vector of 8 elements
- *
- * @return 16 bit fixed point vector (4 elements)
- */
-qint16x4_t vget_high_qs16(qint16x8_t a);
-
-/** Load a single 8 bit fixed point vector from memory (8 elements)
- *
- * @param[in] addr Memory address of the 8 bit fixed point vector to load
- *
- * @return 8 bit fixed point vector (8 elements)
- */
-qint8x8_t vld1_qs8(const qint8_t *addr);
-
-/** Load a single 16 bit fixed point vector from memory (4 elements)
- *
- * @param[in] addr Memory address of the 16 bit fixed point vector to load
- *
- * @return 16 bit fixed point vector (4 elements)
- */
-qint16x4_t vld1_qs16(const qint16_t *addr);
-
-/** Load a single 8 bit fixed point vector from memory (16 elements)
- *
- * @param[in] addr Memory address of the 8 bit fixed point vector to load
- *
- * @return 8 bit fixed point vector (16 elements)
- */
-qint8x16_t vld1q_qs8(const qint8_t *addr);
-
-/** Load a single 16 bit fixed point vector from memory (8 elements)
- *
- * @param[in] addr Memory address of the 16 bit fixed point vector to load
- *
- * @return 16 bit fixed point vector (8 elements)
- */
-qint16x8_t vld1q_qs16(const qint16_t *addr);
-
-/** Load all lanes of an 8 bit fixed point vector with the same value from memory (8 elements)
- *
- * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
- *
- * @return 8 bit fixed point vector (8 elements)
- */
-qint8x8_t vld1_dup_qs8(const qint8_t *addr);
-
-/** Load all lanes of a 16 bit fixed point vector with the same value from memory (4 elements)
- *
- * @param[in] addr Memory address of the 16 bit fixed point scalar value to load
- *
- * @return 16 bit fixed point vector (4 elements)
- */
-qint16x4_t vld1_dup_qs16(const qint16_t *addr);
-
-/** Load all lanes of an 8 bit fixed point vector with the same value from memory (16 elements)
- *
- * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
- *
- * @return 8 bit fixed point vector (16 elements)
- */
-qint8x16_t vld1q_dup_qs8(const qint8_t *addr);
-
-/** Load all lanes of a 16 bit fixed point vector with the same value from memory (8 elements)
- *
- * @param[in] addr Memory address of the 16 bit fixed point scalar value to load
- *
- * @return 16 bit fixed point vector (8 elements)
- */
-qint16x8_t vld1q_dup_qs16(const qint16_t *addr);
-
-/** Load two 16 bit fixed point vectors from memory (8x2 elements)
- *
- * @param[in] addr Memory address of the 16 bit fixed point vectors to load
- *
- * @return 16 bit fixed point vectors (8x2 elements)
- */
-qint16x8x2_t vld2q_qs16(const qint16_t *addr);
-
-/** Store a single 8 bit fixed point vector to memory (8 elements)
- *
- * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
- * @param[in] b    8 bit fixed point vector to store
- *
- */
-void vst1_qs8(qint8_t *addr, qint8x8_t b);
-
-/** Store a single 16 bit fixed point vector to memory (4 elements)
- *
- * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
- * @param[in] b    16 bit fixed point vector to store
- *
- */
-void vst1_qs16(qint16_t *addr, qint16x4_t b);
-
-/** Store a single 8 bit fixed point vector to memory (16 elements)
- *
- * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
- * @param[in] b    8 bit fixed point vector to store
- *
- */
-void vst1q_qs8(qint8_t *addr, qint8x16_t b);
-
-/** Store a single 16 bit fixed point vector to memory (8 elements)
- *
- * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
- * @param[in] b    16 bit fixed point vector to store
- *
- */
-void vst1q_qs16(qint16_t *addr, qint16x8_t b);
-
-/** Store two 16 bit fixed point vectors to memory (8x2 elements)
- *
- * @param[in] addr Memory address where the 16 bit fixed point vectors should be stored
- * @param[in] b    16 bit fixed point vectors to store
- *
- */
-void vst2q_qs16(qint16_t *addr, qint16x8x2_t b);
-
-/** 16 bit fixed point vector saturating narrow (8 elements)
- *
- * @param[in] a 16 bit fixed point vector to convert
- *
- * @return 8 bit fixed point vector
- */
-qint8x8_t vqmovn_qs16(qint16x8_t a);
-
-/** 32 bit fixed point vector saturating narrow (4 elements)
- *
- * @param[in] a 32 bit fixed point vector to convert
- *
- * @return 16 bit fixed point vector
- */
-qint16x4_t vqmovn_qs32(qint32x4_t a);
-
-/** 8 bit fixed point vector duplicate (8 elements)
- *
- * @param[in] a 8 bit fixed point value to duplicate
- *
- * @return The result of the vector duplication
- */
-qint8x8_t vdup_n_qs8(qint8_t a);
-
-/** 16 bit fixed point vector duplicate (4 elements)
- *
- * @param[in] a 16 bit fixed point value to duplicate
- *
- * @return The result of the vector duplication
- */
-qint16x4_t vdup_n_qs16(qint16_t a);
-
-/** 8 bit fixed point vector duplicate (16 elements)
- *
- * @param[in] a 8 bit fixed point value to duplicate
- *
- * @return The result of the vector duplication
- */
-qint8x16_t vdupq_n_qs8(qint8_t a);
-
-/** Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
- *
- * @param[in] a                    floating point value to convert and duplicate
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the vector duplication
- */
-qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position);
-
-/** Duplicate a float and convert it to 16 bit fixed point vector (8 elements)
- *
- * @param[in] a                    floating point value to convert and duplicate
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the vector duplication
- */
-qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position);
-
-/** 16 bit fixed point vector duplicate (8 elements)
- *
- * @param[in] a 16 bit fixed point value to duplicate
- *
- * @return The result of the vector duplication
- */
-qint16x8_t vdupq_n_qs16(qint16_t a);
-
-/** Absolute value of 8 bit fixed point vector (8 elements)
- *
- * @param[in] a 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector absolute value
- */
-qint8x8_t vabs_qs8(qint8x8_t a);
-
-/** Absolute value of 16 bit fixed point vector (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector absolute value
- */
-qint16x4_t vabs_qs16(qint16x4_t a);
-
-/** Absolute value of 8 bit fixed point vector (16 elements)
- *
- * @param[in] a 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector absolute value
- */
-qint8x16_t vabsq_qs8(qint8x16_t a);
-
-/** Absolute value of 16 bit fixed point vector (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector absolute value
- */
-qint16x8_t vabsq_qs16(qint16x8_t a);
-
-/** Saturating absolute value of 8 bit fixed point vector (8 elements)
- *
- * @param[in] a 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector absolute value
- */
-qint8x8_t vqabs_qs8(qint8x8_t a);
-
-/** Saturating absolute value of 16 bit fixed point vector (4 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector absolute value
- */
-qint16x4_t vqabs_qs16(qint16x4_t a);
-
-/** Saturating absolute value of 8 bit fixed point vector (16 elements)
- *
- * @param[in] a 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector absolute value
- */
-qint8x16_t vqabsq_qs8(qint8x16_t a);
-
-/** Saturating absolute value of 16 bit fixed point vector (8 elements)
- *
- * @param[in] a 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector absolute value
- */
-qint16x8_t vqabsq_qs16(qint16x8_t a);
-
-/** 8 bit fixed point vector max (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector max operation
- */
-qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector max (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector max operation
- */
-qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector max (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector max operation
- */
-qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector max (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector max operation
- */
-qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector pairwise max (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector pairwise max operation
- */
-qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector pairwise max (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector pairwise max operation
- */
-qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector min (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector min operation
- */
-qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector min (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector min operation
- */
-qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector min (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector min operation
- */
-qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector min (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector min operation
- */
-qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector pairwise min (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector pairwise min operation
- */
-qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector pairwise min (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector pairwise min operation
- */
-qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector add (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector addition
- */
-qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector add (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector addition
- */
-qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector add (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector addition
- */
-qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector add (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector addition
- */
-qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector saturating add (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
- */
-qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector saturating add (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
- */
-qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector saturating add (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
- */
-qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector saturating add (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
- */
-qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector pairwise add long (8 elements)
- *
- * @param[in] a 8 bit fixed point input vector
- *
- * @return The 16 bit fixed point vector resulting from the pairwise addition of adjacent elements of @p a
- */
-qint16x4_t vpaddl_qs8(qint8x8_t a);
-
-/** 8 bit fixed point vector subtraction (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector subtraction
- */
-qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector subtraction (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector subtraction
- */
-qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector subtraction (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector subtraction
- */
-qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector subtraction (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector subtraction
- */
-qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector saturating subtraction (8 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
- */
-qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b);
-
-/** 16 bit fixed point vector saturating subtraction (4 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector subtraction. The result is saturated in case of overflow
- */
-qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b);
-
-/** 8 bit fixed point vector saturating subtraction (16 elements)
- *
- * @param[in] a First 8 bit fixed point input vector
- * @param[in] b Second 8 bit fixed point input vector
- *
- * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
- */
-qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b);
-
-/** 16 bit fixed point vector saturating subtraction (8 elements)
- *
- * @param[in] a First 16 bit fixed point input vector
- * @param[in] b Second 16 bit fixed point input vector
- *
- * @return The result of the 16 bit fixed point vector subtraction. The result is saturated in case of overflow
- */
-qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b);
-
-/** 8 bit fixed point vector multiply (8 elements)
- *
- * @param[in] a                    First 8 bit fixed point input vector
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiplication.
- */
-qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
-
-/** 16 bit fixed point vector multiply (4 elements)
- *
- * @param[in] a                    First 16 bit fixed point input vector
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiplication.
- */
-qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
-
-/** 8 bit fixed point vector multiply (16 elements)
- *
- * @param[in] a                    First 8 bit fixed point input vector
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiplication.
- */
-qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
-
-/** 16 bit fixed point vector multiply (8 elements)
- *
- * @param[in] a                    First 16 bit fixed point input vector
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiplication.
- */
-qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
-
-/** 8 bit fixed point vector saturating multiply (8 elements)
- *
- * @param[in] a                    First 8 bit fixed point input vector
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
- */
-qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
-
-/** 16 bit fixed point vector saturating multiply (4 elements)
- *
- * @param[in] a                    First 16 bit fixed point input vector
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiplication. The result is saturated in case of overflow
- */
-qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
-
-/** 8 bit fixed point vector saturating multiply (16 elements)
- *
- * @param[in] a                    First 8 bit fixed point input vector
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
- */
-qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
-
-/** 16 bit fixed point vector saturating multiply (8 elements)
- *
- * @param[in] a                    First 16 bit fixed point input vector
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiplication. The result is saturated in case of overflow
- */
-qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
-
-/** 8 bit fixed point vector long multiply (8 elements)
- *
- * @param[in] a                    First 8 bit fixed point input vector
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point long vector multiplication.
- */
-qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
-
-/** 16 bit fixed point vector long multiply (4 elements)
- *
- * @param[in] a                    First 16 bit fixed point input vector
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 32 bit fixed point long vector multiplication.
- */
-qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
-
-/** 8 bit fixed point vector multiply-accumulate (8 elements). This operation multiplies @p b by @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] c                    Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate
- */
-qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector multiply-accumulate (4 elements). This operation multiplies @p b by @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] c                    Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate
- */
-qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
-
-/** 8 bit fixed point vector multiply-accumulate (16 elements). This operation multiplies @p b by @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] c                    Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate
- */
-qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector multiply-accumulate (8 elements). This operation multiplies @p b by @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] c                    Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate
- */
-qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position);
-
-/** 8 bit fixed point vector saturating multiply-accumulate (8 elements). This operation multiplies @p b by @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] c                    Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
- */
-qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector saturating multiply-accumulate (4 elements). This operation multiplies @p b by @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] c                    Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
- */
-qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
-
-/** 8 bit fixed point vector saturating multiply-accumulate (16 elements). This operation multiplies @p b by @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] c                    Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
- */
-qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector saturating multiply-accumulate (8 elements). This operation multiplies @p b by @p c and adds the result to @p a (a + b * c).
- *
- * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] c                    Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
- */
-qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position);
-
-/** 8 bit fixed point vector multiply-accumulate long (8 elements).
- *  This operation multiplies @p b by @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
- *
- * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] c                    Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate long
- */
-qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector multiply-accumulate long (4 elements).
- *  This operation multiplies @p b by @p c and adds the result to the 32 bit fixed point vector @p a (a + b * c).
- *
- * @param[in] a                    First 32 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] c                    Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate long
- */
-qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
-
-/** 8 bit fixed point vector saturating multiply-accumulate long (8 elements). The saturation is performed on the 16 bit fixed point output vector.
- *  This operation multiplies @p b by @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
- *
- * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 8 bit fixed point input vector
- * @param[in] c                    Third 8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8 bit fixed point vector multiply-accumulate long
- */
-qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
-
-/** 16 bit fixed point vector saturating multiply-accumulate long (4 elements). The saturation is performed on the 32 bit fixed point output vector.
- *  This operation multiplies @p b by @p c and adds the result to the 32 bit fixed point vector @p a (a + b * c).
- *
- * @param[in] a                    First 32 bit fixed point input vector where the result of multiplication must be added to
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] c                    Third 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit fixed point vector multiply-accumulate long
- */
-qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position);
-
-/** Convert a float vector with 4x2 elements to 8 bit fixed point vector with 8 elements
- *
- * @param[in] a                    Float input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow
- */
-qint8x8_t vqcvt_qs8_f32(const float32x4x2_t a, int fixed_point_position);
-
-/** Convert a float vector with 4 elements to 16 bit fixed point vector with 4 elements
- *
- * @param[in] a                    Float input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow
- */
-qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position);
-
-/** Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements
- *
- * @param[in] a                    Float input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion float -> 8 bit fixed point. The result is saturated in case of overflow
- */
-qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
-
-/** Convert a float vector with 4x2 elements to 16 bit fixed point vector with 8 elements
- *
- * @param[in] a                    Float input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion float -> 16 bit fixed point. The result is saturated in case of overflow
- */
-qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position);
-
-/** Convert an 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements
- *
- * @param[in] a                    8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion 8 bit fixed point -> float32x4x2
- */
-float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Convert a 16 bit fixed point vector with 4 elements to a float vector with 4 elements
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion 16 bit fixed point -> float32x4
- */
-float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Convert an 8 bit fixed point vector with 16 elements to a float vector with 4x4 elements
- *
- * @param[in] a                    8 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion 8 bit fixed point -> float32x4x4
- */
-float32x4x4_t vcvtq_qs8_f32(qint8x16_t a, int fixed_point_position);
-
-/** Convert a 16 bit fixed point vector with 8 elements to a float vector with 4x2 elements
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the conversion 16 bit fixed point -> float32x4x2
- */
-float32x4x2_t vcvtq_qs16_f32(qint16x8_t a, int fixed_point_position);
-
-/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit reciprocal (1/a).
- */
-qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate reciprocal of a fixed point 16 bit number using the Newton-Raphson method. (4 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit reciprocal (1/a).
- */
-qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (16 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit reciprocal (1/a).
- */
-qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate reciprocal of a fixed point 16 bit number using the Newton-Raphson method. (8 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit reciprocal (1/a).
- */
-qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position);
-
-/** Division fixed point 8bit (8 elements)
- *
- * @param[in] a                    First 8bit fixed point input vector
- * @param[in] b                    Second 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The quotient in 8bit fixed point format.
- */
-qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
-
-/** Division fixed point 16 bit (4 elements)
- *
- * @param[in] a                    First 16 bit fixed point input vector
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The quotient in 16 bit fixed point format.
- */
-qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
-
-/** Division fixed point 8bit (16 elements)
- *
- * @param[in] a                    First 8bit fixed point input vector
- * @param[in] b                    Second 8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The quotient in 8bit fixed point format.
- */
-qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
-
-/** Division fixed point 16 bit (8 elements)
- *
- * @param[in] a                    First 16 bit fixed point input vector
- * @param[in] b                    Second 16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The quotient in 16 bit fixed point format.
- */
-qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
-
-/** Perform a 4th degree polynomial approximation. (8 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit Taylor approximation.
- */
-template <bool islog>
-qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Perform a 4th degree polynomial approximation. (4 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit Taylor approximation.
- */
-template <bool islog>
-qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Perform a 4th degree polynomial approximation. (16 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit Taylor approximation.
- */
-template <bool islog>
-qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Perform a 4th degree polynomial approximation. (8 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit Taylor approximation.
- */
-template <bool islog>
-qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position);
-
-/** Calculate saturating exponential fixed point 8bit (8 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit saturating exponential
- */
-qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate saturating exponential fixed point 16 bit (4 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit saturating exponential
- */
-qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate saturating exponential fixed point 8bit (16 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit saturating exponential
- */
-qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate saturating exponential fixed point 16 bit (8 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit saturating exponential
- */
-qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position);
-
-/** Calculate logarithm fixed point 8 bit (8 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit logarithm.
- */
-qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate logarithm fixed point 16 bit (4 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit logarithm.
- */
-qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate logarithm fixed point 8 bit (16 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit logarithm.
- */
-qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate logarithm fixed point 16 bit (8 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit logarithm.
- */
-qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position);
-
-/** Calculate inverse square root for fixed point 8bit using the Newton-Raphson method (8 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit inverse sqrt.
- */
-qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate inverse square root for fixed point 16 bit using the Newton-Raphson method (4 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit inverse sqrt.
- */
-qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate saturating inverse square root for fixed point 8bit using the Newton-Raphson method (8 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit inverse sqrt.
- */
-qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate saturating inverse square root for fixed point 16 bit using the Newton-Raphson method (4 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit inverse sqrt.
- */
-qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate inverse square root for fixed point 8bit using the Newton-Raphson method (16 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit inverse sqrt.
- */
-qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate inverse square root for fixed point 16 bit using the Newton-Raphson method (8 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit inverse sqrt.
- */
-qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position);
-
-/** Calculate saturating inverse square root for fixed point 8bit using the Newton-Raphson method (16 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit inverse sqrt.
- */
-qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate saturating inverse square root for fixed point 16 bit using the Newton-Raphson method (8 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16 bit inverse sqrt.
- */
-qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position);
-
-/** Calculate hyperbolic tangent for fixed point 8bit (8 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The calculated hyperbolic tangent.
- */
-qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position);
-
-/** Calculate hyperbolic tangent for fixed point 16 bit (4 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The calculated hyperbolic tangent.
- */
-qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position);
-
-/** Calculate hyperbolic tangent for fixed point 8bit (16 elements)
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The calculated hyperbolic tangent.
- */
-qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position);
-
-/** Calculate hyperbolic tangent for fixed point 16bit (8 elements)
- *
- * @param[in] a                    16 bit fixed point input vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The calculated hyperbolic tangent.
- */
-qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position);
-
-/** Calculate saturating power a^b for fixed point 8bit (16 elements).
- *
- * pow(a,b) = e^(b*log(a))
- *
- * @param[in] a                    8bit fixed point input vector
- * @param[in] b                    8bit fixed point power vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 8bit power.
- */
-qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
-
-/** Calculate saturating power a^b for fixed point 16bit (8 elements).
- *
- * pow(a,b) = e^(b*log(a))
- *
- * @param[in] a                    16bit fixed point input vector
- * @param[in] b                    16bit fixed point power vector
- * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
- *
- * @return The result of the 16bit power.
- */
-qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
-
 /** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements
  *
  * @param[in] a Float input vector
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index b86c3cb..14e51d8 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -26,1965 +26,7 @@
 
 namespace arm_compute
 {
-/** Exponent polynomial coefficients for 8 bit fixed point (8 elements)
- *  Format is in Q0.7 for all elements
- */
-static const std::array<qint8x8_t, 4> exp_tab_qs8 =
-{
-    {
-        vdup_n_s8(0x7F), // 0.9978546
-        vdup_n_s8(0x3F), // 0.4994721
-        vdup_n_s8(0x16), // 0.1763723
-        vdup_n_s8(0x05), // 0.0435108
-    }
-};
-
-/** Exponent polynomial coefficients for 16 bit fixed point (4 elements)
- *  Format is in Q0.15 for all elements
- */
-static const std::array<qint16x4_t, 4> exp_tab_qs16 =
-{
-    {
-        vdup_n_s16(0x7FBA), // 0.9978546
-        vdup_n_s16(0x3FE9), // 0.4994721
-        vdup_n_s16(0x1693), // 0.1763723
-        vdup_n_s16(0x0592), // 0.0435108
-    }
-};
-
-/** Exponent polynomial coefficients for 8 bit fixed point (16 elements)
- *  Format is in Q0.7 for all elements
- */
-static const std::array<qint8x16_t, 4> exp_tabq_qs8 =
-{
-    {
-        vdupq_n_s8(0x7F), // 0.9978546
-        vdupq_n_s8(0x3F), // 0.4994721
-        vdupq_n_s8(0x16), // 0.1763723
-        vdupq_n_s8(0x05), // 0.0435108
-    }
-};
-
-/** Exponent polynomial coefficients for 16 bit fixed point (8 elements)
- *  Format is in Q0.15 for all elements
- */
-static const std::array<qint16x8_t, 4> exp_tabq_qs16 =
-{
-    {
-        vdupq_n_s16(0x7FBA), // 0.9978546
-        vdupq_n_s16(0x3FE9), // 0.4994721
-        vdupq_n_s16(0x1693), // 0.1763723
-        vdupq_n_s16(0x0592), // 0.0435108
-    }
-};
-
-/** Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
- *  Format is in Q0.7 for all elements except the first one which is in Q1.6
- */
-static const std::array<qint8x8_t, 4> log_tab_qs8 =
-{
-    {
-        vdup_n_s8(0x5C),  // 1.4384189
-        vdup_n_s8(-0x56), // -0.6771900
-        vdup_n_s8(0x29),  // 0.3218538
-        vdup_n_s8(-0x0A), // -0.0832229
-    }
-};
-
-/** Logarithm polynomial coefficients for 16 bit fixed point (4 elements)
- *  Format is in Q0.15 for all elements except the first one which is in Q1.14
- */
-static const std::array<qint16x4_t, 4> log_tab_qs16 =
-{
-    {
-        vdup_n_s16(0x5C0F),  // 1.4384189
-        vdup_n_s16(-0x56AE), // -0.6771900
-        vdup_n_s16(0x2933),  // 0.3218538
-        vdup_n_s16(-0x0AA7), // -0.0832229
-    }
-};
-
-/** Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
- *  Format is in Q0.7 for all elements except the first one which is in Q1.6
- */
-static const std::array<qint8x16_t, 4> log_tabq_qs8 =
-{
-    {
-        vdupq_n_s8(0x5C),  // 1.4384189
-        vdupq_n_s8(-0x56), // -0.6771900
-        vdupq_n_s8(0x29),  // 0.3218538
-        vdupq_n_s8(-0x0A), // -0.0832229
-    }
-};
-
-/** Logarithm polynomial coefficients for 16 bit fixed point (8 elements)
- *  Format is in Q0.15 for all elements except the first one which is in Q1.14
- */
-static const std::array<qint16x8_t, 4> log_tabq_qs16 =
-{
-    {
-        vdupq_n_s16(0x5C0F),  // 1.4384189
-        vdupq_n_s16(-0x56AE), // -0.6771900
-        vdupq_n_s16(0x2933),  // 0.3218538
-        vdupq_n_s16(-0x0AA7), // -0.0832229
-    }
-};
-
 #ifndef DOXYGEN_SKIP_THIS
-inline qint8x8_t vget_low_qs8(qint8x16_t a)
-{
-    return vget_low_s8(a);
-}
-
-inline qint16x4_t vget_low_qs16(qint16x8_t a)
-{
-    return vget_low_s16(a);
-}
-
-inline qint8x8_t vget_high_qs8(qint8x16_t a)
-{
-    return vget_high_s8(a);
-}
-
-inline qint16x4_t vget_high_qs16(qint16x8_t a)
-{
-    return vget_high_s16(a);
-}
-
-inline qint8x8_t vld1_qs8(const qint8_t *addr)
-{
-    return vld1_s8(addr);
-}
-
-inline qint16x4_t vld1_qs16(const qint16_t *addr)
-{
-    return vld1_s16(addr);
-}
-
-inline qint8x16_t vld1q_qs8(const qint8_t *addr)
-{
-    return vld1q_s8(addr);
-}
-
-inline qint16x8_t vld1q_qs16(const qint16_t *addr)
-{
-    return vld1q_s16(addr);
-}
-
-inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
-{
-    return vld1_dup_s8(addr);
-}
-
-inline qint16x4_t vld1_dup_qs16(const qint16_t *addr)
-{
-    return vld1_dup_s16(addr);
-}
-
-inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
-{
-    return vld1q_dup_s8(addr);
-}
-
-inline qint16x8_t vld1q_dup_qs16(const qint16_t *addr)
-{
-    return vld1q_dup_s16(addr);
-}
-
-inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
-{
-    return vld2q_s16(addr);
-}
-
-inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
-{
-    vst1_s8(addr, b);
-}
-
-inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
-{
-    vst1_s16(addr, b);
-}
-
-inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
-{
-    vst1q_s8(addr, b);
-}
-
-inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
-{
-    vst1q_s16(addr, b);
-}
-
-inline void vst2q_qs16(qint16_t *addr, qint16x8x2_t b)
-{
-    vst2q_s16(addr, b);
-}
-
-inline qint8x8_t vqmovn_qs16(qint16x8_t a)
-{
-    return vqmovn_s16(a);
-}
-
-inline qint16x4_t vqmovn_qs32(qint32x4_t a)
-{
-    return vqmovn_s32(a);
-}
-
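Every wrapper above is a one-to-one renaming of the corresponding s8/s16 intrinsic, so kernels written against them behave exactly like plain NEON code. A hypothetical usage sketch (my example, assuming a NEON target and a buffer length that is a multiple of 8):

    #include <arm_neon.h>
    #include <cstdint>

    // Saturating element-wise add of two Q3.4 buffers. Fixed-point addition
    // needs no rescaling, only saturation on overflow.
    void add_q3_4(const int8_t *a, const int8_t *b, int8_t *out, int n)
    {
        for (int i = 0; i < n; i += 8)
        {
            const int8x8_t va = vld1_s8(a + i); // i.e. vld1_qs8
            const int8x8_t vb = vld1_s8(b + i);
            vst1_s8(out + i, vqadd_s8(va, vb)); // i.e. vst1_qs8 / vqadd_qs8
        }
    }
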
-inline qint8x8_t vdup_n_qs8(qint8_t a)
-{
-    return vdup_n_s8(a);
-}
-
-inline qint16x4_t vdup_n_qs16(qint16_t a)
-{
-    return vdup_n_s16(a);
-}
-
-inline qint8x16_t vdupq_n_qs8(qint8_t a)
-{
-    return vdupq_n_s8(a);
-}
-
-inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position)
-{
-    float32x4x4_t res =
-    {
-        {
-            vdupq_n_f32(a),
-            vdupq_n_f32(a),
-            vdupq_n_f32(a),
-            vdupq_n_f32(a),
-        }
-    };
-    return vqcvtq_qs8_f32(res, fixed_point_position);
-}
-
-inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
-{
-    float32x4x2_t res =
-    {
-        {
-            vdupq_n_f32(a),
-            vdupq_n_f32(a),
-        }
-    };
-    return vqcvtq_qs16_f32(res, fixed_point_position);
-}
-
-inline qint16x8_t vdupq_n_qs16(qint16_t a)
-{
-    return vdupq_n_s16(a);
-}
-
-inline qint32x4_t vdupq_n_qs32(qint32_t a)
-{
-    return vdupq_n_s32(a);
-}
-
-inline qint8x8_t vabs_qs8(qint8x8_t a)
-{
-    return vabs_s8(a);
-}
-
-inline qint16x4_t vabs_qs16(qint16x4_t a)
-{
-    return vabs_s16(a);
-}
-
-inline qint8x16_t vabsq_qs8(qint8x16_t a)
-{
-    return vabsq_s8(a);
-}
-
-inline qint16x8_t vabsq_qs16(qint16x8_t a)
-{
-    return vabsq_s16(a);
-}
-
-inline qint8x8_t vqabs_qs8(qint8x8_t a)
-{
-    return vqabs_s8(a);
-}
-
-inline qint16x4_t vqabs_qs16(qint16x4_t a)
-{
-    return vqabs_s16(a);
-}
-
-inline qint8x16_t vqabsq_qs8(qint8x16_t a)
-{
-    return vqabsq_s8(a);
-}
-
-inline qint16x8_t vqabsq_qs16(qint16x8_t a)
-{
-    return vqabsq_s16(a);
-}
-
-inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b)
-{
-    return vmax_s8(a, b);
-}
-
-inline qint16x4_t vmax_qs16(qint16x4_t a, qint16x4_t b)
-{
-    return vmax_s16(a, b);
-}
-
-inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b)
-{
-    return vmaxq_s8(a, b);
-}
-
-inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b)
-{
-    return vpmax_s8(a, b);
-}
-
-inline qint16x4_t vpmax_qs16(qint16x4_t a, qint16x4_t b)
-{
-    return vpmax_s16(a, b);
-}
-
-inline qint16x8_t vmaxq_qs16(qint16x8_t a, qint16x8_t b)
-{
-    return vmaxq_s16(a, b);
-}
-
-inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b)
-{
-    return vmin_s8(a, b);
-}
-
-inline qint16x4_t vmin_qs16(qint16x4_t a, qint16x4_t b)
-{
-    return vmin_s16(a, b);
-}
-
-inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b)
-{
-    return vminq_s8(a, b);
-}
-
-inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b)
-{
-    return vpmin_s8(a, b);
-}
-
-inline qint16x4_t vpmin_qs16(qint16x4_t a, qint16x4_t b)
-{
-    return vpmin_s16(a, b);
-}
-
-inline qint16x8_t vminq_qs16(qint16x8_t a, qint16x8_t b)
-{
-    return vminq_s16(a, b);
-}
-
-inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b)
-{
-    return vadd_s8(a, b);
-}
-
-inline qint16x4_t vadd_qs16(qint16x4_t a, qint16x4_t b)
-{
-    return vadd_s16(a, b);
-}
-
-inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b)
-{
-    return vaddq_s8(a, b);
-}
-
-inline qint16x8_t vaddq_qs16(qint16x8_t a, qint16x8_t b)
-{
-    return vaddq_s16(a, b);
-}
-
-inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b)
-{
-    return vqadd_s8(a, b);
-}
-
-inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b)
-{
-    return vqadd_s16(a, b);
-}
-
-inline qint32x2_t vqadd_qs32(qint32x2_t a, qint32x2_t b)
-{
-    return vqadd_s32(a, b);
-}
-
-inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b)
-{
-    return vqaddq_s8(a, b);
-}
-
-inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b)
-{
-    return vqaddq_s16(a, b);
-}
-
-inline qint32x4_t vqaddq_qs32(qint32x4_t a, qint32x4_t b)
-{
-    return vqaddq_s32(a, b);
-}
-
-inline int16x4_t vpaddl_qs8(qint8x8_t a)
-{
-    return vpaddl_s8(a);
-}
-
-inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b)
-{
-    return vsub_s8(a, b);
-}
-
-inline qint16x4_t vsub_qs16(qint16x4_t a, qint16x4_t b)
-{
-    return vsub_s16(a, b);
-}
-
-inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b)
-{
-    return vsubq_s8(a, b);
-}
-
-inline qint16x8_t vsubq_qs16(qint16x8_t a, qint16x8_t b)
-{
-    return vsubq_s16(a, b);
-}
-
-inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b)
-{
-    return vqsub_s8(a, b);
-}
-
-inline qint16x4_t vqsub_qs16(qint16x4_t a, qint16x4_t b)
-{
-    return vqsub_s16(a, b);
-}
-
-inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b)
-{
-    return vqsubq_s8(a, b);
-}
-
-inline qint16x8_t vqsubq_qs16(qint16x8_t a, qint16x8_t b)
-{
-    return vqsubq_s16(a, b);
-}
-
-inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    // Initialize the temporary result with a constant used to round up the result
-    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    res = vmlal_s8(res, a, b);
-
-    // Shift right by fixed_point_position
-    res = vshlq_s16(res, fixed_point_position_s16);
-
-    // Convert back to qint8
-    return vmovn_s16(res);
-}
-
-inline qint16x4_t vmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary result with a constant used to round up the result
-    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    res = vmlal_s16(res, a, b);
-
-    // Shift right by fixed_point_position
-    res = vshlq_s32(res, fixed_point_position_s32);
-
-    // Convert back to qint16
-    return vmovn_s32(res);
-}
-
-inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
-    qint16x8_t res1 = res0;
-
-    // Vector multiply-accumulate long
-    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
-    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
-
-    // Shift right by fixed_point_position
-    res0 = vshlq_s16(res0, fixed_point_position_s16);
-    res1 = vshlq_s16(res1, fixed_point_position_s16);
-
-    // Convert back to qint8
-    return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1));
-}
-
-inline qint16x8_t vmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
-    qint32x4_t res1 = res0;
-
-    // Vector multiply-accumulate long
-    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
-    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
-
-    // Shift right by fixed_point_position
-    res0 = vshlq_s32(res0, fixed_point_position_s32);
-    res1 = vshlq_s32(res1, fixed_point_position_s32);
-
-    // Convert back to qint16
-    return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1));
-}
-
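The whole multiply family follows one pattern: widen, accumulate the product onto a rounding constant of half a ULP, arithmetic-shift right by the fixed-point position, and narrow. A scalar model of vmul_qs8 (illustration only):

    #include <cstdint>

    // (a * b + 2^(fpp-1)) >> fpp, narrowed back to 8 bits. The narrow wraps
    // rather than saturates, matching vmovn_s16 in the vector version.
    int8_t mul_qs8_scalar(int8_t a, int8_t b, int fpp)
    {
        int32_t res = 1 << (fpp - 1);    // rounding constant, 0.5 ULP
        res += a * b;                    // exact widened product
        res >>= fpp;                     // arithmetic shift back to the input Q format
        return static_cast<int8_t>(res); // wrap to 8 bits, like vmovn
    }
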
-inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    // Initialize the temporary result with a constant used to round up the result
-    qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    res = vmlal_s8(res, a, b);
-
-    // Shift right by fixed_point_position
-    res = vqshlq_s16(res, fixed_point_position_s16);
-
-    // Convert back to qint8 and saturate
-    return vqmovn_s16(res);
-}
-
-inline qint16x4_t vqmul_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary result with a constant used to round up the result
-    qint32x4_t res = vdupq_n_s32(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    res = vmlal_s16(res, a, b);
-
-    // Shift right by fixed_point_position
-    res = vqshlq_s32(res, fixed_point_position_s32);
-
-    // Convert back to qint16 and saturate
-    return vqmovn_s32(res);
-}
-
-inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1));
-    qint16x8_t res1 = res0;
-
-    // Vector multiply-accumulate long
-    res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b));
-    res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b));
-
-    // Shift right by fixed_point_position
-    res0 = vqshlq_s16(res0, fixed_point_position_s16);
-    res1 = vqshlq_s16(res1, fixed_point_position_s16);
-
-    // Convert back to qint8 and saturate
-    return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1));
-}
-
-inline qint16x8_t vqmulq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint32x4_t res0 = vdupq_n_s32(1 << (fixed_point_position - 1));
-    qint32x4_t res1 = res0;
-
-    // Vector multiply-accumulate long
-    res0 = vmlal_s16(res0, vget_low_qs16(a), vget_low_qs16(b));
-    res1 = vmlal_s16(res1, vget_high_qs16(a), vget_high_qs16(b));
-
-    // Shift right by fixed_point_position
-    res0 = vqshlq_s32(res0, fixed_point_position_s32);
-    res1 = vqshlq_s32(res1, fixed_point_position_s32);
-
-    // Convert back to qint16 and saturate
-    return vcombine_s16(vqmovn_s32(res0), vqmovn_s32(res1));
-}
-
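The vq* variants differ from the plain ones only in saturating instead of wrapping: vqshlq for the shift and vqmovn for the narrow. In scalar terms (sketch):

    #include <algorithm>
    #include <cstdint>

    // Scalar model of vqmul_qs8: the same rounding multiply, clamped to the
    // int8 range on the way back down.
    int8_t qmul_qs8_scalar(int8_t a, int8_t b, int fpp)
    {
        int32_t res = (1 << (fpp - 1)) + a * b;
        res >>= fpp;
        return static_cast<int8_t>(std::clamp<int32_t>(res, INT8_MIN, INT8_MAX));
    }
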
-inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    qint16x8_t res = vmull_s8(a, b);
-
-    return vqrshlq_s16(res, fixed_point_position_s16);
-}
-
-inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary result with a constant used to round up the result
-    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long (accumulate onto the rounding constant)
-    tmp = vmlal_s16(tmp, a, b);
-
-    // Shift right by fixed_point_position
-    return vqshlq_s32(tmp, fixed_point_position_s32);
-}
-
-inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    tmp = vmlal_s8(tmp, b, c);
-
-    // Shift right by fixed_point_position
-    tmp = vshlq_s16(tmp, fixed_point_position_s16);
-
-    // Convert back to qint8 and accumulate
-    return vadd_s8(a, vmovn_s16(tmp));
-}
-
-inline qint16x4_t vmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    tmp = vmlal_s16(tmp, b, c);
-
-    // Shift right by fixed_point_position
-    tmp = vshlq_s32(tmp, fixed_point_position_s32);
-
-    // Convert back to qint16 and accumulate
-    return vadd_s16(a, vmovn_s32(tmp));
-}
-
-inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
-    qint16x8_t tmp1 = tmp0;
-
-    // Vector multiply-accumulate long
-    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
-    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
-
-    // Shift right by fixed_point_position
-    tmp0 = vshlq_s16(tmp0, fixed_point_position_s16);
-    tmp1 = vshlq_s16(tmp1, fixed_point_position_s16);
-
-    // Convert back to qint8 and accumulate
-    return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1)));
-}
-
-inline qint16x8_t vmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
-    qint32x4_t tmp1 = tmp0;
-
-    // Vector multiply-accumulate long
-    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
-    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
-
-    // Shift right by fixed_point_position
-    tmp0 = vshlq_s32(tmp0, fixed_point_position_s32);
-    tmp1 = vshlq_s32(tmp1, fixed_point_position_s32);
-
-    // Convert back to qint16 and accumulate
-    return vcombine_s16(vadd_s16(vget_low_qs16(a), vmovn_s32(tmp0)), vadd_s16(vget_high_qs16(a), vmovn_s32(tmp1)));
-}
-
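The multiply-accumulate family rescales the product before the accumulation, so only b*c is rounded; the final add is a plain fixed-point add. A scalar model of vmla_qs8 (illustration only):

    #include <cstdint>

    // a + ((b * c + 2^(fpp-1)) >> fpp), with wrapping adds and narrows as in
    // the non-saturating NEON path.
    int8_t mla_qs8_scalar(int8_t a, int8_t b, int8_t c, int fpp)
    {
        int32_t tmp = (1 << (fpp - 1)) + b * c;
        tmp >>= fpp;
        return static_cast<int8_t>(a + tmp);
    }
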
-inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    tmp = vmlal_s8(tmp, b, c);
-
-    // Shift right by fixed_point_position
-    tmp = vqshlq_s16(tmp, fixed_point_position_s16);
-
-    // Convert back to qint8 and accumulate
-    return vqadd_s8(a, vqmovn_s16(tmp));
-}
-
-inline qint16x4_t vqmla_qs16(qint16x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    tmp = vmlal_s16(tmp, b, c);
-
-    // Shift right by fixed_point_position
-    tmp = vqshlq_s32(tmp, fixed_point_position_s32);
-
-    // Convert back to qint16 and accumulate
-    return vqadd_s16(a, vqmovn_s32(tmp));
-}
-
-inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
-    qint16x8_t tmp1 = tmp0;
-
-    // Vector multiply-accumulate long
-    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
-    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
-
-    // Shift right by fixed_point_position
-    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
-    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
-
-    // Convert back to qint8 and accumulate
-    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
-    return vqaddq_s8(a, res);
-}
-
-inline qint16x8_t vqmlaq_qs16(qint16x8_t a, qint16x8_t b, qint16x8_t c, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint32x4_t tmp0 = vdupq_n_s32(1 << (fixed_point_position - 1));
-    qint32x4_t tmp1 = tmp0;
-
-    // Vector multiply-accumulate long
-    tmp0 = vmlal_s16(tmp0, vget_low_qs16(b), vget_low_qs16(c));
-    tmp1 = vmlal_s16(tmp1, vget_high_qs16(b), vget_high_qs16(c));
-
-    // Shift right by fixed_point_position
-    tmp0 = vqshlq_s32(tmp0, fixed_point_position_s32);
-    tmp1 = vqshlq_s32(tmp1, fixed_point_position_s32);
-
-    // Convert back to qint16 and accumulate
-    qint16x8_t res = vcombine_s16(vqmovn_s32(tmp0), vqmovn_s32(tmp1));
-    return vqaddq_s16(a, res);
-}
-
-inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    tmp = vmlal_s8(tmp, b, c);
-
-    // Shift right by fixed_point_position
-    tmp = vshlq_s16(tmp, fixed_point_position_s16);
-
-    // Accumulate
-    return vaddq_s16(a, tmp);
-}
-
-inline qint32x4_t vmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    tmp = vmlal_s16(tmp, b, c);
-
-    // Shift right by fixed_point_position
-    tmp = vshlq_s32(tmp, fixed_point_position_s32);
-
-    // Accumulate
-    return vaddq_s32(a, tmp);
-}
-
-inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
-{
-    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    tmp = vmlal_s8(tmp, b, c);
-
-    // Shift right by fixed_point_position
-    tmp = vqshlq_s16(tmp, fixed_point_position_s16);
-
-    // Accumulate
-    return vqaddq_s16(a, tmp);
-}
-
-inline qint32x4_t vqmlal_qs16(qint32x4_t a, qint16x4_t b, qint16x4_t c, int fixed_point_position)
-{
-    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
-
-    // Initialize the temporary results with a constant used to round up the result
-    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
-
-    // Vector multiply-accumulate long
-    tmp = vmlal_s16(tmp, b, c);
-
-    // Shift right by fixed_point_position
-    tmp = vqshlq_s32(tmp, fixed_point_position_s32);
-
-    // Accumulate
-    return vqaddq_s32(a, tmp);
-}
-
-inline qint8x8_t vqcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
-{
-    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
-
-    float32x4x2_t res_f32 =
-    {
-        {
-            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
-            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
-        }
-    };
-
-    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
-    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
-
-    const int32x4x2_t res_s32 =
-    {
-        {
-            vcvtq_s32_f32(res_f32.val[0]),
-            vcvtq_s32_f32(res_f32.val[1]),
-        }
-    };
-
-    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
-
-    return vqmovn_s16(res_s16);
-}
-
-inline qint16x4_t vqcvt_qs16_f32(const float32x4_t a, int fixed_point_position)
-{
-    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
-
-    float32x4_t res_f32 = vbslq_f32(vcgeq_f32(a, vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f));
-
-    res_f32 = vmlaq_f32(res_f32, a, pow2);
-
-    const int32x4_t res_s32 = vcvtq_s32_f32(res_f32);
-
-    return vqmovn_s32(res_s32);
-}
-
-inline qint8x16_t vqcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
-{
-    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
-
-    float32x4x4_t res_f32 =
-    {
-        {
-            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
-            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
-            vbslq_f32(vcgeq_f32(a.val[2], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
-            vbslq_f32(vcgeq_f32(a.val[3], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
-        }
-    };
-
-    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
-    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
-    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
-    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
-
-    const int32x4x4_t res_s32 =
-    {
-        {
-            vcvtq_s32_f32(res_f32.val[0]),
-            vcvtq_s32_f32(res_f32.val[1]),
-            vcvtq_s32_f32(res_f32.val[2]),
-            vcvtq_s32_f32(res_f32.val[3]),
-        }
-    };
-
-    const int16x8x2_t res_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
-            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
-        }
-    };
-
-    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
-}
-
-inline qint16x8_t vqcvtq_qs16_f32(const float32x4x2_t &a, int fixed_point_position)
-{
-    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
-
-    float32x4x2_t res_f32 =
-    {
-        {
-            vbslq_f32(vcgeq_f32(a.val[0], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f)),
-            vbslq_f32(vcgeq_f32(a.val[1], vdupq_n_f32(0)), vdupq_n_f32(0.5f), vdupq_n_f32(-0.5f))
-        }
-    };
-
-    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
-    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
-
-    const int32x4x2_t res_s32 =
-    {
-        {
-            vcvtq_s32_f32(res_f32.val[0]),
-            vcvtq_s32_f32(res_f32.val[1])
-        }
-    };
-
-    return vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
-}
-
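All four float-to-fixed conversions implement round-half-away-from-zero by adding a bias of +/-0.5 before the truncating float-to-int conversion. A scalar equivalent of vqcvt_qs16_f32 (sketch):

    #include <algorithm>
    #include <cstdint>

    // Scale by 2^fpp, bias so that truncation rounds half away from zero,
    // then saturate to the int16 range.
    int16_t float_to_qs16(float a, int fpp)
    {
        float scaled = a * static_cast<float>(1 << fpp);
        scaled += (a >= 0.0f) ? 0.5f : -0.5f;
        const int32_t i = static_cast<int32_t>(scaled); // truncates towards zero
        return static_cast<int16_t>(std::clamp<int32_t>(i, INT16_MIN, INT16_MAX));
    }
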
-inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
-{
-    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
-
-    const int16x8_t res_s16 = vmovl_s8(a);
-
-    const int32x4x2_t res_s32 =
-    {
-        {
-            vmovl_s16(vget_low_qs16(res_s16)),
-            vmovl_s16(vget_high_qs16(res_s16))
-        }
-    };
-
-    float32x4x2_t res_f32 =
-    {
-        {
-            vcvtq_f32_s32(res_s32.val[0]),
-            vcvtq_f32_s32(res_s32.val[1])
-        }
-    };
-
-    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
-    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
-
-    return res_f32;
-}
-
-inline float32x4_t vcvt_f32_qs16(qint16x4_t a, int fixed_point_position)
-{
-    const float32x4_t pow2    = vdupq_n_f32(1.0f / (1 << fixed_point_position));
-    const float32x4_t res_f32 = vcvtq_f32_s32(vmovl_s16(a));
-
-    return vmulq_f32(res_f32, pow2);
-}
-
-inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
-{
-    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
-
-    const int16x8x2_t res_s16 =
-    {
-        {
-            vmovl_s8(vget_low_s8(a)),
-            vmovl_s8(vget_high_s8(a)),
-        }
-    };
-
-    const int32x4x4_t res_s32 =
-    {
-        {
-            vmovl_s16(vget_low_qs16(res_s16.val[0])),
-            vmovl_s16(vget_high_qs16(res_s16.val[0])),
-            vmovl_s16(vget_low_qs16(res_s16.val[1])),
-            vmovl_s16(vget_high_qs16(res_s16.val[1])),
-        }
-    };
-
-    float32x4x4_t res_f32 =
-    {
-        {
-            vcvtq_f32_s32(res_s32.val[0]),
-            vcvtq_f32_s32(res_s32.val[1]),
-            vcvtq_f32_s32(res_s32.val[2]),
-            vcvtq_f32_s32(res_s32.val[3])
-        }
-    };
-
-    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
-    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
-    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
-    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
-
-    return res_f32;
-}
-
-inline float32x4x2_t vcvtq_f32_qs16(qint16x8_t a, int fixed_point_position)
-{
-    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
-
-    const int32x4x2_t res_s32 =
-    {
-        {
-            vmovl_s16(vget_low_qs16(a)),
-            vmovl_s16(vget_high_qs16(a))
-        }
-    };
-
-    float32x4x2_t res_f32 =
-    {
-        {
-            vcvtq_f32_s32(res_s32.val[0]),
-            vcvtq_f32_s32(res_s32.val[1])
-        }
-    };
-
-    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
-    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
-
-    return res_f32;
-}
-
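The fixed-to-float direction needs no rounding at all, since every 8/16-bit fixed-point value is exactly representable in float; it is a plain multiplication by 2^(-fpp). Scalar model of vcvt_f32_qs16 (sketch):

    #include <cstdint>

    float qs16_to_float(int16_t a, int fpp)
    {
        return static_cast<float>(a) * (1.0f / (1 << fpp));
    }
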
-inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
-{
-    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
-    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
-    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
-    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
-    const qint8x8_t const_two        = vdup_n_s8(2 << fixed_point_position);
-
-    // Find shift value
-    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
-    const qint8x8_t temp        = vshl_s8(a, shift_value);
-
-    // Newton-Raphson division initial estimate X0 calculation
-    qint8x8_t x = vsub_s8(const_48_over_17, vmul_qs8(temp, const_32_over_17, fixed_point_position));
-
-    uint8x8_t set_one = vcgt_s8(x, const_one);
-    x                 = vbsl_s8(set_one, const_one, x);
-
-    // Use three iterations of the Newton-Raphson method to get the result
-    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
-    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
-    x = vmul_qs8(x, vsub_s8(const_two, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
-
-    return vshl_s8(x, shift_value);
-}
-
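The constants 48/17 and 32/17 come from the standard Newton-Raphson division recipe: for d normalised into [0.5, 1), x0 = 48/17 - (32/17)*d minimises the worst-case relative error of the initial estimate, and each iteration x <- x*(2 - d*x) roughly doubles the number of correct bits, which is why the 8-bit kernels run three iterations and the 16-bit ones four. A floating-point sketch of the scheme (illustration, not the shipped fixed-point code):

    // Newton-Raphson reciprocal of d, with d pre-normalised into [0.5, 1).
    float nr_reciprocal(float d, int iterations)
    {
        float x = 48.0f / 17.0f - (32.0f / 17.0f) * d; // initial estimate
        for (int i = 0; i < iterations; ++i)
            x = x * (2.0f - d * x); // doubles the number of correct bits
        return x;
    }
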
-inline qint16x4_t vrecip_qs16(qint16x4_t a, int fixed_point_position)
-{
-    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
-    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
-    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
-    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);
-    const qint16x4_t const_two        = vdup_n_s16(2 << fixed_point_position);
-
-    // Find shift value
-    const qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
-    const qint16x4_t temp        = vshl_s16(a, shift_value);
-
-    // Newton-Raphson division initial estimate X0 calculation
-    qint16x4_t x = vsub_s16(const_48_over_17, vmul_qs16(temp, const_32_over_17, fixed_point_position));
-
-    uint16x4_t set_one = vcgt_s16(x, const_one);
-    x                  = vbsl_s16(set_one, const_one, x);
-
-    // Use four iterations of the Newton-Raphson method to get the result
-    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vmul_qs16(x, vsub_s16(const_two, vmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
-
-    return vshl_s16(x, shift_value);
-}
-
-inline qint8x8_t vqrecip_qs8(qint8x8_t a, int fixed_point_position)
-{
-    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
-    const qint8x8_t const_48_over_17 = vdup_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
-    const qint8x8_t const_32_over_17 = vdup_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
-    const qint8x8_t const_one        = vdup_n_s8(1 << fixed_point_position);
-    const qint8x8_t const_two        = vdup_n_s8(2 << fixed_point_position);
-
-    // Find shift value
-    const qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
-    const qint8x8_t temp        = vqshl_s8(a, shift_value);
-
-    // Newton-Raphson division initial estimate X0 calculation
-    qint8x8_t x = vqsub_s8(const_48_over_17, vqmul_qs8(temp, const_32_over_17, fixed_point_position));
-
-    uint8x8_t set_one = vcgt_s8(x, const_one);
-    x                 = vbsl_s8(set_one, const_one, x);
-
-    // Use three iterations of the Newton-Raphson method to get the result
-    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
-    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
-    x = vqmul_qs8(x, vqsub_s8(const_two, vqmul_qs8(temp, x, fixed_point_position)), fixed_point_position);
-
-    return vqshl_s8(x, shift_value);
-}
-
-inline qint16x4_t vqrecip_qs16(qint16x4_t a, int fixed_point_position)
-{
-    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
-    const qint16x4_t const_48_over_17 = vdup_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
-    const qint16x4_t const_32_over_17 = vdup_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
-    const qint16x4_t const_one        = vdup_n_s16(1 << fixed_point_position);
-    const qint16x4_t const_two        = vdup_n_s16(2 << fixed_point_position);
-
-    // Find shift value
-    const qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
-    const qint16x4_t temp        = vqshl_s16(a, shift_value);
-
-    // Newton-Raphson division initial estimate X0 calculation
-    qint16x4_t x = vqsub_s16(const_48_over_17, vqmul_qs16(temp, const_32_over_17, fixed_point_position));
-
-    uint16x4_t set_one = vcgt_s16(x, const_one);
-    x                  = vbsl_s16(set_one, const_one, x);
-
-    // Use four iterations of the Newton-Raphson method to get the result
-    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vqmul_qs16(x, vqsub_s16(const_two, vqmul_qs16(temp, x, fixed_point_position)), fixed_point_position);
-
-    return vqshl_s16(x, shift_value);
-}
-
-inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
-{
-    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
-    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
-    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
-    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
-    const qint8x16_t const_two        = vdupq_n_s8(2 << fixed_point_position);
-
-    // Find shift value
-    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
-    const qint8x16_t temp        = vshlq_s8(a, shift_value);
-
-    // Newton-Raphson division initial estimate X0 calculation
-    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));
-
-    // Set initial guess to one if x > 1
-    uint8x16_t set_one = vcgtq_s8(x, const_one);
-    x                  = vbslq_s8(set_one, const_one, x);
-
-    // Use three iterations of the Newton-Raphson method to get the result
-    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
-    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
-    x = vmulq_qs8(x, vsubq_s8(const_two, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
-
-    return vshlq_s8(x, shift_value);
-}
-
-inline qint16x8_t vrecipq_qs16(qint16x8_t a, int fixed_point_position)
-{
-    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
-    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
-    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
-    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);
-    const qint16x8_t const_two        = vdupq_n_s16(2 << fixed_point_position);
-
-    // Find shift value
-    const qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
-    const qint16x8_t temp        = vshlq_s16(a, shift_value);
-
-    // Newton-Raphson division initial estimate X0 calculation
-    qint16x8_t x = vsubq_qs16(const_48_over_17, vmulq_qs16(temp, const_32_over_17, fixed_point_position));
-
-    // Set initial guess to one if x > 1
-    uint16x8_t set_one = vcgtq_s16(x, const_one);
-    x                  = vbslq_s16(set_one, const_one, x);
-
-    // Use four iterations of the Newton-Raphson method to get the result
-    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vmulq_qs16(x, vsubq_s16(const_two, vmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
-
-    return vshlq_s16(x, shift_value);
-}
-
-inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
-{
-    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
-    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x5A >> (5 - fixed_point_position));   // 2.823
-    const qint8x16_t const_32_over_17 = vdupq_n_s8((0x3C >> (5 - fixed_point_position))); // 1.8823
-    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
-    const qint8x16_t const_two        = vdupq_n_s8(2 << fixed_point_position);
-
-    // Find shift value
-    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
-    const qint8x16_t temp        = vqshlq_s8(a, shift_value);
-
-    // Newton-Raphson division initial estimate X0 calculation
-    qint8x16_t x = vqsubq_qs8(const_48_over_17, vqmulq_qs8(temp, const_32_over_17, fixed_point_position));
-
-    // Set initial guess to one if x > 1
-    uint8x16_t set_one = vcgtq_s8(x, const_one);
-    x                  = vbslq_s8(set_one, const_one, x);
-
-    // Use three iterations of the Newton-Raphson method to get the result
-    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
-    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
-    x = vqmulq_qs8(x, vqsubq_s8(const_two, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position);
-
-    return vqshlq_s8(x, shift_value);
-}
-
-inline qint16x8_t vqrecipq_qs16(qint16x8_t a, int fixed_point_position)
-{
-    // We need two bits to store 2, thus we can only support formats from Q2.13 to Q15.0
-    const qint16x8_t const_48_over_17 = vdupq_n_s16(0x5A5A >> (13 - fixed_point_position)); // 2.823
-    const qint16x8_t const_32_over_17 = vdupq_n_s16(0x3C3C >> (13 - fixed_point_position)); // 1.8823
-    const qint16x8_t const_one        = vdupq_n_s16(1 << fixed_point_position);
-    const qint16x8_t const_two        = vdupq_n_s16(2 << fixed_point_position);
-
-    // Find shift value
-    const qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
-    const qint16x8_t temp        = vqshlq_s16(a, shift_value);
-
-    // Newton-Raphson division initial estimate X0 calculation
-    qint16x8_t x = vqsubq_qs16(const_48_over_17, vqmulq_qs16(temp, const_32_over_17, fixed_point_position));
-
-    // Set initial guess to one if x > 1
-    uint16x8_t set_one = vcgtq_s16(x, const_one);
-    x                  = vbslq_s16(set_one, const_one, x);
-
-    // Use four iterations of the Newton-Raphson method to get the result
-    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
-    x = vqmulq_qs16(x, vqsubq_s16(const_two, vqmulq_qs16(temp, x, fixed_point_position)), fixed_point_position);
-
-    // Saturate result in case of overflow
-    return vbslq_s16(vceqq_s16(a, vdupq_n_s16(0)), vdupq_n_s16(std::numeric_limits<int16_t>::max()), vqshlq_s16(x, shift_value));
-}
-
-inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
-{
-    return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
-}
-
-inline qint16x4_t vdiv_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
-{
-    return vmul_qs16(a, vrecip_qs16(b, fixed_point_position), fixed_point_position);
-}
-
-inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
-{
-    return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
-}
-
-inline qint16x8_t vdivq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
-{
-    return vmulq_qs16(a, vrecipq_qs16(b, fixed_point_position), fixed_point_position);
-}
-
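Division is thus reduced to a multiply: a / b ~= a * recip(b), so its accuracy is bounded by the reciprocal approximation. A floating-point sketch of the same composition (assuming b normalised into [0.5, 1), as in the reciprocal sketch above):

    float div_via_recip(float a, float b)
    {
        float x = 48.0f / 17.0f - (32.0f / 17.0f) * b; // recip(b) by Newton-Raphson
        for (int i = 0; i < 3; ++i)
            x = x * (2.0f - b * x);
        return a * x; // a / b ~= a * recip(b)
    }
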
-template <bool   islog>
-inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
-{
-    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
-    const qint8x8_t const_one   = vdup_n_s8(1);
-    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
-    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
-    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
-    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
-    const qint8x8_t x1          = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
-    const qint8x8_t x2          = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
-    const qint8x8_t x3          = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
-    const qint8x8_t res         = vmul_qs8(a, x3, fixed_point_position);
-    return res;
-}
-
-template <bool    islog>
-inline qint16x4_t vtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
-{
-    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
-    const qint16x4_t const_one   = vdup_n_s16(1);
-    const qint16x4_t A           = vrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vadd_s16(shift_value, const_one) : shift_value);
-    const qint16x4_t B           = vrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
-    const qint16x4_t C           = vrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
-    const qint16x4_t D           = vrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
-    const qint16x4_t x1          = vadd_s16(vmul_qs16(a, D, fixed_point_position), C);
-    const qint16x4_t x2          = vadd_s16(vmul_qs16(a, x1, fixed_point_position), B);
-    const qint16x4_t x3          = vadd_s16(vmul_qs16(a, x2, fixed_point_position), A);
-    const qint16x4_t res         = vmul_qs16(a, x3, fixed_point_position);
-    return res;
-}
-
-template <bool   islog>
-inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
-{
-    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
-    const qint8x8_t const_one   = vdup_n_s8(1);
-    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
-    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
-    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
-    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
-    const qint8x8_t x1          = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
-    const qint8x8_t x2          = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
-    const qint8x8_t x3          = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
-    const qint8x8_t res         = vqmul_qs8(a, x3, fixed_point_position);
-    return res;
-}
-
-template <bool    islog>
-inline qint16x4_t vqtaylor_poly_qs16(qint16x4_t a, int fixed_point_position)
-{
-    const qint16x4_t shift_value = vdup_n_s16(-(15 - fixed_point_position));
-    const qint16x4_t const_one   = vdup_n_s16(1);
-    const qint16x4_t A           = vqrshl_s16(islog ? log_tab_qs16[0] : exp_tab_qs16[0], islog ? vqadd_s16(shift_value, const_one) : shift_value);
-    const qint16x4_t B           = vqrshl_s16(islog ? log_tab_qs16[1] : exp_tab_qs16[1], shift_value);
-    const qint16x4_t C           = vqrshl_s16(islog ? log_tab_qs16[2] : exp_tab_qs16[2], shift_value);
-    const qint16x4_t D           = vqrshl_s16(islog ? log_tab_qs16[3] : exp_tab_qs16[3], shift_value);
-    const qint16x4_t x1          = vqadd_s16(vqmul_qs16(a, D, fixed_point_position), C);
-    const qint16x4_t x2          = vqadd_s16(vqmul_qs16(a, x1, fixed_point_position), B);
-    const qint16x4_t x3          = vqadd_s16(vqmul_qs16(a, x2, fixed_point_position), A);
-    const qint16x4_t res         = vqmul_qs16(a, x3, fixed_point_position);
-    return res;
-}
-
-template <bool    islog>
-inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
-{
-    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
-    const qint8x16_t const_one   = vdupq_n_s8(1);
-    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
-    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
-    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
-    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
-    const qint8x16_t x1          = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
-    const qint8x16_t x2          = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
-    const qint8x16_t x3          = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
-    const qint8x16_t res         = vmulq_qs8(a, x3, fixed_point_position);
-    return res;
-}
-
-template <bool    islog>
-inline qint16x8_t vtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
-{
-    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
-    const qint16x8_t const_one   = vdupq_n_s16(1);
-    const qint16x8_t A           = vrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vaddq_s16(shift_value, const_one) : shift_value);
-    const qint16x8_t B           = vrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
-    const qint16x8_t C           = vrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
-    const qint16x8_t D           = vrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
-    const qint16x8_t x1          = vaddq_s16(vmulq_qs16(a, D, fixed_point_position), C);
-    const qint16x8_t x2          = vaddq_s16(vmulq_qs16(a, x1, fixed_point_position), B);
-    const qint16x8_t x3          = vaddq_s16(vmulq_qs16(a, x2, fixed_point_position), A);
-    const qint16x8_t res         = vmulq_qs16(a, x3, fixed_point_position);
-    return res;
-}
-
-template <bool    islog>
-inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
-{
-    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
-    const qint8x16_t const_one   = vdupq_n_s8(1);
-    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
-    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
-    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
-    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
-    const qint8x16_t x1          = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
-    const qint8x16_t x2          = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
-    const qint8x16_t x3          = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
-    const qint8x16_t res         = vqmulq_qs8(a, x3, fixed_point_position);
-    return res;
-}
-
-template <bool    islog>
-inline qint16x8_t vqtaylor_polyq_qs16(qint16x8_t a, int fixed_point_position)
-{
-    const qint16x8_t shift_value = vdupq_n_s16(-(15 - fixed_point_position));
-    const qint16x8_t const_one   = vdupq_n_s16(1);
-    const qint16x8_t A           = vqrshlq_s16(islog ? log_tabq_qs16[0] : exp_tabq_qs16[0], islog ? vqaddq_s16(shift_value, const_one) : shift_value);
-    const qint16x8_t B           = vqrshlq_s16(islog ? log_tabq_qs16[1] : exp_tabq_qs16[1], shift_value);
-    const qint16x8_t C           = vqrshlq_s16(islog ? log_tabq_qs16[2] : exp_tabq_qs16[2], shift_value);
-    const qint16x8_t D           = vqrshlq_s16(islog ? log_tabq_qs16[3] : exp_tabq_qs16[3], shift_value);
-    const qint16x8_t x1          = vqaddq_s16(vqmulq_qs16(a, D, fixed_point_position), C);
-    const qint16x8_t x2          = vqaddq_s16(vqmulq_qs16(a, x1, fixed_point_position), B);
-    const qint16x8_t x3          = vqaddq_s16(vqmulq_qs16(a, x2, fixed_point_position), A);
-    const qint16x8_t res         = vqmulq_qs16(a, x3, fixed_point_position);
-    return res;
-}
-
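All eight vtaylor_poly* variants evaluate the same degree-4 polynomial with zero constant term, res = A*a + B*a^2 + C*a^3 + D*a^4, in Horner form; the islog template parameter only selects the coefficient table and the extra shift for the Q1.x leading coefficient. Scalar model of the evaluation order:

    float taylor_poly(float a, float A, float B, float C, float D)
    {
        const float x1 = a * D + C; // innermost Horner step
        const float x2 = a * x1 + B;
        const float x3 = a * x2 + A;
        return a * x3;              // multiply through by a: zero constant term
    }
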
-inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
-{
-    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
-    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
-    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                     // ln(2)
-    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)
-
-    // Perform range reduction [-log(2),log(2)]
-    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
-
-    // Get the integer part of m
-    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));
-
-    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
-    alpha           = vqabs_qs8(vqsub_s8(a, alpha));
-
-    // Polynomial Approximation
-    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
-    poly           = vqadd_s8(poly, const_one);
-
-    // Reconstruct
-    poly = vqshl_s8(poly, dec_m);
-
-    return poly;
-}
-
-inline qint16x4_t vqexp_qs16(qint16x4_t a, int fixed_point_position)
-{
-    const qint16x4_t shift_value   = vdup_n_s16(fixed_point_position - 15);
-    const qint16x4_t const_one     = vdup_n_s16(1 << fixed_point_position);
-    const qint16x4_t const_ln2     = vqrshl_s16(vdup_n_s16(0x58B9), shift_value);                      // ln(2)
-    const qint16x4_t const_inv_ln2 = vorr_s16(vqrshl_s16(vdup_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)
-
-    // Perform range reduction [-log(2),log(2)]
-    const qint16x4_t m = vqmul_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)
-
-    // Get the integer part of m
-    const qint16x4_t dec_m = vqshl_s16(m, vdup_n_s16(-fixed_point_position));
-
-    qint16x4_t alpha = vqmul_qs16(vqshl_s16(dec_m, vdup_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
-    alpha            = vqabs_qs16(vqsub_s16(a, alpha));
-
-    // Polynomial Approximation
-    qint16x4_t poly = vqtaylor_poly_qs16<false>(alpha, fixed_point_position);
-    poly            = vqadd_s16(poly, const_one);
-
-    // Reconstruct
-    poly = vqshl_s16(poly, dec_m);
-
-    return poly;
-}
-
-inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
-{
-    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
-    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
-    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                      // ln(2)
-    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)
-
-    // Perform range reduction [-log(2),log(2)]
-    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
-
-    // Get the integer part of m
-    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));
-
-    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
-    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));
-
-    // Polynomial Approximation
-    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
-    poly            = vqaddq_s8(poly, const_one);
-
-    // Reconstruct
-    poly = vqshlq_s8(poly, dec_m);
-
-    return poly;
-}
-
-inline qint16x8_t vqexpq_qs16(qint16x8_t a, int fixed_point_position)
-{
-    const qint16x8_t shift_value   = vdupq_n_s16(fixed_point_position - 15);
-    const qint16x8_t const_one     = vdupq_n_s16(1 << fixed_point_position);
-    const qint16x8_t const_ln2     = vqrshlq_s16(vdupq_n_s16(0x58B9), shift_value);                       // ln(2)
-    const qint16x8_t const_inv_ln2 = vorrq_s16(vqrshlq_s16(vdupq_n_s16(0x38AA), shift_value), const_one); // 1/ln(2)
-
-    // Perform range reduction [-log(2),log(2)]
-    const qint16x8_t m = vqmulq_qs16(a, const_inv_ln2, fixed_point_position); // x / ln(2)
-
-    // Get the integer part of m
-    const qint16x8_t dec_m = vqshlq_s16(m, vdupq_n_s16(-fixed_point_position));
-
-    qint16x8_t alpha = vqmulq_qs16(vqshlq_s16(dec_m, vdupq_n_s16(fixed_point_position)), const_ln2, fixed_point_position);
-    alpha            = vqabsq_qs16(vqsubq_qs16(a, alpha));
-
-    // Polynomial Approximation
-    qint16x8_t poly = vqtaylor_polyq_qs16<false>(alpha, fixed_point_position);
-    poly            = vqaddq_s16(poly, const_one);
-
-    // Reconstruct
-    poly = vqshlq_s16(poly, dec_m);
-
-    return poly;
-}
-
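vqexp* uses the textbook exp range reduction: write x = m*ln2 + r with m = floor(x/ln2) and r in [0, ln2), approximate exp(r) as 1 + poly(r) with the exp_tab_* coefficients, then reconstruct with a (saturating) left shift by m, i.e. a multiplication by 2^m. A floating-point sketch of the control flow (illustration only):

    #include <cmath>

    float exp_range_reduced(float x)
    {
        const float ln2 = 0.6931472f;
        const int   m   = static_cast<int>(std::floor(x / ln2)); // like dec_m: arithmetic shift = floor
        const float r   = x - m * ln2;                           // residual in [0, ln2)
        const float p   = r * (0.9978546f + r * (0.4994721f + r * (0.1763723f + r * 0.0435108f)));
        return std::ldexp(1.0f + p, m);                          // (1 + poly(r)) * 2^m
    }
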
-inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
-{
-    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
-    const qint8x8_t const_seven_dec = vdup_n_s8(7);
-    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
-
-    // If 0 < a < 1, calculate log(1/a)
-    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
-    qint8x8_t recip           = vdup_n_s8(0);
-    recip                     = vbsl_s8(calc_reciprocal, a, recip);
-
-    // Calculate reciprocal
-    recip = vrecip_qs8(recip, fixed_point_position);
-    a     = vbsl_s8(calc_reciprocal, recip, a);
-
-    // Get the integer part of a
-    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
-    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position
-
-    // Get the exponent n of the largest power of two less than or equal to dec_a
-    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
-
-    // Shift a into the range (1, 2]
-    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
-    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
-    const qint8x8_t sum             = vmul_s8(shift_value, const_one);
-
-    // Polynomial Approximation
-    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
-
-    // Reconstruct
-    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
-
-    // Set negative value for 0 < a < 1
-    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
-
-    return poly;
-}
-
-inline qint16x4_t vlog_qs16(qint16x4_t a, int fixed_point_position)
-{
-    const qint16x4_t const_one         = vdup_n_s16(1 << fixed_point_position);
-    const qint16x4_t const_fifteen_dec = vdup_n_s16(15);
-    const qint16x4_t const_ln2         = vdup_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
-
-    // If 0 < a < 1, calculate log(1/a)
-    uint16x4_t calc_reciprocal = vclt_s16(a, const_one);
-    qint16x4_t recip           = vdup_n_s16(0);
-    recip                      = vbsl_s16(calc_reciprocal, a, recip);
-
-    // Calculate reciprocal
-    recip = vrecip_qs16(recip, fixed_point_position);
-    a     = vbsl_s16(calc_reciprocal, recip, a);
-
-    // Get the integer part of a
-    qint16x4_t shift_value = vdup_n_s16(-fixed_point_position);
-    qint16x4_t dec_a       = vshl_s16(a, shift_value); // a >> fixed_point_position
-
-    // Get the exponent n of the largest power of two less than or equal to dec_a
-    shift_value = vsub_s16(const_fifteen_dec, vclz_s16(dec_a));
-
-    // Shift a into the range (1, 2]
-    const qint16x4_t shift_value_neg = vneg_s16(shift_value);
-    const qint16x4_t temp            = vsub_s16(vrshl_s16(a, shift_value_neg), const_one);
-    const qint16x4_t sum             = vmul_s16(shift_value, const_one);
-
-    // Polynomial Approximation
-    qint16x4_t poly = vtaylor_poly_qs16<true>(temp, fixed_point_position);
-
-    // Reconstruct
-    poly = vmul_qs16(vadd_s16(poly, sum), const_ln2, fixed_point_position);
-
-    // Set negative value for 0 < a < 1
-    poly = vbsl_s16(calc_reciprocal, vneg_s16(poly), poly);
-
-    return poly;
-}
-
-inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
-{
-    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
-    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
-    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
-
-    // If 0 < a < 1, calculate log(1/a)
-    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
-    qint8x16_t recip           = vdupq_n_s8(0);
-    recip                      = vbslq_s8(calc_reciprocal, a, recip);
-
-    // Calculate reciprocal
-    recip = vrecipq_qs8(recip, fixed_point_position);
-    a     = vbslq_s8(calc_reciprocal, recip, a);
-
-    // Get the integer part of a
-    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
-    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position
-
-    // Get the exponent n of the largest power of two less than or equal to dec_a
-    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
-
-    // Shift a into the range (1, 2]
-    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
-    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
-    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);
-
-    // Polynomial Approximation
-    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
-
-    // Reconstruct
-    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
-
-    // Set negative value for 0 < a < 1
-    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
-
-    return poly;
-}
-
-inline qint16x8_t vlogq_qs16(qint16x8_t a, int fixed_point_position)
-{
-    const qint16x8_t const_one         = vdupq_n_s16(1 << fixed_point_position);
-    const qint16x8_t const_fifteen_dec = vdupq_n_s16(15);
-    const qint16x8_t const_ln2         = vdupq_n_s16(0x58B9 >> (15 - fixed_point_position)); // ln(2)
-
-    // If 0 < a < 1, calculate log(1/x)
-    uint16x8_t calc_reciprocal = vcltq_s16(a, const_one);
-    qint16x8_t recip           = vdupq_n_s16(0);
-    recip                      = vbslq_s16(calc_reciprocal, a, recip);
-
-    // Calculate reciprocal
-    recip = vqrecipq_qs16(recip, fixed_point_position);
-    a     = vbslq_s16(calc_reciprocal, recip, a);
-
-    // Get decimal part of a
-    qint16x8_t shift_value = vdupq_n_s16(-fixed_point_position);
-    qint16x8_t dec_a       = vshlq_s16(a, shift_value); // a >> fixed_point_position
-
-    // Get exponent of 2^n which is equal or less than dec_a
-    shift_value = vqsubq_s16(const_fifteen_dec, vclzq_s16(dec_a));
-
-    // Get x to range (1, 2]
-    const qint16x8_t shift_value_neg = vnegq_s16(shift_value);
-    const qint16x8_t temp            = vqsubq_s16(vrshlq_s16(a, shift_value_neg), const_one);
-    const qint16x8_t sum             = vmulq_s16(shift_value, const_one);
-
-    // Polynomial Approximation
-    qint16x8_t poly = vtaylor_polyq_qs16<true>(temp, fixed_point_position);
-
-    // Reconstruct
-    poly = vqmulq_qs16(vqaddq_s16(poly, sum), const_ln2, fixed_point_position);
-
-    // Set negative value for 0 < a < 1
-    poly = vbslq_s16(calc_reciprocal, vnegq_s16(poly), poly);
-
-    return poly;
-}
-
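The vlog_* family removed above shares one scheme: decompose a = x * 2^m with x in (1, 2], evaluate a Taylor polynomial on x - 1, then reconstruct log(a) = ln(2) * (m + log2(x)); inputs in (0, 1) go through the reciprocal first, since log(a) = -log(1/a). A minimal scalar sketch of the same decomposition (illustrative only, with float standing in for the fixed point polynomial; not library code):

    #include <cmath>
    #include <cstdint>

    // Scalar analogue of the removed vlog_qs16; n is the fixed point position,
    // a must be positive. The float path also covers 0 < a < 1 directly, so the
    // reciprocal step of the NEON version is not needed here.
    int16_t scalar_log_qs16(int16_t a, int n)
    {
        const int32_t one   = 1 << n;
        const int32_t dec_a = a >> n; // integer part of a
        // Exponent m of the largest power of two <= dec_a (0 when a < 1).
        int m = 0;
        while ((dec_a >> (m + 1)) != 0)
        {
            ++m;
        }
        // Range-reduce so that x = a / 2^m falls in (1, 2].
        const float x = static_cast<float>(a >> m) / one;
        // Reconstruct: log(a) = ln(2) * (m + log2(x)).
        return static_cast<int16_t>(std::lround(0.6931472f * (m + std::log2(x)) * one));
    }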
-inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
-{
-    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
-
-    // Find shift value. Number must be in (0.5, 2) range.
-    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
-
-    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
-    qint8x8_t temp         = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
-    uint8x8_t temp_ltz     = vclt_s8(temp, vdup_n_qs8(0));
-    temp                   = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
-    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
-
-    temp = vshl_s8(a, shift_value);
-
-    // Initial guess
-    qint8x8_t x = temp;
-
-    // Calculate (x / 2) * (3 - a * x^2)
-    // After three iterations we have the result for 8 bit
-    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
-    return vshl_s8(x, shift_value2);
-}
-
-inline qint16x4_t vinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
-{
-    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
-
-    // Find shift value. Number must be in (0.5, 2) range.
-    qint16x4_t shift_value = vneg_s16(vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
-
-    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
-    qint16x4_t temp         = vsub_s16(vdup_n_s16(16), vadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
-    uint16x4_t temp_ltz     = vclt_s16(temp, vdup_n_qs16(0));
-    temp                    = vbsl_s16(temp_ltz, vadd_s16(temp, vdup_n_s16(1)), temp);
-    qint16x4_t shift_value2 = vneg_s16(vshr_n_s16(temp, 1));
-
-    temp = vshl_s16(a, shift_value);
-
-    // Initial guess
-    qint16x4_t x = temp;
-
-    // Calculate (x / 2) * (3 - a * x^2)
-    // After five iterations we have the result for 16 bit
-    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s16(vmul_qs16(x, vsub_s16(const_three, vmul_qs16(temp, vmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
-    return vshl_s16(x, shift_value2);
-}
-
-inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
-{
-    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
-
-    // Find shift value. Number must be in (0.5, 2) range.
-    qint8x8_t shift_value = vqneg_s8(vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
-
-    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
-    qint8x8_t temp         = vqsub_s8(vdup_n_s8(8), vqadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
-    uint8x8_t temp_ltz     = vclt_s8(temp, vdup_n_qs8(0));
-    temp                   = vbsl_s8(temp_ltz, vqadd_s8(temp, vdup_n_s8(1)), temp);
-    qint8x8_t shift_value2 = vqneg_s8(vshr_n_s8(temp, 1));
-
-    temp = vqshl_s8(a, shift_value);
-
-    // Initial guess
-    qint8x8_t x = temp;
-
-    // Calculate (x / 2) * (3 - a * x^2)
-    // After three iterations we have the result for 8 bit
-    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
-    return vqshl_s8(x, shift_value2);
-}
-
-inline qint16x4_t vqinvsqrt_qs16(qint16x4_t a, int fixed_point_position)
-{
-    const qint16x4_t const_three = vdup_n_s16(3 << fixed_point_position);
-
-    // Find shift value. Number must be in (0.5, 2) range.
-    qint16x4_t shift_value = vqneg_s16(vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position))));
-
-    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
-    qint16x4_t temp         = vqsub_s16(vdup_n_s16(16), vqadd_s16(vclz_s16(a), vdup_n_s16(fixed_point_position)));
-    uint16x4_t temp_ltz     = vclt_s16(temp, vdup_n_qs16(0));
-    temp                    = vbsl_s16(temp_ltz, vqadd_s16(temp, vdup_n_s16(1)), temp);
-    qint16x4_t shift_value2 = vqneg_s16(vshr_n_s16(temp, 1));
-
-    temp = vqshl_s16(a, shift_value);
-
-    // Initial guess
-    qint16x4_t x = temp;
-
-    // Calculate (x / 2) * (3 - a * x^2)
-    // After five iterations we have the result for 16 bit
-    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshr_n_s16(vqmul_qs16(x, vqsub_s16(const_three, vqmul_qs16(temp, vqmul_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
-    return vqshl_s16(x, shift_value2);
-}
-
-inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
-{
-    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
-
-    // Find shift value. Number must be in (0.5, 2) range.
-    qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
-
-    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
-    qint8x16_t temp         = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
-    uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
-    temp                    = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp);
-    qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1));
-
-    temp = vshlq_s8(a, shift_value);
-
-    // Initial guess
-    qint8x16_t x = temp;
-
-    // Calculate (x / 2) * (3 - a * x^2)
-    // After three iterations we have the result for 8 bit
-    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
-    return vshlq_s8(x, shift_value2);
-}
-
-inline qint16x8_t vinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
-{
-    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
-
-    // Find shift value. Number must be in (0.5, 2) range.
-    qint16x8_t shift_value = vnegq_s16(vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
-
-    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
-    qint16x8_t temp         = vsubq_s16(vdupq_n_s16(16), vaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
-    uint16x8_t temp_ltz     = vcltq_s16(temp, vdupq_n_qs16(0));
-    temp                    = vbslq_s16(temp_ltz, vaddq_s16(temp, vdupq_n_s16(1)), temp);
-    qint16x8_t shift_value2 = vnegq_s16(vshrq_n_s16(temp, 1));
-
-    temp = vshlq_s16(a, shift_value);
-
-    // Initial guess
-    qint16x8_t x = temp;
-
-    // Calculate (x / 2) * (3 - a * x^2)
-    // After five iterations we have the result for 16 bit
-    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s16(vmulq_qs16(x, vsubq_s16(const_three, vmulq_qs16(temp, vmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
-    return vshlq_s16(x, shift_value2);
-}
-
-inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position)
-{
-    const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position);
-
-    // Find shift value. Number must be in (0.5, 2) range.
-    qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
-
-    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
-    qint8x16_t temp         = vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)));
-    uint8x16_t temp_ltz     = vcltq_s8(temp, vdupq_n_qs8(0));
-    temp                    = vbslq_s8(temp_ltz, vqaddq_s8(temp, vdupq_n_s8(1)), temp);
-    qint8x16_t shift_value2 = vqnegq_s8(vshrq_n_s8(temp, 1));
-
-    temp = vqshlq_s8(a, shift_value);
-
-    // Initial guess
-    qint8x16_t x = temp;
-
-    // Calculate (x / 2) * (3 - a * x^2)
-    // After three iterations we have the result for 8 bit
-    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
-    return vqshlq_s8(x, shift_value2);
-}
-
-inline qint16x8_t vqinvsqrtq_qs16(qint16x8_t a, int fixed_point_position)
-{
-    const qint16x8_t const_three = vdupq_n_s16(3 << fixed_point_position);
-
-    // Find shift value. Number must be in (0.5, 2) range.
-    qint16x8_t shift_value = vqnegq_s16(vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position))));
-
-    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
-    qint16x8_t temp         = vqsubq_s16(vdupq_n_s16(16), vqaddq_s16(vclzq_s16(a), vdupq_n_s16(fixed_point_position)));
-    uint16x8_t temp_ltz     = vcltq_s16(temp, vdupq_n_qs16(0));
-    temp                    = vbslq_s16(temp_ltz, vqaddq_s16(temp, vdupq_n_s16(1)), temp);
-    qint16x8_t shift_value2 = vqnegq_s16(vshrq_n_s16(temp, 1));
-
-    temp = vqshlq_s16(a, shift_value);
-
-    // Initial guess
-    qint16x8_t x = temp;
-
-    // Calculate (x / 2) * (3 - a * x^2)
-    // After five iterations we have the result for 16 bit
-    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-    x = vshrq_n_s16(vqmulq_qs16(x, vqsubq_s16(const_three, vqmulq_qs16(temp, vqmulq_qs16(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
-
-    return vqshlq_s16(x, shift_value2);
-}
-
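All eight vinvsqrt*/vqinvsqrt* variants removed above follow the same recipe: shift a into the (0.5, 2) range, run the Newton-Raphson step x <- (x / 2) * (3 - a * x^2) three times for 8 bit or five times for 16 bit, then undo the normalisation with half of the original shift, since 1/sqrt(a * 2^(2k)) = 2^(-k) / sqrt(a); the "add one when the shift value is negative" adjustment makes that halving round correctly. A scalar sketch of a single refinement step (hypothetical helper names, not library code):

    #include <cstdint>

    // Qn multiply: widen, round, shift back (mirrors the removed vmul_qs16).
    static int16_t mul_qs16(int16_t a, int16_t b, int n)
    {
        const int32_t round = 1 << (n - 1);
        return static_cast<int16_t>((static_cast<int32_t>(a) * b + round) >> n);
    }

    // One Newton-Raphson refinement of x ~ 1/sqrt(a): x <- (x / 2) * (3 - a * x^2).
    static int16_t invsqrt_step_qs16(int16_t x, int16_t a, int n)
    {
        const int16_t three = static_cast<int16_t>(3 << n);
        const int16_t axx   = mul_qs16(a, mul_qs16(x, x, n), n);
        return static_cast<int16_t>(mul_qs16(x, static_cast<int16_t>(three - axx), n) >> 1);
    }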
-inline qint8x8_t vqtanh_qs8(qint8x8_t a, int fixed_point_position)
-{
-    const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position);
-    const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position);
-
-    const qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position);
-    const qint8x8_t num   = vqsub_qs8(exp2x, const_one);
-    const qint8x8_t den   = vqadd_qs8(exp2x, const_one);
-    const qint8x8_t tanh  = vqmul_qs8(num, vqrecip_qs8(den, fixed_point_position), fixed_point_position);
-
-    return tanh;
-}
-
-inline qint16x4_t vqtanh_qs16(qint16x4_t a, int fixed_point_position)
-{
-    const qint16x4_t const_one = vdup_n_s16(1 << fixed_point_position);
-    const qint16x4_t const_two = vdup_n_s16(2 << fixed_point_position);
-
-    const qint16x4_t exp2x = vqexp_qs16(vqmul_qs16(const_two, a, fixed_point_position), fixed_point_position);
-    const qint16x4_t num   = vqsub_qs16(exp2x, const_one);
-    const qint16x4_t den   = vqadd_qs16(exp2x, const_one);
-    const qint16x4_t tanh  = vqmul_qs16(num, vqrecip_qs16(den, fixed_point_position), fixed_point_position);
-
-    return tanh;
-}
-
-inline qint8x16_t vqtanhq_qs8(qint8x16_t a, int fixed_point_position)
-{
-    const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position);
-    const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position);
-
-    const qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position);
-    const qint8x16_t num   = vqsubq_qs8(exp2x, const_one);
-    const qint8x16_t den   = vqaddq_qs8(exp2x, const_one);
-    const qint8x16_t tanh  = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position);
-
-    return tanh;
-}
-
-inline qint16x8_t vqtanhq_qs16(qint16x8_t a, int fixed_point_position)
-{
-    const qint16x8_t const_one = vdupq_n_s16(1 << fixed_point_position);
-    const qint16x8_t const_two = vdupq_n_s16(2 << fixed_point_position);
-
-    const qint16x8_t exp2x = vqexpq_qs16(vqmulq_qs16(const_two, a, fixed_point_position), fixed_point_position);
-    const qint16x8_t num   = vqsubq_qs16(exp2x, const_one);
-    const qint16x8_t den   = vqaddq_qs16(exp2x, const_one);
-    const qint16x8_t tanh  = vqmulq_qs16(num, vqrecipq_qs16(den, fixed_point_position), fixed_point_position);
-
-    return tanh;
-}
-
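The removed vqtanh_* helpers need no dedicated tanh approximation: they evaluate the identity tanh(x) = (e^(2x) - 1) / (e^(2x) + 1) with one saturating exp, one add, one subtract and one reciprocal, all in Qn arithmetic. Float sketch of the identity (illustrative, not library code):

    #include <cmath>

    // tanh via exp, as in the removed fixed point helpers.
    float tanh_via_exp(float x)
    {
        const float e2x = std::exp(2.0f * x);
        return (e2x - 1.0f) / (e2x + 1.0f);
    }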
-inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
-{
-    return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
-}
-
-inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
-{
-    return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
-}
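Likewise, vqpowq_* reduces pow to the two primitives above: a^b = exp(b * log(a)), valid for a > 0. Float equivalent (illustrative, not library code):

    #include <cmath>

    // pow via exp and log, mirroring vqpowq_qs8 / vqpowq_qs16.
    float pow_via_exp_log(float a, float b)
    {
        return std::exp(b * std::log(a)); // requires a > 0
    }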
 
 inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
 {
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
index 06a0a01..0290e32 100644
--- a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -24,7 +24,6 @@
 #ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
 #define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
 
-#include "arm_compute/core/FixedPoint.h"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/QAsymm8.h"
 
@@ -59,7 +58,7 @@
      * @note If the output tensor is a nullptr, the activation function will be performed in-place
      *
      * @param[in, out] input           Source tensor. In case of @p output tensor = nullptr, this tensor will store the result
-     *                                 of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+     *                                 of the activation function. Data types supported: QASYMM8/F16/F32.
      * @param[out]     output          Destination tensor. Data type supported: same as @p input
      * @param[in]      activation_info Activation layer information.
      */
@@ -67,7 +66,7 @@
     /** Static function to check if given info will lead to a valid configuration of @ref NEActivationLayerKernel
      *
      * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result
-     *                     of the activation function. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+     *                     of the activation function. Data types supported: QASYMM8/F16/F32.
      * @param[in] output   Destination tensor info. Data type supported: same as @p input
      * @param[in] act_info Activation layer information.
      *
@@ -104,19 +103,7 @@
      * @param[in] window Region on which to execute the kernel
      */
     template <ActivationLayerInfo::ActivationFunction F, typename T>
-    typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window);
-    /** Function to apply an activation function on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    template <ActivationLayerInfo::ActivationFunction F, typename T>
     typename std::enable_if<std::is_same<T, qasymm8_t>::value, void>::type activation(const Window &window);
-    /** Function to apply an activation function on a tensor.
-     *
-     * @param[in] window Region on which to execute the kernel
-     */
-    template <ActivationLayerInfo::ActivationFunction F, typename T>
-    typename std::enable_if<std::is_same<T, qint16_t>::value, void>::type activation(const Window &window);
 
 private:
     ITensor                      *_input;
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
index 155e792..8cf21ea 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -57,26 +57,24 @@
      * Valid configurations (Input1,Input2) -> Output :
      *
      *   - (U8,U8)     -> U8
-     *   - (QS8,QS8)   -> QS8
      *   - (U8,U8)     -> S16
      *   - (S16,U8)    -> S16
      *   - (U8,S16)    -> S16
      *   - (S16,S16)   -> S16
-     *   - (QS16,QS16) -> QS16
      *   - (F16,F16)   -> F16
      *   - (F32,F32)   -> F32
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32.
      * @param[in]  policy Overflow policy.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticAdditionKernel
      *
-     * @param[in] input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[in] input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[in] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+     * @param[in] input1 An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[in] input2 An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[in] output The output tensor. Data types supported: U8/S16/F16/F32.
      * @param[in] policy Overflow policy.
      *
      * @return a status
@@ -90,9 +88,9 @@
 private:
     /** Common signature for all the specialised add functions
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32.
      * @param[in]  window Region on which to execute the kernel.
      */
     using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
index 73ecfcf..3e93922 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -57,26 +57,24 @@
      * Valid configurations (Input1,Input2) -> Output :
      *
      *   - (U8,U8)     -> U8
-     *   - (QS8,QS8)   -> QS8
      *   - (U8,U8)     -> S16
      *   - (S16,U8)    -> S16
      *   - (U8,S16)    -> S16
      *   - (S16,S16)   -> S16
-     *   - (QS16,QS16) -> QS16
      *   - (F16,F16)   -> F16
      *   - (F32,F32)   -> F32
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32.
      * @param[in]  policy Overflow policy.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration of @ref NEArithmeticSubtractionKernel
      *
-     * @param[in] input1 First tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[in] input2 Second tensor input. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[in] output Output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+     * @param[in] input1 First tensor input. Data types supported: U8/S16/F16/F32
+     * @param[in] input2 Second tensor input. Data types supported: U8/S16/F16/F32
+     * @param[in] output Output tensor. Data types supported: U8/S16/F16/F32
      * @param[in] policy Policy to use to handle overflow.
      *
      * @return a status
@@ -89,9 +87,9 @@
 private:
     /** Common signature for all the specialised sub functions
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[out] output The output tensor. Data types supported: U8/QS8/QS16/S16/F16/F32.
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[out] output The output tensor. Data types supported: U8/S16/F16/F32.
      * @param[in]  window Region on which to execute the kernel.
      */
     using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 2d33f87..2a540c1 100644
--- a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -57,7 +57,7 @@
      *
      * @param[in, out] input    Source tensor. In case of @p output tensor = nullptr, this tensor will store the result.
      *                          3 lower dimensions represent a single input with dimensions [width, height, FM].
-     *                          The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+     *                          The rest are optional and used for representing batches. Data types supported: F16/F32.
      * @param[out]     output   Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
      * @param[in]      mean     Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
      * @param[in]      var      Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
@@ -72,7 +72,7 @@
      *
      * @param[in] input    Source tensor info. In case of @p output tensor = nullptr, this tensor will store the result.
      *                     3 lower dimensions represent a single input with dimensions [width, height, FM].
-     *                     The rest are optional and used for representing batches. Data types supported: QS8/QS16/F16/F32.
+     *                     The rest are optional and used for representing batches. Data types supported: F16/F32.
      * @param[in] output   Destination tensor info. Output will have the same number of dimensions as input. Data type supported: same as @p input
      * @param[in] mean     Mean values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
      * @param[in] var      Variance values tensor info. 1 dimension with size equal to the feature maps [FM]. Data types supported: Same as @p input
@@ -96,22 +96,7 @@
     void configure_non_fused();
     /** Configure execution function in case of fused activation **/
     void configure_fused();
-    /** Template function to run batch normalization on 8-bit fixed point
-     *
-     * @tparam fused_activation Boolean that flags if its a fused activation or not
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool fused_activation>
-    void batch_normalization_qs8(const Window &window);
-    /** Template function to run batch normalization on 16-bit fixed point
-     *
-     * @tparam fused_activation Boolean that flags if its a fused activation or not
-     *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
-     */
-    template <bool fused_activation>
-    void batch_normalization_qs16(const Window &window);
+
     /** Template function to run batch normalization on fp16
      *
      * @tparam fused_activation Boolean that flags if it's a fused activation or not
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
index 9fb493c..f02858e 100644
--- a/arm_compute/core/NEON/kernels/NECol2ImKernel.h
+++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
@@ -72,7 +72,7 @@
 
     /** Set the input and output of the kernel.
      *
-     * @param[in]  input          The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in]  input          The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
      *                            while the rest represent batch of outputs. Data types supported: Same as @p input
      * @param[in]  convolved_dims Output convolved dimensions.
@@ -80,7 +80,7 @@
     void configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims);
     /** Static function to check if given info will lead to a valid configuration of @ref NECol2ImKernel
      *
-     * @param[in] input          The input tensor to convert. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] input          The input tensor to convert. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[in] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
      *                           while the rest represent batch of outputs. Data types supported: Same as @p input
      * @param[in] convolved_dims Output convolved dimensions.
diff --git a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
index 65ce764..d5c9e3b 100644
--- a/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
+++ b/arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h
@@ -59,7 +59,7 @@
     ~NEConvertFullyConnectedWeightsKernel() = default;
     /** Set the input and output tensor.
      *
-     * @param[in]  input                Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32.
+     * @param[in]  input                Source weights tensor to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32.
      * @param[out] output               The converted weights tensor. Shape and Data Type: Same as @p input.
      * @param[in]  original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format.
      * @param[in]  data_layout          The data layout the weights have been trained in.
@@ -67,7 +67,7 @@
     void configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape, DataLayout data_layout);
     /** Static function to check if given info will lead to a valid configuration of @ref NEConvertFullyConnectedWeightsKernel
      *
-     * @param[in] input                Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/U32/S32/QS32/F16/F32.
+     * @param[in] input                Source weights tensor info to convert. Must be 2 dimensional. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/QS32/F16/F32.
      * @param[in] output               The converted weights tensor info. Shape and Data Type: Same as @p input.
      * @param[in] original_input_shape Shape of the original input tensor (the one entering fully connected layer). Must be in NCHW format.
      * @param[in] data_layout          The data layout the weights have been trained in.
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
index 67ef529..12a5051 100644
--- a/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateLayerKernel.h
@@ -55,7 +55,7 @@
     ~NEDepthConcatenateLayerKernel() = default;
     /** Initialise the kernel's inputs and output
      *
-     * @param[in]     input        Input tensor. Data types supported: QS8/QS16/F16/F32.
+     * @param[in]     input        Input tensor. Data types supported: F16/F32.
      * @param[in]     depth_offset The offset on the Z axis.
      * @param[in,out] output       Output tensor. Data types supported: Same as @p input.
      *
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
index 50536f2..77bb041 100644
--- a/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDepthConvertLayerKernel.h
@@ -55,19 +55,12 @@
      *
      * Valid conversions Input -> Output :
      *
-     *   - QS8 -> QS8, F32
      *   - U8 -> U16, S16, S32
      *   - U16 -> U8, U32
      *   - S16 -> U8, S32
-     *   - QS16 -> QS16, F32
-     *   - F32 -> QS8
      *
-     * @warning In case of in-place fixed point position conversion make sure that configure has been called
-     *          before the updated tensor is used in other functions, as the TensorInfo of the tensor will be
-     *          altered. In-place is only supported for QS8 -> QS8, QS16 -> QS16.
-     *
-     * @param[in, out] input  The input tensor to convert (Written in case of in-place computation). Data types supported: U8/QS8/U16/S16/F32.
-     * @param[out]     output The output tensor. Can be null in case of in-place computation. Data types supported: U8/QS8/U16/S16/U32/S32/F32.
+     * @param[in, out] input  The input tensor to convert (Written in case of in-place computation). Data types supported: U8/U16/S16.
+     * @param[out]     output The output tensor. Can be null in case of in-place computation. Data types supported: U8/U16/S16/U32/S32/F32.
      * @param[in]      policy Conversion policy.
      * @param[in]      shift  (Optional) Value for down/up conversions. Must be 0 <= shift < 8.
@@ -82,8 +75,6 @@
     ITensor      *_output;
     ConvertPolicy _policy;
     uint32_t      _shift;
-    int           _fixed_point_position_input;
-    int           _fixed_point_position_output;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index f859f97..589725a 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -57,24 +57,24 @@
      *        3x3 convolution with stride_x = 1/2/3, stride_y = 1/2/3
      *
      * @param[in]  input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32.
+     *                       while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32.
      * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
      *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
      *                       Data type supported: Same as @p input.
      * @param[out] output    Output tensor.
-     *                       The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32
+     *                       The 3rd dimension must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32
      * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
      */
     void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
     /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerKernel
      *
      * @param[in] input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                      while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/F16/F32.
+     *                      while every optional dimension from 4 and above represents a batch of inputs. Data types supported: F16/F32.
      * @param[in] weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
      *                      The 3rd dimension must be the same as the input's volume 3rd dimension.
      *                      Data type supported: Same as @p input.
      * @param[in] output    Output tensor.
-     *                      The 3rd dimensions must be equal to the 4th dimension of the @p kernels tensor. Data types supported: QS16/QS32/F16/F32
+     *                      The 3rd dimension must be equal to the 4th dimension of the @p kernels tensor. Data types supported: F16/F32
      * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
      *
      * @return a status
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
index 77711d7..7fd1d70 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h
@@ -55,10 +55,10 @@
     /** Set the accumulate buffer and the biases of the kernel.
      *
      * @param[in, out] input                        Input to add the bias to. If @p output is not specified then accumulation is done in-place.
-     *                                              Data type supported: QS16/QS32/F16/F32
+     *                                              Data type supported: QS32/F16/F32
      * @param[in]      bias                         (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
      * @param[out]     output                       (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
-     *                                              Data type supported: QS8/QS16/F16/F32
+     *                                              Data type supported: F16/F32
      * @param[in]      result_fixedpoint_multiplier (Optional) Fixed point value to be multiplied with each element of the input matrix once the result_offset has been added
      * @param[in]      result_shift                 (Optional) Integer value used to divide the result of the fixed point multiplication by a power-of-two, rounding to nearest
      * @param[in]      result_offset_after_shift    (Optional) Offset to be applied to the result before converting it back to QASYMM8
@@ -68,10 +68,10 @@
     /** Static function to check if given info will lead to a valid configuration of @ref NEDirectConvolutionLayerOutputStageKernel
      *
      * @param[in] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
-     *                   Data type supported: QS16/QS32/F16/F32
+     *                   Data type supported: QS32/F16/F32
      * @param[in] bias   (Optional) The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
      * @param[in] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
-     *                         Data type supported: QS8/QS16/F16/F32
+     *                         Data type supported: F16/F32
      * @return a status
      */
     static Status validate(const ITensorInfo *input, const ITensorInfo *bias = nullptr, const ITensorInfo *output = nullptr);
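For context, the three optional parameters documented above describe a gemmlowp-style requantisation of the 32 bit accumulator back to QASYMM8. The kernel body is not part of this hunk; the following is a sketch of what such an output stage typically computes (hypothetical signature, not the library's code):

    #include <algorithm>
    #include <cstdint>

    // 1) fixed point multiply (rounding-doubling high mul by the multiplier),
    // 2) rounding division by 2^shift,
    // 3) add the offset and saturate to the [0, 255] QASYMM8 range.
    uint8_t output_stage(int32_t acc, int32_t multiplier, int shift, int32_t offset)
    {
        const int64_t prod = 2 * static_cast<int64_t>(acc) * multiplier;
        int32_t       x    = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);
        if (shift > 0)
        {
            x = (x + (1 << (shift - 1))) >> shift; // round to nearest
        }
        x += offset;
        return static_cast<uint8_t>(std::min<int32_t>(255, std::max<int32_t>(0, x)));
    }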
diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
index dd19b8f..cff6b4e 100644
--- a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
@@ -57,7 +57,7 @@
      *
      * @note This kernel fills the borders within the XY-planes.
      *
-     * @param[in,out] tensor                Tensor to process. Data types supported: U8/S8/QS8/QASYMM8/QS16/S16/S32/F32.
+     * @param[in,out] tensor                Tensor to process. Data types supported: U8/S8/QASYMM8/S16/S32/F32.
      * @param[in]     border_size           Size of the border to fill in elements.
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
index 545a265..2b6c7af 100644
--- a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
+++ b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h
@@ -57,7 +57,7 @@
      *
      * @note This kernel fills the borders within the XY-planes.
      *
-     * @param[in,out] input                 Tensor to process. Data types supported: U8/QS8/S16/S32/F32.
+     * @param[in,out] input                 Tensor to process. Data types supported: U8/S16/S32/F32.
      * @param[in]     border_size           Size of the border to fill in elements.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      *
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index 79504fd..5c0104d 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -60,13 +60,13 @@
     NEGEMMInterleave4x4Kernel();
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel
      *
-     * @param[in] input  Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/QS16/U16/S16/F16/U32/S32/F32
+     * @param[in] input  Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
      *
      * @return a status
@@ -79,7 +79,7 @@
 private:
     /** Common signature for all the transpose functions
      *
-     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in]  input  An input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[out] output The output tensor. Data type supported: same as @p input
      * @param[in]  window Region on which to execute the kernel.
      */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
index e48a9a7..419a9f9 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h
@@ -51,13 +51,13 @@
     ~NEGEMMMatrixAccumulateBiasesKernel() = default;
     /** Set the accumulate buffer and the biases of the kernel.
      *
-     * @param[in, out] accum  The accumulate tensor to convert. Data type supported: QS8/QS16/F32
+     * @param[in, out] accum  The accumulate tensor to convert. Data type supported: F32
      * @param[in]      biases The shared biases tensor to append. It must be a 1D tensor. Data type supported: Same as @p accum
      */
     void configure(ITensor *accum, const ITensor *biases);
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixAccumulateBiasesKernel
      *
-     * @param[in] accum  The accumulate tensor to convert. Data type supported: QS8/QS16/F32
+     * @param[in] accum  The accumulate tensor to convert. Data type supported: F32
      * @param[in] biases The shared biases tensor to append. It must be a 1D tensor. Data type supported: Same as @p accum
      *
      * @return a status
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
index 5e4f8b7..1a23593 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h
@@ -59,7 +59,7 @@
      *
      * @note The input and output tensor must have the same dimensions
      *
-     * @param[in]      input  Input tensor (Matrix C). Data types supported: QS8/QS16/F16/F32
+     * @param[in]      input  Input tensor (Matrix C). Data types supported: F16/F32
      * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input.
      * @param[in]      beta   Weight of matrix C
      */
@@ -71,7 +71,7 @@
 private:
     /** Common signature for all the matrix addition functions
      *
-     * @param[in]  input  An input tensor. Data types supported: QS8/QS16/F16/F32
+     * @param[in]  input  An input tensor. Data types supported: F16/F32
      * @param[out] output The output tensor. Data type supported: same as @p input
      * @param[in]  window Region on which to execute the kernel.
      * @param[in]  beta   Weight of matrix C
diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
index d54522c..6ee9582 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h
@@ -58,7 +58,7 @@
      * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel
      *       These two kernels change the layout of the original matrices to be more cache-friendly.
      *
-     * @param[in]  input0         Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32
+     * @param[in]  input0         Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
      * @param[in]  input1         Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
      *                            If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
      * @param[out] output         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
@@ -69,7 +69,7 @@
     void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMMatrixMultiplyKernel
      *
-     * @param[in] input0         Input tensor containing the interleaved Matrix A or the vector A. Data types supported: QS8/QS16/F16/F32
+     * @param[in] input0         Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32
      * @param[in] input1         Input tensor containing the transposed Matrix B if the first input tensor A is not a vector.
      *                           If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0
      * @param[in] output         Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
index fcdd8dd..b7fbfcf 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -74,13 +74,13 @@
     }
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input.
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
      *
-     * @param[in] input  Input tensor info. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] input  Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[in] output Output tensor info. Data type supported: same as @p input.
      *
      * @return a status
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
index 5aa803f..d455fd9 100644
--- a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -77,7 +77,7 @@
     /** Set the input and output of the kernel.
      *
      * @param[in]  input              The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                                while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
+     *                                while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8/F16/F32
      *                                Note: QASYMM8 works only for has_bias = false
      * @param[out] output             The output tensor. Data types supported: Same as @p input
      * @param[in]  kernel_dims        The kernel dimensions (width and height).
@@ -92,7 +92,7 @@
     /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel
      *
      * @param[in] input              The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                               while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/QS16/QASYMM8/F16/F32
+     *                               while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QASYMM8/F16/F32
      *                               Note: QASYMM8 works only for has_bias = false
      * @param[in] output             The output tensor. Data types supported: Same as @p input
      * @param[in] kernel_dims        The kernel dimensions (width and height).
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index 6ae7b73..9208643 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -54,7 +54,7 @@
     /** Set the input and output tensors.
      *
      * @param[in]  input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                           and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32.
+     *                           and an optional 4th dimension for a batch of inputs. Data types supported: F16/F32.
      * @param[in]  input_squared Source with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
      *                           Data type supported: same as @p input
      * @param[out] output        Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
@@ -64,7 +64,7 @@
     /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
      *
      * @param[in] input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                          and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32.
+     *                          and an optional 4th dimension for a batch of inputs. Data types supported: F16/F32.
      * @param[in] input_squared Source with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
      *                          Data type supported: same as @p input
      * @param[in] output        Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
@@ -92,18 +92,6 @@
     template <DataType dt, unsigned int dim, bool do_2D_norm>
     void normalize_float(const Window &window);
 
-    /** Function to perform normalization for fixed-point values depending on
-     * the given template dimension. The second template parameter specifies
-     * whether the normalization has to be 1D or 2D.
-     *
-     * @note Only supported normalizations are:
-     *  - 1D over X or Z
-     *  - 2D over X and Y
-     *
-     * @param[in] window Region on which to execute the kernel.
-     */
-    template <DataType dt, unsigned int dim, bool do_2D_norm>
-    void normalize_fixed_point(const Window &window);
     /** Common signature for all the specialised normalization functions
      *
      * @param[in] window Region on which to execute the kernel.
diff --git a/arm_compute/core/NEON/kernels/NEPermuteKernel.h b/arm_compute/core/NEON/kernels/NEPermuteKernel.h
index 68bbdcb..b56faa8 100644
--- a/arm_compute/core/NEON/kernels/NEPermuteKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPermuteKernel.h
@@ -58,7 +58,7 @@
      *
      * @note Supported permutation vectors : [2, 0, 1], [1, 2, 0]
      *
-     * @param[in]  input  The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in]  input  The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[out] output The output tensor. Data types supported: Same as @p input
      * @param[in]  perm   Permutation vector
      */
@@ -67,7 +67,7 @@
      *
      * @note Supported permutation vectors : [2, 0, 1], [1, 2, 0]
      *
-     * @param[in] input  The input tensor to permute. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] input  The input tensor to permute. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[in] output The output tensor. Data types supported: Same as @p input
      * @param[in] perm   Permutation vector
      *
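
A minimal sketch of the corresponding runtime call, assuming the NEPermute wrapper and that configure() auto-initializes an empty output; the shape is an arbitrary assumption and [2, 0, 1] is one of the two supported permutation vectors:

    #include "arm_compute/runtime/NEON/functions/NEPermute.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void permute_sketch()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(8U, 4U, 2U), 1, DataType::F32));

        NEPermute permute;
        // dst is left uninitialized; its info is derived from src and perm during configure().
        permute.configure(&src, &dst, PermutationVector(2U, 0U, 1U));

        src.allocator()->allocate();
        dst.allocator()->allocate();
        permute.run();
    }
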
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index 8c24556..41ea914 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -55,11 +55,10 @@
      *
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
-     *       For QS8/QS16 scale = 1 is the only supported value.
      *
-     * @param[in]  input1          An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[in]  input2          An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
-     * @param[out] output          The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
+     * @param[in]  input1          An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[in]  input2          An input tensor. Data types supported: U8/S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[out] output          The output tensor. Data types supported: U8 (only if both inputs are U8), S16, F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
      * @param[in]  scale           Scale to apply after multiplication.
      *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in]  overflow_policy Overflow policy.
@@ -70,11 +69,10 @@
      *
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
-     *       For QS8/QS16 scale = 1 is the only supported value.
      *
-     * @param[in] input1          An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
-     * @param[in] input2          An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
-     * @param[in] output          The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
+     * @param[in] input1          An input tensor. Data types supported: U8/S16/F16/F32
+     * @param[in] input2          An input tensor. Data types supported: U8/S16, F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[in] output          The output tensor. Data types supported: U8 (only if both inputs are U8), S16, F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
      * @param[in] scale           Scale to apply after multiplication.
      *                            Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in] overflow_policy Overflow policy.
@@ -96,15 +94,6 @@
      * @param[out] output_ptr Pointer to the output tensor.
      */
     using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale);
-    /** Common signature for all the specialised multiplication functions with fixed-point values
-     *
-     * @param[in]  input1_ptr           Pointer to the first input tensor.
-     * @param[in]  input2_ptr           Pointer to the second input tensor.
-     * @param[in]  scale                Scaling factor.
-     * @param[in]  fixed_point_position Fixed-point position that expresses the number of bits for the fractional part of the number.
-     * @param[out] output_ptr           Pointer to the output tensor.
-     */
-    using MulFunctionQInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale, int fixed_point_position);
     /** Common signature for all the specialised multiplication functions with float scaling factor
      *
      * @param[in]  input1_ptr Pointer to the first input tensor.
@@ -115,7 +104,6 @@
 
     MulFunctionFloat *_func_float;
     MulFunctionInt   *_func_int;
-    MulFunctionQInt *_func_q_int;
 
 private:
     const ITensor *_input1;
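
A usage sketch of the surviving paths, assuming the NEPixelWiseMultiplication runtime wrapper; the shapes are arbitrary, and the 1/255 scale pairs with round-to-nearest as stated in the note above:

    #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void pixelwise_mul_sketch()
    {
        Tensor a, b, out;
        const TensorInfo info(TensorShape(32U, 32U), 1, DataType::F32);
        a.allocator()->init(info);
        b.allocator()->init(info);
        out.allocator()->init(info);

        NEPixelWiseMultiplication mul;
        // scale == 1/255 requires round-to-nearest (implemented as round half up).
        mul.configure(&a, &b, &out, 1.f / 255.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP);

        a.allocator()->allocate();
        b.allocator()->allocate();
        out.allocator()->allocate();
        mul.run();
    }
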
diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
index 4140ccf..6c4c1db 100644
--- a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h
@@ -52,18 +52,18 @@
     ~NEPoolingLayerKernel() = default;
     /** Set the input and output tensors.
      *
-     * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only
+     * @note F16 is supported for pool sizes 2 and 3 only
      *
-     * @param[in]  input     Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+     * @param[in]  input     Source tensor. Data types supported: QASYMM8/F16/F32.
      * @param[out] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
     void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref NEPoolingLayerKernel
      *
-     * @note QS8, QS16 and F16 are supported for pool sizes 2 and 3 only
+     * @note F16 is supported for pool sizes 2 and 3 only
      *
-     * @param[in] input     Source tensor. Data types supported: QS8/QASYMM8/QS16/F16/F32.
+     * @param[in] input     Source tensor. Data types supported: QASYMM8/F16/F32.
      * @param[in] output    Destination tensor. Data types supported: Same as @p input.
      * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      *
@@ -90,13 +90,6 @@
      */
     template <PoolingType pooling_type, bool exclude_padding = false>
     void pooling2_f16_nchw(const Window &window_input, const Window &window);
-    /** Function to perform 2x2 pooling for 8bit fixed point.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    template <PoolingType pooling_type>
-    void pooling2_q8_nchw(const Window &window_input, const Window &window);
     /** Function to perform 2x2 pooling for 8bit asymmetric fixed point.
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -104,13 +97,6 @@
      */
     template <PoolingType pooling_type, bool exclude_padding = false>
     void pooling2_qasymm8_nchw(const Window &window_input, const Window &window);
-    /** Function to perform 2x2 pooling for 16bit fixed point.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    template <PoolingType pooling_type>
-    void pooling2_q16_nchw(const Window &window_input, const Window &window);
     /** Function to perform 3x3 pooling.
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -125,13 +111,6 @@
      */
     template <PoolingType pooling_type, bool exclude_padding = false>
     void pooling3_f16_nchw(const Window &window_input, const Window &window);
-    /** Function to perform 3x3 pooling for 8bit fixed point.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    template <PoolingType pooling_type>
-    void pooling3_q8_nchw(const Window &window_input, const Window &window);
     /** Function to perform 3x3 pooling for 8bit quantized fixed point.
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -139,13 +118,6 @@
      */
     template <PoolingType pooling_type, bool exclude_padding = false>
     void pooling3_qasymm8_nchw(const Window &window_input, const Window &window);
-    /** Function to perform 3x3 pooling for 16bit fixed point.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    template <PoolingType pooling_type>
-    void pooling3_q16_nchw(const Window &window_input, const Window &window);
     /** Function to perform 7x7 pooling.
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -153,13 +125,6 @@
      */
     template <PoolingType pooling_type, bool exclude_padding = false>
     void pooling7_f32_nchw(const Window &window_input, const Window &window);
-    /** Function to perform MxN pooling for 8bit fixed point.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    template <PoolingType pooling_type>
-    void poolingMxN_q8_nchw(const Window &window_input, const Window &window);
     /** Function to perform MxN pooling for 8-bit quantized.
      *
      * @param[in] window_input Input region on which to execute the kernel.
@@ -174,13 +139,6 @@
      */
     template <PoolingType pooling_type, bool exclude_padding = false>
     void poolingMxN_qasymm8_nhwc(const Window &window_input, const Window &window);
-    /** Function to perform MxN pooling for 16bit fixed point.
-     *
-     * @param[in] window_input Input region on which to execute the kernel.
-     * @param[in] window       Output region on which to execute the kernel.
-     */
-    template <PoolingType pooling_type>
-    void poolingMxN_q16_nchw(const Window &window_input, const Window &window);
     /** Function to perform MxN pooling for 16-bit floating point values.
      *
      * @param[in] window_input Input region on which to execute the kernel.
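
A minimal sketch of a QASYMM8 pooling setup, which remains supported now that the QS8/QS16 specializations are gone; the shape, stride and quantization parameters are illustrative assumptions:

    #include "arm_compute/runtime/NEON/functions/NEPoolingLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void pooling_sketch()
    {
        Tensor src, dst;
        const QuantizationInfo qinfo(1.f / 255.f, 0);
        src.allocator()->init(TensorInfo(TensorShape(16U, 16U, 4U), 1, DataType::QASYMM8, qinfo));
        // 2x2 max pooling with stride 2 and no padding halves each spatial dimension.
        dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::QASYMM8, qinfo));

        NEPoolingLayer pool;
        pool.configure(&src, &dst, PoolingLayerInfo(PoolingType::MAX, 2, PadStrideInfo(2, 2, 0, 0)));

        src.allocator()->allocate();
        dst.allocator()->allocate();
        pool.run();
    }
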
diff --git a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
index 0a3fc44..08b4e11 100644
--- a/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEReshapeLayerKernel.h
@@ -40,7 +40,7 @@
     }
     /** Set the input and output of the kernel
      *
-     * @param[in]  input  Source tensor. Data type supported: U8/S8/QS8/U16/S16/QS16/QASYMM8/U32/S32/F16/F32
+     * @param[in]  input  Source tensor. Data type supported: U8/S8/U16/S16/QASYMM8/U32/S32/F16/F32
      * @param[out] output Destination tensor. Data type supported: Same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
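
A minimal sketch, assuming the NEReshapeLayer wrapper; the only constraint is that the output info describes the same number of elements as the input:

    #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void reshape_sketch()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(4U, 6U), 1, DataType::S16));
        // 4 * 6 == 24 elements, flattened to one dimension.
        dst.allocator()->init(TensorInfo(TensorShape(24U), 1, DataType::S16));

        NEReshapeLayer reshape;
        reshape.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        reshape.run();
    }
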
diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
index c30a4cd..25c3196 100644
--- a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h
@@ -43,13 +43,13 @@
     NELogits1DMaxKernel();
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
      * @param[out] output Destination tensor. Data types supported: same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DMaxKernel
      *
-     * @param[in] input  Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+     * @param[in] input  Source tensor. Data types supported: QASYMM8/F16/F32.
      * @param[in] output Destination tensor. Data types supported: same as @p input
      *
      * @return a status
@@ -90,7 +90,7 @@
     ~NELogits1DSoftmaxKernel() = default;
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/F16/F32.
      * @param[in]  max    Max values tensor. Same shape as input with dimension 0 set to 1.
      *                    Data types supported: same as @p input.
      * @param[out] output Destination tensor. Data types supported: same as @p input.
@@ -101,7 +101,7 @@
     void configure(const ITensor *input, const ITensor *max, ITensor *output, const float beta, ITensor *tmp);
     /** Static function to check if given info will lead to a valid configuration of @ref NELogits1DSoftmaxKernel
      *
-     * @param[in] input  Source tensor info. Data types supported: QASYMM8/QS8/QS16/F16/F32.
+     * @param[in] input  Source tensor info. Data types supported: QASYMM8/F16/F32.
      * @param[in] max    Max values tensor info. Same shape as input with dimension 0 set to 1.
      *                   Data types supported: same as @p input.
      * @param[in] output Destination tensor info. Data types supported: same as @p input.
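
A sketch of validating and running the softmax with one of the remaining data types, via the NESoftmaxLayer wrapper; the 1D shape is an arbitrary assumption and beta keeps its default of 1:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    void softmax_sketch()
    {
        Tensor src, dst;
        src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));

        // After this patch, validate() rejects QS8/QS16 tensor infos.
        ARM_COMPUTE_ERROR_THROW_ON(NESoftmaxLayer::validate(src.info(), dst.info()));

        NESoftmaxLayer softmax;
        softmax.configure(&src, &dst);

        src.allocator()->allocate();
        dst.allocator()->allocate();
        softmax.run();
    }
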
diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h
index dc7ef8f..76823ac 100644
--- a/arm_compute/core/NEON/kernels/NETransposeKernel.h
+++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h
@@ -57,13 +57,13 @@
 
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: Same as @p input
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NETransposeKernel
      *
-     * @param[in] input  Input tensor. Data types supported: U8/S8/QS8/QASYMM8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in] input  Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
      * @param[in] output Output tensor. Data type supported: Same as @p input
      *
      * @return a status
@@ -76,7 +76,7 @@
 private:
     /** Common signature for all the transpose functions
      *
-     * @param[in]  input  An input tensor. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[in]  input  An input tensor. Data types supported: U8/S8/U16/S16/F16/U32/S32/F32
      * @param[out] output The output tensor. Data type supported: same as @p input
      * @param[in]  window Region on which to execute the kernel.
      */
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
index 1a7525b..21f36f6 100644
--- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -75,7 +75,7 @@
     /** Set the input and output of the kernel.
      *
      * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/QASYMM8/QS16/F32
+     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F32
      * @param[in]  bias   The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
      *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
      *                    @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
@@ -85,7 +85,7 @@
     /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel
      *
      * @param[in] input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                   and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: QS8/QASYMM8/QS16/F16/F32
+     *                   and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: QASYMM8/F16/F32
      * @param[in] biases The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
      *                   dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
      *                   @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
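
A hedged validation sketch for the updated kernel; the reshaped-matrix layout [OFM, kernel_x * kernel_y * IFM] assumed below is an illustration of the expected output, not something this patch specifies:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
    #include "arm_compute/core/TensorInfo.h"

    using namespace arm_compute;

    void weights_reshape_sketch()
    {
        // Shared 4D weights [kernel_x = 3, kernel_y = 3, IFM = 8, OFM = 16], no bias appended.
        const TensorInfo weights(TensorShape(3U, 3U, 8U, 16U), 1, DataType::F32);
        // Assumed reshaped layout: [OFM, kernel_x * kernel_y * IFM] = [16, 72].
        const TensorInfo reshaped(TensorShape(16U, 72U), 1, DataType::F32);

        const Status status = NEWeightsReshapeKernel::validate(&weights, nullptr, &reshaped);
        ARM_COMPUTE_ERROR_THROW_ON(status);
    }
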
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
index fee2066..fd0c0f0 100644
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
+++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h
@@ -45,13 +45,11 @@
 }
 
 template <unsigned int stridex>
-float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position);
+float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
 
 template <>
-inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-
     const float32x4x3_t vtop =
     {
         {
@@ -108,9 +106,9 @@
 }
 
 template <>
-inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 {
-    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
@@ -118,9 +116,9 @@
 }
 
 template <>
-inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position)
+inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 {
-    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
     return out;
 }
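
A sketch of the updated call site, with a hypothetical caller that broadcasts a Sobel-like filter into vectors; for the stride-1 specialization each input row must expose at least 12 readable floats and 8 results are produced:

    #include "arm_compute/core/NEON/kernels/detail/NEDirectConvolution3x3.h"

    #include <arm_neon.h>

    // Hypothetical helper: convolve one stride-1 output strip now that the
    // fixed_point_position parameter has been dropped.
    void convolve_strip_f32(const float *top, const float *mid, const float *low, float *out)
    {
        // Each filter coefficient is broadcast across a whole vector, one x3 group per row.
        const float32x4x3_t m0 = { { vdupq_n_f32(1.f), vdupq_n_f32(2.f), vdupq_n_f32(1.f) } };
        const float32x4x3_t m1 = { { vdupq_n_f32(0.f), vdupq_n_f32(0.f), vdupq_n_f32(0.f) } };
        const float32x4x3_t m2 = { { vdupq_n_f32(-1.f), vdupq_n_f32(-2.f), vdupq_n_f32(-1.f) } };

        const float32x4x2_t res = arm_compute::detail::convolve_3x3<1>(top, mid, low, m0, m1, m2);
        vst1q_f32(out, res.val[0]);
        vst1q_f32(out + 4, res.val[1]);
    }
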
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index 908fa13..d56fd44 100644
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -55,29 +55,6 @@
     return r;
 }
 
-/** Loads a 3x3 matrix as a row  (qint8_t).
- *
- * @param[in] ptr            Pointer to a qint8 3x3 matrix.
- * @param[in] weights_offset (Optional) Weights quantization offset.
- *
- * @return The loaded matrix.
- */
-inline qint8x8x3_t load_matrix_row(const qint8_t *ptr, int weights_offset = 0)
-{
-    ARM_COMPUTE_UNUSED(weights_offset);
-    /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
-       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
-    const qint8x8x3_t r =
-    {
-        {
-            vld1_dup_qs8(ptr),
-            vld1_dup_qs8(1 + ptr),
-            vld1_dup_qs8(2 + ptr)
-        }
-    };
-    return r;
-}
-
 /** Loads a 3x3 matrix as a row  (uint8_t).
  *
  * @param[in] ptr            Pointer to a uint8_t 3x3 matrix.
@@ -104,27 +81,25 @@
 
 /** Perform a convolve3x3 on float32.
  *
- * @param[in] in_top               Pointer to the first row of the input.
- * @param[in] in_mid               Pointer to the second row of the input.
- * @param[in] in_low               Pointer to the third row of the input.
- * @param[in] m0                   First row of the filter.
- * @param[in] m1                   Second row of the filter.
- * @param[in] m2                   Third row of the filter.
- * @param[in] fixed_point_position (Optional) Fixed point position.
- * @param[in] input_offset         (Optional) Input quantization offset.
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] input_offset (Optional) Input quantization offset.
  *
  */
 template <unsigned int stridex>
 float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low,
                            const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                           int fixed_point_position, int input_offset = 0);
+                           int input_offset = 0);
 
 template <>
 inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low,
                                      const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                                     int fixed_point_position, int input_offset)
+                                     int input_offset)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
     ARM_COMPUTE_UNUSED(input_offset);
 
     const float32x4x3_t vtop =
@@ -185,11 +160,11 @@
 template <>
 inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low,
                                      const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                                     int fixed_point_position, int input_offset)
+                                     int input_offset)
 {
     ARM_COMPUTE_UNUSED(input_offset);
 
-    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
+    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
@@ -199,145 +174,35 @@
 template <>
 inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low,
                                      const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                                     int fixed_point_position, int input_offset)
+                                     int input_offset)
 {
     ARM_COMPUTE_UNUSED(input_offset);
 
-    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
+    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
     return out;
 }
 
-/** Perform a convolve3x3 on qint16.
- *
- * @param[in] in_top               Pointer to the first row of the input.
- * @param[in] in_mid               Pointer to the second row of the input.
- * @param[in] in_low               Pointer to the third row of the input.
- * @param[in] m0                   First row of the filter.
- * @param[in] m1                   Second row of the filter.
- * @param[in] m2                   Third row of the filter.
- * @param[in] fixed_point_position (Optional) Fixed point position.
- * @param[in] input_offset         (Optional) Input quantization offset.
- *
- */
-template <unsigned int stridex>
-qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low,
-                          const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2,
-                          int fixed_point_position, int input_offset = 0);
-
-template <>
-inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low,
-                                    const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2,
-                                    int fixed_point_position, int input_offset)
-{
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-    ARM_COMPUTE_UNUSED(input_offset);
-
-    const qint8x8x3_t vtop =
-    {
-        {
-            vld1_qs8(in_top),
-            vld1_qs8(in_top + 8),
-            vld1_qs8(in_top + 16)
-        }
-    };
-    const qint8x8x3_t vmid =
-    {
-        {
-            vld1_qs8(in_mid),
-            vld1_qs8(in_mid + 8),
-            vld1_qs8(in_mid + 16)
-        }
-    };
-    const qint8x8x3_t vlow =
-    {
-        {
-            vld1_qs8(in_low),
-            vld1_qs8(in_low + 8),
-            vld1_qs8(in_low + 16)
-        }
-    };
-    qint16x8x2_t out =
-    {
-        {
-            vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position),
-            vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position)
-        }
-    };
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position);
-    out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position);
-    out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position);
-    return out;
-}
-
-template <>
-inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low,
-                                    const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2,
-                                    int fixed_point_position, int input_offset)
-{
-    ARM_COMPUTE_UNUSED(input_offset);
-
-    qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7);
-    return out;
-}
-
-template <>
-inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low,
-                                    const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2,
-                                    int fixed_point_position, int input_offset)
-{
-    ARM_COMPUTE_UNUSED(input_offset);
-
-    qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), out.val[0], 1);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2);
-    out.val[0]       = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3);
-    return out;
-}
-
 /** Perform a convolve3x3 on uint8_t
  *
- * @param[in] in_top               Pointer to the first row of the input.
- * @param[in] in_mid               Pointer to the second row of the input.
- * @param[in] in_low               Pointer to the third row of the input.
- * @param[in] m0                   First row of the filter.
- * @param[in] m1                   Second row of the filter.
- * @param[in] m2                   Third row of the filter.
- * @param[in] fixed_point_position (Optional) Fixed point position.
- * @param[in] input_offset         (Optional) Input quantization offset.
+ * @param[in] in_top       Pointer to the first row of the input.
+ * @param[in] in_mid       Pointer to the second row of the input.
+ * @param[in] in_low       Pointer to the third row of the input.
+ * @param[in] m0           First row of the filter.
+ * @param[in] m1           Second row of the filter.
+ * @param[in] m2           Third row of the filter.
+ * @param[in] input_offset (Optional) Input quantization offset.
  *
  */
 template <unsigned int stridex>
 int32x4x2_t convolve_3x3(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
                          const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
-                         int fixed_point_position, int input_offset);
+                         int input_offset);
 
 template <>
 inline int32x4x2_t convolve_3x3<1>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
-                                   int fixed_point_position, int input_offset)
+                                   int input_offset)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-
     const int32x4_t v_input_offset = vdupq_n_s32(input_offset);
 
     const uint8x8x2_t vtop =
@@ -427,11 +292,9 @@
 template <>
 inline int32x4x2_t convolve_3x3<2>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
                                    const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
-                                   int fixed_point_position, int input_offset)
+                                   int input_offset)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-
-    int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
+    int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset);
     out.val[0]      = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 2), out.val[0], 1);
     out.val[0]      = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 0), out.val[0], 2);
     out.val[0]      = vsetq_lane_s32(vgetq_lane_s32(out.val[1], 2), out.val[0], 3);
@@ -441,10 +304,9 @@
 template <>
 inline int32x4x2_t convolve_3x3<3>(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
                                    const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
-                                   int fixed_point_position, int input_offset)
+                                   int input_offset)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-    int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position, input_offset);
+    int32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, input_offset);
     out.val[0]      = vsetq_lane_s32(vgetq_lane_s32(out.val[0], 3), out.val[0], 1);
     return out;
 }
@@ -477,34 +339,6 @@
     vst1_f32(buffer, vget_low_f32(values.val[0]));
 }
 
-/** Stores a qint16_t array into a memory location.
- *
- * @param[in] buffer Pointer to the memory location where the values will be stored.
- * @param[in] values Values that will be stored.
- *
- */
-template <unsigned int stridex>
-void store_results(qint16_t *buffer, const qint16x8x2_t &values);
-
-template <>
-inline void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values)
-{
-    vst1q_qs16(buffer, values.val[0]);
-    vst1q_qs16(buffer + 8, values.val[1]);
-}
-
-template <>
-inline void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values)
-{
-    vst1q_qs16(buffer, values.val[0]);
-}
-
-template <>
-inline void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values)
-{
-    vst1_qs16(buffer, vget_low_s16(values.val[0]));
-}
-
 /** Stores a uint32_t array into a memory location.
  *
  * @param[in] buffer Pointer to the memory location where the values will be stored.
@@ -557,25 +391,20 @@
 
 /** Perform a convolve3x3 on float16.
  *
- * @param[in] in_top               Pointer to the first row of the input.
- * @param[in] in_mid               Pointer to the second row of the input.
- * @param[in] in_low               Pointer to the third row of the input.
- * @param[in] m0                   First row of the filter.
- * @param[in] m1                   Second row of the filter.
- * @param[in] m2                   Third row of the filter.
- * @param[in] fixed_point_position (Optional) Fixed point position.
+ * @param[in] in_top Pointer to the first row of the input.
+ * @param[in] in_mid Pointer to the second row of the input.
+ * @param[in] in_low Pointer to the third row of the input.
+ * @param[in] m0     First row of the filter.
+ * @param[in] m1     Second row of the filter.
+ * @param[in] m2     Third row of the filter.
  *
  */
 template <unsigned int stridex>
-float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                           int fixed_point_position);
+float16x8x2_t convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2);
 
 template <>
-inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                                     int fixed_point_position)
+inline float16x8x2_t convolve_3x3<1>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2)
 {
-    ARM_COMPUTE_UNUSED(fixed_point_position);
-
     const float16x8x3_t vtop =
     {
         {
@@ -627,10 +456,9 @@
 }
 
 template <>
-inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                                     int fixed_point_position)
+inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2)
 {
-    float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+    float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
     out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
     out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2);
     out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3);
@@ -638,10 +466,9 @@
 }
 
 template <>
-inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                                     int fixed_point_position)
+inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2)
 {
-    float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position);
+    float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
     out.val[0]        = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
     return out;
 }
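
A sketch of the updated uint8 path, with a hypothetical wrapper around load_matrix_row() and convolve_3x3(); the row pointers and offsets are placeholders, and input_offset is now the only trailing parameter:

    #include "arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"

    #include <arm_neon.h>

    // Hypothetical helper: one stride-1 strip of a quantized 3x3 convolution.
    void convolve_strip_u8(const uint8_t *top, const uint8_t *mid, const uint8_t *low,
                           const uint8_t *w0, const uint8_t *w1, const uint8_t *w2,
                           int input_offset, int weights_offset, int32_t *out)
    {
        using namespace arm_compute::detail;

        // Each weights row is broadcast into three offset-corrected vectors.
        const int32x4x3_t m0 = load_matrix_row(w0, weights_offset);
        const int32x4x3_t m1 = load_matrix_row(w1, weights_offset);
        const int32x4x3_t m2 = load_matrix_row(w2, weights_offset);

        const int32x4x2_t res = convolve_3x3<1>(top, mid, low, m0, m1, m2, input_offset);
        vst1q_s32(out, res.val[0]);
        vst1q_s32(out + 4, res.val[1]);
    }
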