COMPMID-456: Add support for QS16 NEON Normalization Layer.

Change-Id: I1e542808cfd7774c67cc4e9a58e42449e4fb29aa
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81735
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 50463b5..08f6808 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -235,13 +235,22 @@
 
 /** Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
  *
- * @param[in] a                    8 bit fixed point to duplicate
+ * @param[in] a                    floating point value to convert and duplicate
  * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
  *
  * @return The result of the vector duplication
  */
 qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position);
 
+/** Duplicate a float and convert it to 16 bit fixed point vector (8 elements)
+ *
+ * @param[in] a                    floating point value to convert and duplicate
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the vector duplication
+ */
+qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position);
+
 /** 16 bit fixed point vector duplicate (8 elements)
  *
  * @param[in] a 16 bit fixed point to duplicate
@@ -1178,7 +1187,19 @@
  *
  * @return The result of the 8bit power.
  */
-qint8x8_t vqpowq_qs8(qint8x8_t a, qint8x16_t b, int fixed_point_position);
+qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** Calculate the saturating n-th power for fixed point 16bit (8 elements).
+ *
+ * pow(a, b) = e^(b * log(a)), valid for a > 0
+ *
+ * @param[in] a                    16bit fixed point input vector
+ * @param[in] b                    16bit fixed point power vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16bit power.
+ */
+qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
 
 /** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements
  *
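
A minimal usage sketch of the two declarations added above (the Q3.12 format and the constants are illustrative choices, not taken from this patch): broadcast float constants into QS16 lanes, then raise them to a per-lane power.

    // Hypothetical example, assuming NEFixedPoint.h is included.
    const int  fixed_point_position = 12;  // Q3.12: 12 fractional bits, range [-8, 8)
    qint16x8_t base = vdupq_n_qs16_f32(1.5f, fixed_point_position);
    qint16x8_t expo = vdupq_n_qs16_f32(0.75f, fixed_point_position);
    // pow(a, b) = e^(b * log(a)); every lane holds ~1.5^0.75, i.e. about 1.355.
    qint16x8_t out  = vqpowq_qs16(base, expo, fixed_point_position);
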
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 7cebfad..c879d3e 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -250,6 +250,18 @@
     return vqcvtq_qs8_f32(res, fixed_point_position);
 }
 
+inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
+{
+    float32x4x2_t res =
+    {
+        {
+            vdupq_n_f32(a),
+            vdupq_n_f32(a),
+        }
+    };
+    return vqcvtq_qs16_f32(res, fixed_point_position);
+}
+
 inline qint16x8_t vdupq_n_qs16(qint16_t a)
 {
     return vdupq_n_s16(a);
@@ -1941,6 +1953,11 @@
     return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
 }
 
+inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
+{
+    return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
+}
+
 inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
 {
     float32x4x2_t res =
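
To sanity-check the two inline definitions above lane by lane, a hedged scalar reference can help (assumption: round-to-nearest with saturation, mirroring the documented QS8 behaviour; these helpers are not part of the library).

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar model of the per-lane float -> QS16 conversion performed by
    // vdupq_n_qs16_f32(): scale by 2^fixed_point_position, round, saturate.
    int16_t qs16_from_f32(float a, int fixed_point_position)
    {
        const float scaled = std::round(a * static_cast<float>(1 << fixed_point_position));
        return static_cast<int16_t>(std::min(32767.0f, std::max(-32768.0f, scaled)));
    }

    // Float reference for the identity vqpowq_qs16() relies on (requires a > 0).
    float pow_via_exp_log(float a, float b)
    {
        return std::exp(b * std::log(a));
    }
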
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index b1bc594..e24e481 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -50,7 +50,7 @@
     /** Set the input and output tensors.
      *
      * @param[in]  input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                           and an optional 4th dimension for batch of inputs. Data types supported: QS8/F32.
+     *                           and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/FP16/F32.
      * @param[in]  input_squared Source tensor with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
      *                           Data type supported: same as @p input
      * @param[out] output        Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
@@ -86,7 +86,7 @@
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    template <unsigned int dim, bool do_2D_norm>
+    template <DataType dt, unsigned int dim, bool do_2D_norm>
     void normalize_fixed_point(const Window &window);
     /** Common signature for all the specialised normalization functions
      *
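
The kernel change above is where the new power intrinsic pays off: local response normalization raises the scaled sum of squares to the power beta, which the QS16 path can now do with vqpowq_qs16. A hedged scalar sketch of the standard LRN formula (kappa/alpha/beta/norm_size follow the usual definition; the helper is illustrative, not this kernel's code):

    #include <cmath>

    // out = in / (kappa + (alpha / norm_size) * sum_squares) ^ beta
    float normalize_element(float in, float sum_squares, float kappa,
                            float alpha, float beta, int norm_size)
    {
        const float coeff = alpha / static_cast<float>(norm_size);
        return in / std::pow(kappa + coeff * sum_squares, beta);
    }
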