COMPMID-456: Add support for QS16 NEON Normalization Layer.

Change-Id: I1e542808cfd7774c67cc4e9a58e42449e4fb29aa
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81735
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 50463b5..08f6808 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -235,13 +235,22 @@
 
 /** Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
  *
- * @param[in] a                    8 bit fixed point to duplicate
+ * @param[in] a                    floating point value to convert and duplicate
  * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
  *
  * @return The result of the vector duplication
  */
 qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position);
 
+/** Duplicate a float and convert it to 16 bit fixed point vector (8 elements)
+ *
+ * @param[in] a                    floating point value to convert and duplicate
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the vector duplication
+ */
+qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position);
+
 /** 16 bit fixed point vector duplicate (8 elements)
  *
  * @param[in] a 16 bit fixed point to duplicate
@@ -1178,7 +1187,19 @@
  *
  * @return The result of the 8bit power.
  */
-qint8x8_t vqpowq_qs8(qint8x8_t a, qint8x16_t b, int fixed_point_position);
+qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** Calculate saturating n power for fixed point 16bit (8 elements).
+ *
+ * pow(a,b) = e^(b*log(a))
+ *
+ * @param[in] a                    16bit fixed point input vector
+ * @param[in] b                    16bit fixed point power vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 16bit power.
+ */
+qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position);
 
 /** Compute lane-by-lane maximum between elements of a float vector with 4x2 elements
  *
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 7cebfad..c879d3e 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -250,6 +250,18 @@
     return vqcvtq_qs8_f32(res, fixed_point_position);
 }
 
+inline qint16x8_t vdupq_n_qs16_f32(float a, int fixed_point_position)
+{
+    float32x4x2_t res =
+    {
+        {
+            vdupq_n_f32(a),
+            vdupq_n_f32(a),
+        }
+    };
+    return vqcvtq_qs16_f32(res, fixed_point_position);
+}
+
 inline qint16x8_t vdupq_n_qs16(qint16_t a)
 {
     return vdupq_n_s16(a);
@@ -1941,6 +1953,11 @@
     return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position);
 }
 
+inline qint16x8_t vqpowq_qs16(qint16x8_t a, qint16x8_t b, int fixed_point_position)
+{
+    return vqexpq_qs16(vqmulq_qs16(b, vlogq_qs16(a, fixed_point_position), fixed_point_position), fixed_point_position);
+}
+
 inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
 {
     float32x4x2_t res =
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
index b1bc594..e24e481 100644
--- a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -50,7 +50,7 @@
     /** Set the input and output tensors.
      *
      * @param[in]  input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
-     *                           and an optional 4th dimension for batch of inputs. Data types supported: QS8/F32.
+     *                           and an optional 4th dimension for batch of inputs. Data types supported: QS8/QS16/F16/F32.
      * @param[in]  input_squared Source with each element has been squared. 3 lower dims represent a single input with dimensions [width, height, IFM],
      *                           Data type supported: same as @p input
      * @param[out] output        Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
@@ -86,7 +86,7 @@
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    template <unsigned int dim, bool do_2D_norm>
+    template <DataType dt, unsigned int dim, bool do_2D_norm>
     void normalize_fixed_point(const Window &window);
     /** Common signature for all the specialised normalization functions
      *
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 76ace91..085d412 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -46,7 +46,7 @@
 
 void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
 {
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position());
@@ -118,14 +118,35 @@
             switch(norm_info.type())
             {
                 case NormType::IN_MAP_1D:
-                    _func = &NENormalizationLayerKernel::normalize_fixed_point<0, false>;
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, false>;
                     break;
                 case NormType::IN_MAP_2D:
                     // Normalize over X and Y
-                    _func = &NENormalizationLayerKernel::normalize_fixed_point<0, true>;
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 0, true>;
                     break;
                 case NormType::CROSS_MAP:
-                    _func = &NENormalizationLayerKernel::normalize_fixed_point<2, false>;
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS8, 2, false>;
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+                    break;
+            }
+            break;
+        }
+        case DataType::QS16:
+        {
+            num_elems_processed_per_iteration = 8;
+            switch(norm_info.type())
+            {
+                case NormType::IN_MAP_1D:
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, false>;
+                    break;
+                case NormType::IN_MAP_2D:
+                    // Normalize over X and Y
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 0, true>;
+                    break;
+                case NormType::CROSS_MAP:
+                    _func = &NENormalizationLayerKernel::normalize_fixed_point<DataType::QS16, 2, false>;
                     break;
                 default:
                     ARM_COMPUTE_ERROR("Not supported");
@@ -250,7 +271,7 @@
     }
 }
 
-template <unsigned int dim, bool do_2D_norm>
+template <DataType dt, unsigned int dim, bool do_2D_norm>
 void NENormalizationLayerKernel::normalize_fixed_point(const Window &window)
 {
     Iterator input(_input, window);
@@ -269,40 +290,84 @@
 
     const int fixed_point_position = _input->info()->fixed_point_position();
 
-    const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
-    const qint8x16_t beta_vec  = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
-    const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
-
-    execute_window_loop(window, [&](const Coordinates & id)
+    if(dt == DataType::QS8)
     {
-        // Get range to normalize
-        const int current_row   = do_2D_norm ? id[dim_y] : 0;
-        const int current_slice = id[dim];
-        const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
-        const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-        const int first_slice   = std::max(current_slice - radius, min_left);
-        const int last_slice    = std::min(current_slice + radius, max_right);
+        const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position);
+        const qint8x16_t beta_vec  = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position);
+        const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position);
 
-        // Accumulate 2D In-Map values
-        qint8x16_t accu = vdupq_n_qs8(0);
-        for(int j = first_row; j <= last_row; ++j)
+        execute_window_loop(window, [&](const Coordinates & id)
         {
-            // Compute row displacement
-            const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
-            const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
-            for(int i = first_slice; i <= last_slice; ++i)
-            {
-                accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
-            }
-        }
+            // Get range to normalize
+            const int current_row   = do_2D_norm ? id[dim_y] : 0;
+            const int current_slice = id[dim];
+            const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+            const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+            const int first_slice   = std::max(current_slice - radius, min_left);
+            const int last_slice    = std::min(current_slice + radius, max_right);
 
-        // Normalize
-        const qint8x16_t accu_scale       = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
-        const qint8x16_t normalized       = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
-        const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
-        vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
-    },
-    input, input_squared, output);
+            // Accumulate 2D In-Map values
+            qint8x16_t accu = vdupq_n_qs8(0);
+            for(int j = first_row; j <= last_row; ++j)
+            {
+                // Compute row displacement
+                const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+                const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+                for(int i = first_slice; i <= last_slice; ++i)
+                {
+                    accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast<const qint8_t *>(input_squared_ptr + i * input_squared_stride)));
+                }
+            }
+
+            // Normalize
+            const qint8x16_t accu_scale       = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position);
+            const qint8x16_t normalized       = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position);
+            const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast<const qint8_t *>(input.ptr())), normalized, fixed_point_position);
+            vst1q_qs8(reinterpret_cast<qint8_t *>(output.ptr()), normalized_pixel);
+        },
+        input, input_squared, output);
+    }
+    else if(dt == DataType::QS16)
+    {
+        const qint16x8_t coeff_vec = vdupq_n_qs16_f32(_norm_info.scale_coeff(), fixed_point_position);
+        const qint16x8_t beta_vec  = vdupq_n_qs16_f32(_norm_info.beta(), fixed_point_position);
+        const qint16x8_t kappa_vec = vdupq_n_qs16_f32(_norm_info.kappa(), fixed_point_position);
+
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            // Get range to normalize
+            const int current_row   = do_2D_norm ? id[dim_y] : 0;
+            const int current_slice = id[dim];
+            const int first_row     = do_2D_norm ? std::max(current_row - radius, min_top) : 0;
+            const int last_row      = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
+            const int first_slice   = std::max(current_slice - radius, min_left);
+            const int last_slice    = std::min(current_slice + radius, max_right);
+
+            // Accumulate 2D In-Map values
+            qint16x8_t accu = vdupq_n_qs16(0);
+            for(int j = first_row; j <= last_row; ++j)
+            {
+                // Compute row displacement
+                const int            row               = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y];
+                const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride);
+                for(int i = first_slice; i <= last_slice; ++i)
+                {
+                    accu = vqaddq_qs16(accu, vld1q_qs16(reinterpret_cast<const qint16_t *>(input_squared_ptr + i * input_squared_stride)));
+                }
+            }
+
+            // Normalize
+            const qint16x8_t accu_scale       = vqmlaq_qs16(kappa_vec, coeff_vec, accu, fixed_point_position);
+            const qint16x8_t normalized       = vqpowq_qs16(accu_scale, beta_vec, fixed_point_position);
+            const qint16x8_t normalized_pixel = vdivq_qs16(vld1q_qs16(reinterpret_cast<const qint16_t *>(input.ptr())), normalized, fixed_point_position);
+            vst1q_qs16(reinterpret_cast<qint16_t *>(output.ptr()), normalized_pixel);
+        },
+        input, input_squared, output);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("Not supported");
+    }
 }
 
 void NENormalizationLayerKernel::run(const Window &window)
diff --git a/tests/benchmark_new/NEON/NormalizationLayer.cpp b/tests/benchmark_new/NEON/NormalizationLayer.cpp
index 71dd9c3..de7183d 100644
--- a/tests/benchmark_new/NEON/NormalizationLayer.cpp
+++ b/tests/benchmark_new/NEON/NormalizationLayer.cpp
@@ -41,9 +41,9 @@
 namespace
 {
 #ifdef ARM_COMPUTE_ENABLE_FP16
-const auto normalization_layer_data_types = framework::dataset::make("DataType", { DataType::F16, DataType::F32, DataType::QS8 });
+const auto normalization_layer_data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::QS16, DataType::F16, DataType::F32 });
 #else  /* ARM_COMPUTE_ENABLE_FP16 */
-const auto normalization_layer_data_types = framework::dataset::make("DataType", { DataType::F32, DataType::QS8 });
+const auto normalization_layer_data_types = framework::dataset::make("DataType", { DataType::QS8, DataType::QS16, DataType::F32 });
 #endif /* ARM_COMPUTE_ENABLE_FP16 */
 } // namespace
 using NENormalizationLayerFixture = NormalizationLayerFixture<Tensor, NENormalizationLayer, Accessor>;
diff --git a/tests/validation_new/CPP/NormalizationLayer.cpp b/tests/validation_new/CPP/NormalizationLayer.cpp
index 72f4900..a8818d8 100644
--- a/tests/validation_new/CPP/NormalizationLayer.cpp
+++ b/tests/validation_new/CPP/NormalizationLayer.cpp
@@ -268,6 +268,7 @@
 template SimpleTensor<float> normalization_layer(const SimpleTensor<float> &src, NormalizationLayerInfo info);
 template SimpleTensor<half_float::half> normalization_layer(const SimpleTensor<half_float::half> &src, NormalizationLayerInfo info);
 template SimpleTensor<qint8_t> normalization_layer(const SimpleTensor<qint8_t> &src, NormalizationLayerInfo info);
+template SimpleTensor<qint16_t> normalization_layer(const SimpleTensor<qint16_t> &src, NormalizationLayerInfo info);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation_new/NEON/NormalizationLayer.cpp b/tests/validation_new/NEON/NormalizationLayer.cpp
index f364975..dfe7931 100644
--- a/tests/validation_new/NEON/NormalizationLayer.cpp
+++ b/tests/validation_new/NEON/NormalizationLayer.cpp
@@ -50,7 +50,8 @@
 #endif /* ARM_COMPUTE_ENABLE_FP16 */
 constexpr float tolerance_f32 = 0.00001f;
 /** Tolerance for fixed point operations */
-constexpr int8_t tolerance_qs8 = 2;
+constexpr int8_t  tolerance_qs8  = 2;
+constexpr int16_t tolerance_qs16 = 3;
 
 /** Input data set. */
 const auto NormalizationDataset = combine(combine(combine(datasets::SmallShapes(), datasets::NormalizationTypes()), framework::dataset::make("NormalizationSize", 3, 9, 2)),
@@ -116,6 +117,24 @@
     validate(Accessor(_target), _reference, tolerance_qs8);
 }
 TEST_SUITE_END()
+
+TEST_SUITE(QS16)
+// Testing for fixed point position [1,14) as reciprocal limits the maximum fixed point position to 14
+FIXTURE_DATA_TEST_CASE(RunSmall, NENormalizationLayerFixedPointFixture<int16_t>, framework::DatasetMode::PRECOMMIT, combine(combine(NormalizationDataset, framework::dataset::make("DataType",
+                       DataType::QS16)),
+                       framework::dataset::make("FractionalBits", 1, 14)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qs16);
+}
+FIXTURE_DATA_TEST_CASE(RunLarge, NENormalizationLayerFixedPointFixture<int16_t>, framework::DatasetMode::NIGHTLY, combine(combine(NormalizationDataset, framework::dataset::make("DataType",
+                       DataType::QS16)),
+                       framework::dataset::make("FractionalBits", 1, 14)))
+{
+    // Validate output
+    validate(Accessor(_target), _reference, tolerance_qs16);
+}
+TEST_SUITE_END()
 TEST_SUITE_END()
 
 TEST_SUITE_END()