COMPMID-417: Port NEDirectConvolution 1x1 to QS16.

Change-Id: Icae6a5091e836d0aca24375f43cca9e6d3a2090f
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/81662
Reviewed-by: Moritz Pflanzer <moritz.pflanzer@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 08f6808..3de2261 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -48,6 +48,7 @@
 using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
 using qint32x2_t   = int32x2_t;   /**< 32 bit fixed point vector with 2 elements */
 using qint32x4_t   = int32x4_t;   /**< 32 bit fixed point vector with 4 elements */
+using qint32x4x2_t = int32x4x2_t; /**< 32 bit fixed point vector with 8 elements */
 
 /** Get the lower half of a 16 elements vector
  *
@@ -673,6 +674,16 @@
  */
 qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
 
+/** 16 bit fixed point vector long multiply (4 elements)
+ *
+ * @param[in] a                    First 16 bit fixed point input vector
+ * @param[in] b                    Second 16 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the vector long multiplication as a 32 bit fixed point vector.
+ */
+qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position);
+
 /** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and add the result to @p a (a + b * c).
  *
  * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
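Note: a minimal usage sketch of the new vmull_qs16 declared above (not part of this patch; the lane values and the fixed point position are illustrative assumptions):

```cpp
#include "arm_compute/core/NEON/NEFixedPoint.h"

using namespace arm_compute;

qint32x4_t multiply_q13_2()
{
    // With fixed_point_position = 2 (Q13.2), raw 6 encodes 1.5 and raw 10 encodes 2.5
    const qint16x4_t a = vdup_n_s16(6);
    const qint16x4_t b = vdup_n_s16(10);
    // Each 32 bit lane holds round((6 * 10) * 2^-2) = 15, i.e. 3.75 in Q29.2:
    // the widening product keeps the same fixed point position.
    return vmull_qs16(a, b, 2);
}
```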
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index c879d3e..dd1066d 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -624,6 +624,20 @@
     return vqrshlq_s16(res, fixed_point_position_s16);
 }
 
+inline qint32x4_t vmull_qs16(qint16x4_t a, qint16x4_t b, int fixed_point_position)
+{
+    const int32x4_t fixed_point_position_s32 = vdupq_n_s32(-fixed_point_position);
+
+    // Initialize the temporary result with a constant used to round up the result
+    qint32x4_t tmp = vdupq_n_s32(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long: add the product to the rounding constant
+    tmp = vmlal_s16(tmp, a, b);
+
+    // Saturating shift right by fixed_point_position
+    return vqshlq_s32(tmp, fixed_point_position_s32);
+}
+
 inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
 {
     const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
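For reference, a scalar sketch of what each lane of vmull_qs16 above computes (an illustration, not library code): widen the product, add the rounding constant, then shift right by the fixed point position:

```cpp
#include <cstdint>

int32_t scalar_mull_qs16(int16_t a, int16_t b, int fixed_point_position)
{
    // Widening product of two Q(fpp) values yields a value scaled by 2^(-2 * fpp)
    int64_t acc = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    // Round half up before rescaling, matching the vdupq_n_s32 constant above
    acc += INT64_C(1) << (fixed_point_position - 1);
    // Arithmetic shift right restores the Q(fpp) scale
    acc >>= fixed_point_position;
    // Clamp to 32 bits, mirroring the saturating vqshlq_s32 (a 16x16 -> 32 bit
    // product cannot actually overflow here; kept for symmetry)
    if(acc > INT32_MAX) { return INT32_MAX; }
    if(acc < INT32_MIN) { return INT32_MIN; }
    return static_cast<int32_t>(acc);
}
```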
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
index f098e18..87788ba 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
@@ -51,7 +51,7 @@
     /** Set the accumulate buffer and the biases of the kernel.
      *
      * @param[in, out] input  Input to add the bias to. If @p output is not specified then accumulation is done in-place.
-     *                        Data type supported: QS8/F32
+     *                        Data type supported: QS8/QS16/F16/F32
      * @param[in]      bias   The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input
      * @param[out]     output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr)
      *                         Data type supported: Same as @p input
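For context, a hedged sketch of configuring this kernel in-place for QS16 data (shapes and the fixed point position are illustrative assumptions; the output argument is omitted to use the in-place default documented above):

```cpp
#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void configure_bias_accumulate()
{
    Tensor acc{}, bias{};
    // QS16 accumulator [W, H, OFM] and 1D bias [OFM], fixed point position 4
    acc.allocator()->init(TensorInfo(TensorShape(16U, 16U, 8U), 1, DataType::QS16, 4));
    bias.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::QS16, 4));

    NEDirectConvolutionLayerBiasAccumulateKernel bias_kernel{};
    bias_kernel.configure(&acc, &bias); // no output tensor: accumulate in-place
}
```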
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index 5612e1a..e0dac98 100644
--- a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -49,7 +49,7 @@
     /** Set the input, weights, and output tensors.
      *
      * @param[in]  input     The input tensor to convolve. 3 lower dimensions represent a single input [width, height, IFM],
-     *                       while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QS8/F32.
+     *                       while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QS8/QS16/F16/F32.
      * @param[in]  weights   Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
      *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
     *                       Data type supported: Same as @p input.
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 765cae4..2d3b3d6 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -68,6 +68,7 @@
     QS16,
     U32,
     S32,
+    QS32,
     U64,
     S64,
     F16,
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 4ecd464..af788be 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -100,6 +100,7 @@
         case DataType::F32:
         case DataType::U32:
         case DataType::S32:
+        case DataType::QS32:
             return 4;
         case DataType::F64:
         case DataType::U64:
@@ -173,6 +174,7 @@
         case DataType::U32:
         case DataType::S32:
         case DataType::F32:
+        case DataType::QS32:
             return 4;
         default:
             ARM_COMPUTE_ERROR("Undefined element size for given data type");
@@ -645,6 +647,7 @@
     {
         case DataType::QS8:
         case DataType::QS16:
+        case DataType::QS32:
             return true;
         default:
             return false;
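Assuming the three hunks above extend data_size_in_bytes, element_size_from_data_type and is_data_type_fixed_point respectively, the new QS32 entries can be sanity-checked as follows (illustrative, not part of this patch):

```cpp
#include <cassert>

#include "arm_compute/core/Utils.h"

using namespace arm_compute;

void qs32_trait_checks()
{
    assert(data_size_in_bytes(DataType::QS32) == 4);          // 4 bytes per element
    assert(element_size_from_data_type(DataType::QS32) == 4); // same, per the element size switch
    assert(is_data_type_fixed_point(DataType::QS32));         // QS32 counts as fixed point
}
```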
diff --git a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
index a66cab3..872fae3 100644
--- a/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h
@@ -48,7 +48,7 @@
     NEDirectConvolutionLayer();
     /** Set the input, weights, biases and output tensors.
       *
-      * @param[in, out] input     Input tensor. Data types supported: QS8/F16/F32.
+      * @param[in, out] input     Input tensor. Data types supported: QS8/QS16/F16/F32.
       * @param[in]      weights   Set of kernels to convolve the input volume.
       *                           The 3rd dimension must be the same as the input's volume 3rd dimension.
       *                           Data type supported: Same as @p input.
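Finally, a hedged end-to-end sketch of running a QS16 1x1 direct convolution through this function (shapes, strides, and the fixed point position of 5, i.e. Q10.5, are illustrative assumptions):

```cpp
#include "arm_compute/runtime/NEON/functions/NEDirectConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

void run_qs16_direct_convolution()
{
    Tensor src{}, weights{}, bias{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 4U), 1, DataType::QS16, 5));
    weights.allocator()->init(TensorInfo(TensorShape(1U, 1U, 4U, 16U), 1, DataType::QS16, 5));
    bias.allocator()->init(TensorInfo(TensorShape(16U), 1, DataType::QS16, 5));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::QS16, 5));

    NEDirectConvolutionLayer conv{};
    conv.configure(&src, &weights, &bias, &dst, PadStrideInfo(1, 1, 0, 0)); // 1x1, stride 1, no padding

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();
    // ... fill src, weights and bias with Q10.5 data ...

    conv.run();
}
```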