COMPMID-444: Add support for QS8/QS16 NEON Arithmetic Add/Sub/Mul.

Change-Id: Ia482498688ca1884272b5062e3415e736e03d36f
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/80448
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h
index 09579f9..50463b5 100644
--- a/arm_compute/core/NEON/NEFixedPoint.h
+++ b/arm_compute/core/NEON/NEFixedPoint.h
@@ -145,6 +145,14 @@
  */
 qint16x8_t vld1q_dup_qs16(const qint16_t *addr);
 
+/** Load two 16 bit fixed point vectors from memory (8x2 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vectors to load
+ *
+ * @return 16 bit fixed point vectors (8x2 elements)
+ */
+qint16x8x2_t vld2q_qs16(const qint16_t *addr);
+
 /** Store a single 8 bit fixed point vector to memory (8 elements)
  *
  * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
index 4e862ba..05e4815 100644
--- a/arm_compute/core/NEON/NEFixedPoint.inl
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -181,6 +181,11 @@
     return vld1q_dup_s16(addr);
 }
 
+inline qint16x8x2_t vld2q_qs16(const qint16_t *addr)
+{
+    return vld2q_s16(addr);
+}
+
 inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
 {
     vst1_s8(addr, b);
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
index 9bfdde1..7ad5893 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -50,9 +50,9 @@
 
     /** Initialise the kernel's input, output and border mode.
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F16 (only if @p input1 is F16)/F32 (only if @p input1 is F32).
-     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if both inputs are F16), F32 (only if both inputs are F32).
+     * @param[in]  input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
      * @param[in]  policy Overflow policy.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
@@ -63,9 +63,9 @@
 private:
     /** Common signature for all the specialised add functions
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F16 (only if @p input1 is F16)/F32 (only if @p input1 is F32).
-     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F16 (only if both inputs are F16), F32 (only if both inputs are F32).
+     * @param[in]  input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
      * @param[in]  window Region on which to execute the kernel.
      */
     using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
index 0eb9c23..6f88d27 100644
--- a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -50,9 +50,9 @@
 
     /** Initialise the kernel's input, output and border mode.
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32
-     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32  (only if @p input1 is F32).
-     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16/F32 (only if both inputs are F32).
+     * @param[in]  input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F32 (only if both inputs are F32).
      * @param[in]  policy Overflow policy.
      */
     void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
@@ -63,9 +63,9 @@
 private:
     /** Common signature for all the specialised sub functions
      *
-     * @param[in]  input1 An input tensor. Data types supported: U8, S16, F32.
-     * @param[in]  input2 An input tensor. Data types supported: U8, S16, F32 (only if @p input1 is F32).
-     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), S16, F32 (only if both inputs are F32)
+     * @param[in]  input1 An input tensor. Data types supported: U8/QS8/QS16/S16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F32 (only if both inputs are F32).
      * @param[in]  window Region on which to execute the kernel.
      */
     using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
index 433a20e..bf96c90 100644
--- a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -51,10 +51,11 @@
      *
      * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported.
      *       For all other scale values only round to zero (implemented as round towards minus infinity) is supported.
+     *       For QS8/QS16 scale = 1 is the only supported value.
      *
-     * @param[in]  input1          An input tensor. Data types supported: U8/QS8/S16/F16/F32.
-     * @param[in]  input2          An input tensor. Data types supported: U8/QS8/S16/F16/F32.
-     * @param[out] output          The output tensor. Data types supported: U8 (Only if both inputs are U8) /S16/F16/F32.
+     * @param[in]  input1          An input tensor. Data types supported: U8/QS8/QS16/S16/F16/F32
+     * @param[in]  input2          An input tensor. Data types supported: U8, QS8 (only if @p input1 is QS8), QS16 (only if @p input1 is QS16), S16/F16 (only if @p input1 is F16), F32 (only if @p input1 is F32).
+     * @param[out] output          The output tensor. Data types supported: U8 (Only if both inputs are U8), QS8 (only if both inputs are QS8), QS16 (only if both inputs are QS16), S16/F16 (only if @p input1 is F16), F32 (only if both inputs are F32).
      * @param[in]  scale           Scale to apply after multiplication.
      *                             Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
      * @param[in]  overflow_policy Overflow policy.