COMPMID-2308: NEConvolutionLayer: support QUANT8_SYMM_PER_CHANNEL filters

Change-Id: Ic1bf5f0d21ccd525f84213a360f7e199d7f50577
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2177
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/NEON/NEAsymm.h b/arm_compute/core/NEON/NEAsymm.h
index 56d4c09..a3bd7e2 100644
--- a/arm_compute/core/NEON/NEAsymm.h
+++ b/arm_compute/core/NEON/NEAsymm.h
@@ -115,6 +115,66 @@
     return out_u8;
 }
 
+/** Performs final quantization step on 16 elements for symmetric quantization
+ *
+ * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied
+ *
+ * @param[in, out] in_s32                        Input to be quantized.
+ * @param[in]      result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in]      result_shift                  Result shift parameter
+ * @param[in]      result_offset_after_shift_s32 Result offset parameter
+ * @param[in]      min_s8                        Relu lower bound
+ * @param[in]      max_s8                        Relu upper bound
+ *
+ * @return Quantized values
+ */
+template <bool   is_bounded_relu>
+inline int8x16_t finalize_quantization_symm(int32x4x4_t       &in_s32,
+                                            const int32x4x4_t &result_fixedpoint_multiplier,
+                                            const int32x4x4_t &result_shift,
+                                            const int32x4_t   &result_offset_after_shift_s32,
+                                            const int8x16_t   &min_s8,
+                                            const int8x16_t   &max_s8)
+{
+    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+    in_s32.val[0] = vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]);
+    in_s32.val[1] = vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]);
+    in_s32.val[2] = vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]);
+    in_s32.val[3] = vqrdmulhq_s32(in_s32.val[3], result_fixedpoint_multiplier.val[3]);
+
+    // Round to the nearest division by a power-of-two using result_shift
+    in_s32.val[0] = rounding_divide_by_pow2(in_s32.val[0], result_shift.val[0]);
+    in_s32.val[1] = rounding_divide_by_pow2(in_s32.val[1], result_shift.val[1]);
+    in_s32.val[2] = rounding_divide_by_pow2(in_s32.val[2], result_shift.val[2]);
+    in_s32.val[3] = rounding_divide_by_pow2(in_s32.val[3], result_shift.val[3]);
+
+    // Add the offset terms
+    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_after_shift_s32);
+    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_after_shift_s32);
+    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_after_shift_s32);
+    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
+
+    // Convert S32 to S16
+    const int16x8x2_t in_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+        }
+    };
+
+    // Convert S16 to S8
+    int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_s8 = vmaxq_s8(out_s8, min_s8);
+        out_s8 = vminq_s8(out_s8, max_s8);
+    }
+
+    return out_s8;
+}
+
 /** Performs final quantization step on single element
  *
  * @tparam is_bounded_relu Specified if a fused bounded relu should be applied
@@ -154,6 +214,45 @@
     return out_u8;
 }
 
+/** Performs final quantization step on single element
+ *
+ * @tparam is_bounded_relu Specifies whether a fused bounded relu should be applied
+ *
+ * @param[in] in_value                      Input to be quantized.
+ * @param[in] result_fixedpoint_multiplier  Result multiplier parameter
+ * @param[in] result_shift                  Result shift parameter
+ * @param[in] result_offset_after_shift_s32 Result offset parameter
+ * @param[in] min_s8                        Relu lower bound
+ * @param[in] max_s8                        Relu upper bound
+ *
+ * @return Quantized value
+ */
+template <bool is_bounded_relu>
+inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
+                                    int32_t result_shift, int32_t result_offset_after_shift_s32,
+                                    int8_t min_s8, int8_t max_s8)
+{
+    int32x4_t in_s32 = vdupq_n_s32(in_value);
+
+    // Fixed point multiplication with vector saturating rounding doubling multiply high with scalar
+    in_value = vgetq_lane_s32(vqrdmulhq_n_s32(in_s32, result_fixedpoint_multiplier), 0);
+
+    // Shift value by result_shift
+    in_value = rounding_divide_by_pow2(in_value, result_shift);
+
+    // Add the offset term
+    in_value += result_offset_after_shift_s32;
+
+    // Bound the result
+    int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
+    if(is_bounded_relu)
+    {
+        out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
+    }
+
+    return out_s8;
+}
+
 /** Dequantize a neon vector holding 8 quantized values.
  *
  * @param[in] qv Input values to be dequantized.
diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h
index 33a640f..aa46a34 100644
--- a/arm_compute/core/NEON/NEKernels.h
+++ b/arm_compute/core/NEON/NEKernels.h
@@ -46,6 +46,7 @@
 #include "arm_compute/core/NEON/kernels/NECol2ImKernel.h"
 #include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h"
 #include "arm_compute/core/NEON/kernels/NEConvertFullyConnectedWeightsKernel.h"
+#include "arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h"
 #include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h"
 #include "arm_compute/core/NEON/kernels/NECopyKernel.h"
 #include "arm_compute/core/NEON/kernels/NECropKernel.h"
diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h
index 560abd6..8593059 100644
--- a/arm_compute/core/NEON/NEMath.h
+++ b/arm_compute/core/NEON/NEMath.h
@@ -129,6 +129,17 @@
  * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
  *
  * @param[in] x        Vector of 4 elements
+ * @param[in] exponent Vector of 4 integer exponent values, one per element, used to round to the nearest division by a power-of-two
+ *
+ * @return the nearest division by a power-of-two using exponent
+ */
+int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent);
+
+/** Round to the nearest division by a power-of-two using exponent
+ *
+ * @note This function calculates the following expression: (x + 2^n -1 ) / 2^n where n = exponent
+ *
+ * @param[in] x        Vector of 4 elements
  * @param[in] exponent Integer value used to round to nearest division by a power-of-two
  *
  * @return the nearest division by a power-of-two using exponent
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
index 61315e8..f1c9c20 100644
--- a/arm_compute/core/NEON/NEMath.inl
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -294,6 +294,14 @@
 
 #endif /* DOXYGEN_SKIP_THIS */
 
+inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int32x4_t exponent)
+{
+    const int32x4_t shift_vec  = vnegq_s32(exponent);
+    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
+    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
+    return vrshlq_s32(fixed_up_x, shift_vec);
+}
+
 inline int32x4_t rounding_divide_by_pow2(int32x4_t x, int exponent)
 {
     const int32x4_t shift_vec  = vdupq_n_s32(-exponent);
diff --git a/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h b/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
new file mode 100644
index 0000000..d3f0907
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEConvertQuantizedSignednessKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H__
+#define __ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+
+/** NEON kernel to convert asymmetric unsigned to asymmetric signed and vice-versa */
+class NEConvertQuantizedSignednessKernel : public INEKernel
+{
+public:
+    const char *name() const override
+    {
+        return "NEConvertQuantizedSignednessKernel";
+    }
+    /** Default constructor */
+    NEConvertQuantizedSignednessKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NEConvertQuantizedSignednessKernel(const NEConvertQuantizedSignednessKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers). */
+    NEConvertQuantizedSignednessKernel &operator=(const NEConvertQuantizedSignednessKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEConvertQuantizedSignednessKernel(NEConvertQuantizedSignednessKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEConvertQuantizedSignednessKernel &operator=(NEConvertQuantizedSignednessKernel &&) = default;
+    /** Initialize the kernel's input, output.
+     *
+     * @param[in]  input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[out] output Destination tensor. Data types supported: opposite of @p input.
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Static function to check if given info will lead to a valid configuration of @ref NEConvertQuantizedSignednessKernel
+     *
+     * @param[in] input  Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED.
+     * @param[in] output Destination tensor. Data types supported: opposite of @p input.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    const ITensor *_input;
+    ITensor       *_output;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_NECONVERTQUANTIZEDSIGNEDNESSKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
index 5c0104d..f470126 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -60,13 +60,13 @@
     NEGEMMInterleave4x4Kernel();
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: All
      * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input.
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMInterleave4x4Kernel
      *
-     * @param[in] input  Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in] input  Input tensor info. Data types supported: All
      * @param[in] output Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
      *
      * @return a status
@@ -79,7 +79,7 @@
 private:
     /** Common signature for all the transpose functions
      *
-     * @param[in]  input  An input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  An input tensor. Data types supported: All
      * @param[out] output The output tensor. Data type supported: same as @p input
      * @param[in]  window Region on which to execute the kernel.
      */
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
index 354ae21..6467a8d 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -62,15 +62,15 @@
      * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two
      * kernels change the layout of the original matrices to be more cache-friendly.
      *
-     * @param[in]  input0 Input tensor containing the interleaved Matrix A. Data type supported: QASYMM8
-     * @param[in]  input1 Input tensor containing the transposed1xW Matrix B. Data type supported: same as @p input0
+     * @param[in]  input0 Input tensor containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
+     * @param[in]  input1 Input tensor containing the transposed1xW Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
      * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: S32
      */
     void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixMultiplyKernel
      *
-     * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: QASYMM8
-     * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: same as @p input0
+     * @param[in] input0 Input tensor info containing the interleaved Matrix A. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED
+     * @param[in] input1 Input tensor info containing the transposed Matrix B. Data type supported: U8/QASYMM8/S8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
      * @param[in] output Output tensor info to store the result of matrix multiplication. Data type supported: S32
      *
      * @return a status
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
index 4eab86d..ce3dddb 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpReductionKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,7 +47,7 @@
 
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input       Input tensor. Data type supported: QASYMM8
+     * @param[in]  input       Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
      * @param[out] output      Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
      * @param[in]  k           Number of matrix A columns (or matrix B rows)
      * @param[in]  is_reshaped True if the input tensor has been reshaped
@@ -75,7 +75,7 @@
     }
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  mtx_a             Input tensor. Data type supported: QASYMM8
+     * @param[in]  mtx_a             Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
      * @param[out] vector_sum_row    Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
      * @param[in]  num_mtx_a_cols    Number of matrix A columns
      * @param[in]  is_interleaved4x4 True if the matrix A has been interleaved4x4
@@ -83,7 +83,7 @@
     void configure(const ITensor *mtx_a, ITensor *vector_sum_row, int32_t num_mtx_a_cols, bool is_interleaved4x4) override;
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixAReductionKernel
      *
-     * @param[in] mtx_a             Input tensor. Data type supported: QASYMM8
+     * @param[in] mtx_a             Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
      * @param[in] vector_sum_row    Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
      * @param[in] num_mtx_a_cols    Number of matrix A columns
      * @param[in] is_interleaved4x4 True if the matrix A has been interleaved4x4
@@ -94,6 +94,14 @@
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Execution of the reduction kernel specialized on the input type
+     *
+     * @param[in] window Execution window
+     */
+    template <typename T>
+    void run_internal(const Window &window);
 };
 
 /** NEON kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
@@ -110,7 +118,7 @@
     }
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  mtx_b            Input tensor. Data type supported: Data type supported: QASYMM8
+     * @param[in]  mtx_b            Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
      * @param[out] vector_sum_col   Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
      * @param[in]  num_mtx_b_rows   Number of matrix B rows
      * @param[in]  is_transposed1xW True if the input tensor is transposed 1xW
@@ -118,7 +126,7 @@
     void configure(const ITensor *mtx_b, ITensor *vector_sum_col, int32_t num_mtx_b_rows, bool is_transposed1xW) override;
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMLowpMatrixBReductionKernel
      *
-     * @param[in] mtx_b            Input tensor. Data type supported: Data type supported: QASYMM8
+     * @param[in] mtx_b            Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
      * @param[in] vector_sum_col   Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
      * @param[in] num_mtx_b_rows   Number of matrix B rows
      * @param[in] is_transposed1xW True if the input tensor is transposed 1xW
@@ -129,6 +137,15 @@
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
+
+private:
+    /** Execution of the reduction kernel specialized on the input type
+     *
+     * @param[in] window Execution window
+     * @param[in] info   Thread-related information
+     */
+    template <typename T>
+    void run_internal(const Window &window, const ThreadInfo &info);
 };
 } // namespace arm_compute
 
diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
index b7fbfcf..54086d1 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2018 ARM Limited.
+ * Copyright (c) 2016-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -74,13 +74,13 @@
     }
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input  Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input  Input tensor. Data types supported: U8/S8/QASYMM8/QSYMM8_PER_CHANNEL/U16/S16/F16/U32/S32/F32
      * @param[out] output Output tensor. Data type supported: same as @p input.
      */
     void configure(const ITensor *input, ITensor *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMTranspose1xWKernel
      *
-     * @param[in] input  Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in] input  Input tensor info. Data types supported: U8/S8/QASYMM8/QSYMM8_PER_CHANNEL/U16/S16/F16/U32/S32/F32
      * @param[in] output Output tensor info. Data type supported: same as @p input.
      *
      * @return a status
diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
index bba18a8..585c707 100644
--- a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -75,7 +75,7 @@
     /** Set the input and output of the kernel.
      *
      * @param[in]  input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/F32
+     *                    and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32
      * @param[in]  bias   The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
      *                    dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
      *                    @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
@@ -85,7 +85,7 @@
     /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel
      *
      * @param[in] input  The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                   and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: QASYMM8/F16/F32
+     *                   and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32
      * @param[in] biases The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
      *                   dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
      *                   @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/add.h b/arm_compute/core/NEON/wrapper/intrinsics/add.h
index 4f4d244..1839170 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/add.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/add.h
@@ -63,13 +63,13 @@
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #undef VADD_IMPL
 
+// VQADD: Vector saturating add (No notion of saturation for floating point)
 #define VQADD_IMPL(stype, vtype, prefix, postfix)      \
     inline vtype vqadd(const vtype &a, const vtype &b) \
     {                                                  \
         return prefix##_##postfix(a, b);               \
     }
 
-// VQADD: Vector saturating add (No notion of saturation for floating point)
 VQADD_IMPL(uint8x8_t, uint8x8_t, vqadd, u8)
 VQADD_IMPL(int8x8_t, int8x8_t, vqadd, s8)
 VQADD_IMPL(uint16x4_t, uint16x4_t, vqadd, u16)
@@ -96,6 +96,86 @@
 VQADD_IMPL(float16x8_t, float16x8_t, vaddq, f16)
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #undef VQADD_IMPL
+
+// VADDW: Vector widening add
+#define VADDW_IMPL(wtype, vtype, prefix, postfix)      \
+    inline wtype vaddw(const wtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VADDW_IMPL(uint16x8_t, uint8x8_t, vaddw, u8)
+VADDW_IMPL(int16x8_t, int8x8_t, vaddw, s8)
+VADDW_IMPL(uint32x4_t, uint16x4_t, vaddw, u16)
+VADDW_IMPL(int32x4_t, int16x4_t, vaddw, s16)
+VADDW_IMPL(uint64x2_t, uint32x2_t, vaddw, u32)
+VADDW_IMPL(int64x2_t, int32x2_t, vaddw, s32)
+#undef VADDW_IMPL
+
+// VADDL: Vector long add
+#define VADDL_IMPL(wtype, vtype, prefix, postfix)      \
+    inline wtype vaddl(const vtype &a, const vtype &b) \
+    {                                                  \
+        return prefix##_##postfix(a, b);               \
+    }
+
+VADDL_IMPL(uint16x8_t, uint8x8_t, vaddl, u8)
+VADDL_IMPL(int16x8_t, int8x8_t, vaddl, s8)
+VADDL_IMPL(uint32x4_t, uint16x4_t, vaddl, u16)
+VADDL_IMPL(int32x4_t, int16x4_t, vaddl, s16)
+VADDL_IMPL(uint64x2_t, uint32x2_t, vaddl, u32)
+VADDL_IMPL(int64x2_t, int32x2_t, vaddl, s32)
+#undef VADDL_IMPL
+
+#if defined(__aarch64__)
+// VADDV: Across vector add
+#define VADDV_IMPL(stype, vtype, prefix, postfix) \
+    inline stype vaddv(const vtype &a)            \
+    {                                             \
+        return prefix##_##postfix(a);             \
+    }
+
+VADDV_IMPL(uint8_t, uint8x8_t, vaddv, u8)
+VADDV_IMPL(int8_t, int8x8_t, vaddv, s8)
+VADDV_IMPL(uint16_t, uint16x4_t, vaddv, u16)
+VADDV_IMPL(int16_t, int16x4_t, vaddv, s16)
+VADDV_IMPL(uint32_t, uint32x2_t, vaddv, u32)
+VADDV_IMPL(int32_t, int32x2_t, vaddv, s32)
+VADDV_IMPL(float, float32x2_t, vaddv, f32)
+
+VADDV_IMPL(uint8_t, uint8x16_t, vaddvq, u8)
+VADDV_IMPL(int8_t, int8x16_t, vaddvq, s8)
+VADDV_IMPL(uint16_t, uint16x8_t, vaddvq, u16)
+VADDV_IMPL(int16_t, int16x8_t, vaddvq, s16)
+VADDV_IMPL(uint32_t, uint32x4_t, vaddvq, u32)
+VADDV_IMPL(int32_t, int32x4_t, vaddvq, s32)
+VADDV_IMPL(uint64_t, uint64x2_t, vaddvq, u64)
+VADDV_IMPL(int64_t, int64x2_t, vaddvq, s64)
+VADDV_IMPL(float, float32x4_t, vaddvq, f32)
+#undef VADDV_IMPL
+#endif // defined(__aarch64__)
+
+// VPADDL: Vector pairwise add long
+#define VPADDL_IMPL(ltype, vtype, prefix, postfix) \
+    inline ltype vpaddl(const vtype &a)            \
+    {                                              \
+        return prefix##_##postfix(a);              \
+    }
+
+VPADDL_IMPL(uint16x4_t, uint8x8_t, vpaddl, u8)
+VPADDL_IMPL(int16x4_t, int8x8_t, vpaddl, s8)
+VPADDL_IMPL(uint32x2_t, uint16x4_t, vpaddl, u16)
+VPADDL_IMPL(int32x2_t, int16x4_t, vpaddl, s16)
+VPADDL_IMPL(uint64x1_t, uint32x2_t, vpaddl, u32)
+VPADDL_IMPL(int64x1_t, int32x2_t, vpaddl, s32)
+
+VPADDL_IMPL(uint16x8_t, uint8x16_t, vpaddlq, u8)
+VPADDL_IMPL(int16x8_t, int8x16_t, vpaddlq, s8)
+VPADDL_IMPL(uint32x4_t, uint16x8_t, vpaddlq, u16)
+VPADDL_IMPL(int32x4_t, int16x8_t, vpaddlq, s16)
+VPADDL_IMPL(uint64x2_t, uint32x4_t, vpaddlq, u32)
+VPADDL_IMPL(int64x2_t, int32x4_t, vpaddlq, s32)
+#undef VPADDL_IMPL
 } // namespace wrapper
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_WRAPPER_ADD_H__ */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/eor.h b/arm_compute/core/NEON/wrapper/intrinsics/eor.h
new file mode 100644
index 0000000..1002808
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/eor.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_EOR_H__
+#define __ARM_COMPUTE_WRAPPER_EOR_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+#define VEOR_IMPL(vtype, prefix, postfix)             \
+    inline vtype veor(const vtype &a, const vtype &b) \
+    {                                                 \
+        return prefix##_##postfix(a, b);              \
+    }
+
+VEOR_IMPL(uint8x8_t, veor, u8)
+VEOR_IMPL(int8x8_t, veor, s8)
+VEOR_IMPL(uint16x4_t, veor, u16)
+VEOR_IMPL(int16x4_t, veor, s16)
+VEOR_IMPL(uint32x2_t, veor, u32)
+VEOR_IMPL(int32x2_t, veor, s32)
+
+VEOR_IMPL(uint8x16_t, veorq, u8)
+VEOR_IMPL(int8x16_t, veorq, s8)
+VEOR_IMPL(uint16x8_t, veorq, u16)
+VEOR_IMPL(int16x8_t, veorq, s16)
+VEOR_IMPL(uint32x4_t, veorq, u32)
+VEOR_IMPL(int32x4_t, veorq, s32)
+
+#undef VEOR_IMPL
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_EOR_H__ */
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
index 0362ca1..6eae1cf 100644
--- a/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
+++ b/arm_compute/core/NEON/wrapper/intrinsics/intrinsics.h
@@ -35,6 +35,7 @@
 #include "arm_compute/core/NEON/wrapper/intrinsics/combine.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/div.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/dup_n.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/eor.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/exp.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/gethigh.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/getlane.h"
@@ -56,6 +57,7 @@
 #include "arm_compute/core/NEON/wrapper/intrinsics/pmax.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/pmin.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/pow.h"
+#include "arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/rev64.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/round.h"
 #include "arm_compute/core/NEON/wrapper/intrinsics/setlane.h"
diff --git a/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
new file mode 100644
index 0000000..0956959
--- /dev/null
+++ b/arm_compute/core/NEON/wrapper/intrinsics/reinterpret.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_WRAPPER_REINTERPRET_H__
+#define __ARM_COMPUTE_WRAPPER_REINTERPRET_H__
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+namespace wrapper
+{
+inline int32x4_t vreinterpret_s32(const uint32x4_t &val)
+{
+    return vreinterpretq_s32_u32(val);
+}
+inline int32x4_t vreinterpret_s32(const int32x4_t &val)
+{
+    return val;
+}
+} // namespace wrapper
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_WRAPPER_REINTERPRET_H__ */
diff --git a/arm_compute/core/NEON/wrapper/traits.h b/arm_compute/core/NEON/wrapper/traits.h
index cc22597..d700aac 100644
--- a/arm_compute/core/NEON/wrapper/traits.h
+++ b/arm_compute/core/NEON/wrapper/traits.h
@@ -112,6 +112,22 @@
 template <typename T, BitWidth BW> using neon_bitvector_t = typename neon_bitvector<T, BW>::type;
 /**  Helper type template to get the tag type of a neon vector */
 template <typename T, BitWidth BW> using neon_bitvector_tag_t = typename neon_bitvector<T, BW>::tag_type;
+
+/** Promote a type */
+template <typename T> struct promote { };
+template <> struct promote<uint8_t> { using type = uint16_t; };
+template <> struct promote<int8_t> { using type = int16_t; };
+template <> struct promote<uint16_t> { using type = uint32_t; };
+template <> struct promote<int16_t> { using type = int32_t; };
+template <> struct promote<uint32_t> { using type = uint64_t; };
+template <> struct promote<int32_t> { using type = int64_t; };
+template <> struct promote<float> { using type = float; };
+template <> struct promote<half> { using type = half; };
+
+/** Get promoted type */
+template <typename T>
+using promote_t = typename promote<T>::type;
+
 // clang-format on
 // *INDENT-ON*
 } // namespace traits
diff --git a/arm_compute/core/QuantizationInfo.h b/arm_compute/core/QuantizationInfo.h
index 5e6e5b3..949ee66 100644
--- a/arm_compute/core/QuantizationInfo.h
+++ b/arm_compute/core/QuantizationInfo.h
@@ -250,6 +250,36 @@
     return quantized;
 }
 
+/** Quantize a value given an 8-bit symmetric per-channel quantization scheme
+ *
+ * @param[in] value      Value to quantize
+ * @param[in] qinfo      Quantization information to use for quantizing
+ * @param[in] channel_id Channel index into the scale vector of the quantization info
+ *
+ * @return Quantized value
+ */
+inline int8_t quantize_qsymm8_per_channel(float value, const QuantizationInfo &qinfo, size_t channel_id = 0)
+{
+    int quantized = arm_compute::round(value / qinfo.scale()[channel_id], RoundingPolicy::TO_NEAREST_UP);
+    quantized     = std::max(-128, std::min(quantized, 127));
+    return quantized;
+}
+
+/** Quantize a value given an 8-bit asymmetric per-channel quantization scheme
+ *
+ * @param[in] value      Value to quantize
+ * @param[in] qinfo      Quantization information to use for quantizing
+ * @param[in] channel_id Channel index into the scale vector of the quantization info
+ *
+ * @return Quantized value
+ */
+inline uint8_t quantize_qasymm8_per_channel(float value, const QuantizationInfo &qinfo, size_t channel_id = 0)
+{
+    int quantized = arm_compute::round(value / qinfo.scale()[channel_id], RoundingPolicy::TO_NEAREST_UP);
+    quantized     = std::max(0, std::min(quantized, 255));
+    return quantized;
+}
+
 /** Dequantize a value given a 8-bit asymmetric quantization scheme
  *
  * @param[in] value Value to dequantize
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index f4955ed..1c9e8ce 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -77,7 +77,8 @@
     U8,                  /**< unsigned 8-bit number */
     S8,                  /**< signed 8-bit number */
     QSYMM8,              /**< quantized, symmetric fixed-point 8-bit number */
-    QASYMM8,             /**< quantized, asymmetric fixed-point 8-bit number */
+    QASYMM8,             /**< quantized, asymmetric fixed-point 8-bit number unsigned */
+    QASYMM8_SIGNED,      /**< quantized, asymmetric fixed-point 8-bit number signed */
     QSYMM8_PER_CHANNEL,  /**< quantized, symmetric per channel fixed-point 8-bit number */
     QASYMM8_PER_CHANNEL, /**< quantized, asymmetric per channel fixed-point 8-bit number */
     U16,                 /**< unsigned 16-bit number */
@@ -1881,6 +1882,8 @@
     int                     gemmlowp_shift{ 0 };                   /**< GEMMLowp output stage shift used for quantizing to uint8 */
     int                     gemmlowp_min_bound{ 0 };               /**< GEMMLowp min value used to saturate down the output result before converting back to QASYMM8 */
     int                     gemmlowp_max_bound{ 0 };               /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
+    std::vector<int>        gemmlowp_multipliers{};                /**< GEMMLowp output stage multipliers used for quantizing to QASYMM8 */
+    std::vector<int>        gemmlowp_shifts{};                     /**< GEMMLowp output stage shifts used for quantizing to QASYMM8 */
 };
 
 /** GEMM LHS (Left Hand Side) matrix information */
@@ -2015,6 +2018,14 @@
     {
         return _gemmlowp_output_stage;
     };
+    /** Sets GEMMLowp output stage
+     *
+     * @param[in] output_stage Output stage to set
+     */
+    void set_gemmlowp_output_stage(GEMMLowpOutputStageInfo &output_stage)
+    {
+        _gemmlowp_output_stage = output_stage;
+    };
     /** Flag which specifies if a wider accumulator should be used.
      *
      * @return True if a wider accumulator has to be used
diff --git a/arm_compute/core/Utils.h b/arm_compute/core/Utils.h
index 3939491..a6e1ea1 100644
--- a/arm_compute/core/Utils.h
+++ b/arm_compute/core/Utils.h
@@ -114,6 +114,7 @@
         case DataType::S8:
         case DataType::QSYMM8:
         case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
         case DataType::QSYMM8_PER_CHANNEL:
         case DataType::QASYMM8_PER_CHANNEL:
             return 1;
@@ -191,6 +192,7 @@
         case DataType::U8:
         case DataType::QSYMM8:
         case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
         case DataType::QSYMM8_PER_CHANNEL:
             return 1;
         case DataType::U16:
@@ -533,6 +535,7 @@
             return DataType::S32;
         case DataType::QSYMM8:
         case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
         case DataType::QSYMM8_PER_CHANNEL:
         case DataType::QASYMM8_PER_CHANNEL:
         case DataType::QSYMM16:
@@ -1024,6 +1027,7 @@
     {
         case DataType::QSYMM8:
         case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
         case DataType::QSYMM8_PER_CHANNEL:
         case DataType::QASYMM8_PER_CHANNEL:
         case DataType::QSYMM16:
@@ -1045,6 +1049,7 @@
     switch(dt)
     {
         case DataType::QASYMM8:
+        case DataType::QASYMM8_SIGNED:
         case DataType::QASYMM8_PER_CHANNEL:
         case DataType::QASYMM16:
             return true;
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index 8ec4a33..6b6cb00 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -59,6 +59,21 @@
  * @return a status
  */
 Status calculate_quantized_multiplier_greater_than_one(float multiplier, int *quantized_multiplier, int *left_shift);
+
+/** Calculate quantized representation of per-channel multipliers with value less than one.
+ *
+ * @param[in]      iq_info    Input quantization info.
+ * @param[in]      wq_info    Weights quantization info.
+ * @param[in]      oq_info    Output quantization info.
+ * @param[in, out] stage_info GemmLowp output stage info
+ *
+ * @return a status
+ */
+Status calculate_quantized_multipliers_less_than_one(const QuantizationInfo &iq_info,
+                                                     const QuantizationInfo &wq_info,
+                                                     const QuantizationInfo &oq_info,
+                                                     GEMMLowpOutputStageInfo &stage_info);
+
 /** Get minimum and maximum values for the input quantized data type
  *
  * @return min and max values for the quantized data type