COMPMID-2309 : CLConvolutionLayer: support QUANT8_SYMM_PER_CHANNEL filters

Change-Id: I16f6758b768ede404a064db057302ded706e1e8a
Signed-off-by: Vidhya Sudhan Loganathan <vidhyasudhan.loganathan@arm.com>
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2215
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
index 7475d8d..cce7b69 100644
--- a/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
+++ b/arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h
@@ -41,6 +41,7 @@
      *
      * Valid conversions Input -> Output :
      *
+     *   - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data)
      *   - U8  -> S8, U16, S16, U32, S32, F16, F32
      *   - U16 -> U8, S8, S16, U32, S32, F16, F32
      *   - S16 -> U8, S8, U16, U32, S32, F16, F32
@@ -49,16 +50,16 @@
      *   - F16 -> U8, S8, U16, S16, U32, F32
      *   - F32 -> U8, S8, U16, S16, U32, F16
      *
-     * @param[in]  input  The input tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[out] output The output tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in]  input  The input tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
+     * @param[out] output The output tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
      * @param[in]  policy Conversion policy
      * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
      */
     void configure(const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthConvertLayerKernel
      *
-     * @param[in] input  Source tensor info. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
-     * @param[in] output Destination tensor info. Data type supported: U8/S8/U16/S16/U32/S32/F16/F32.
+     * @param[in] input  Source tensor info. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32.
+     * @param[in] output Destination tensor info. Data type supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
      * @param[in] policy Conversion policy
      * @param[in] shift  Value for down/up conversions. Must be 0 <= shift < 8.
      *
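
For context, a minimal sketch of the new conversion path as driven from the function-level wrapper CLDepthConvertLayer (shapes and per-channel scales are illustrative, and the CL backend is assumed to be initialised). As the ATTENTION note above warns, the QASYMM8 output's quantization info is not derived automatically; the caller has to track it:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLDepthConvertLayer.h"
    #include <vector>

    using namespace arm_compute;

    void convert_per_channel_weights_sketch()
    {
        CLScheduler::get().default_init(); // create CL context and queue

        // 3x3 kernels, 16 IFM, 32 OFM; one symmetric scale per OFM.
        CLTensor weights_qsymm8{};
        CLTensor weights_qasymm8{};
        weights_qsymm8.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1,
                                                    DataType::QSYMM8_PER_CHANNEL,
                                                    QuantizationInfo(std::vector<float>(32, 0.5f))));
        // The caller chooses (and must remember) the quantization of the QASYMM8 output.
        weights_qasymm8.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 32U), 1,
                                                     DataType::QASYMM8));

        CLDepthConvertLayer convert{};
        convert.configure(&weights_qsymm8, &weights_qasymm8, ConvertPolicy::WRAP, 0);

        weights_qsymm8.allocator()->allocate();
        weights_qasymm8.allocator()->allocate();
        convert.run(); // weights_qsymm8 would be filled before running in real code
    }
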
diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
index de06c88..301c673 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -51,39 +51,47 @@
     CLGEMMLowpOffsetContributionOutputStageKernel &operator=(CLGEMMLowpOffsetContributionOutputStageKernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  mm_result      Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
-     * @param[in]  vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                            Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     *                            Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in]  bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                            Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output         Output tensor. Data type supported: QASYMM8
-     * @param[in]  k              Number of matrix A columns or Matrix B rows
-     * @param[in]  a_offset       Offset to be added to each element of the matrix A.
-     * @param[in]  b_offset       Offset to be added to each element of the matrix B.
-     * @param[in]  output_stage   GEMMLowp output stage info
+     * @param[in]  mm_result          Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in]  vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
+     *                                Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
+     *                                Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in]  bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                                Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
+     * @param[out] output             Output tensor. Data type supported: QASYMM8.
+     * @param[in]  k                  Number of matrix A columns or matrix B rows
+     * @param[in]  a_offset           Offset to be added to each element of the matrix A.
+     * @param[in]  b_offset           Offset to be added to each element of the matrix B.
+     * @param[in]  output_stage       GEMMLowp output stage info
+     * @param[in]  output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                                Supported data types: S32
+     * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
+     *                                Supported data types: S32
      */
     void configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output, int32_t k, int32_t a_offset, int32_t b_offset,
-                   const GEMMLowpOutputStageInfo &output_stage);
+                   const GEMMLowpOutputStageInfo &output_stage, const ICLTensor *output_multipliers, const ICLTensor *output_shifts);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpOffsetContributionKernel
      *
-     * @param[in] mm_result      Input tensor containing the result of @ref CLGEMMLowpOffsetContributionKernel. Data type supported: S32 or QASYMM8 if output_stage != NONE
-     * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B.
-     *                           Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A.
-     *                           Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
-     * @param[in] bias           Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                           Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output         Output tensor. Data type supported: QASYMM8
-     * @param[in] a_offset       Offset to be added to each element of the matrix A.
-     * @param[in] b_offset       Offset to be added to each element of the matrix B.
-     * @param[in] output_stage   GEMMLowp output stage info
+     * @param[in] mm_result          Input tensor containing the result of @ref CLGEMMLowpMatrixMultiplyKernel. Data type supported: S32
+     * @param[in] vector_sum_col     Input row-vector of sums of all the entries in each column of matrix B.
+     *                               Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] vector_sum_row     Input row-vector of sums of all the entries in each row of matrix A.
+     *                               Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result
+     * @param[in] bias               Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                               Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result.
+     * @param[in] output             Output tensor. Data type supported: QASYMM8.
+     * @param[in] a_offset           Offset to be added to each element of the matrix A.
+     * @param[in] b_offset           Offset to be added to each element of the matrix B.
+     * @param[in] output_stage       GEMMLowp output stage info
+     * @param[in] output_multipliers Output multipliers tensor info. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM).
+     *                               Supported data types: S32
+     * @param[in] output_shifts      Output shifts tensor info. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
+     *                               Supported data types: S32
      *
      * @return a status
      */
     static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output, int32_t a_offset,
-                           int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage);
+                           int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -94,6 +102,9 @@
     const ICLTensor *_vector_sum_row;
     const ICLTensor *_bias;
     ICLTensor       *_output;
+    const ICLTensor *_output_multipliers;
+    const ICLTensor *_output_shifts;
+    bool             _is_quantized_per_channel;
 };
 } // namespace arm_compute
 
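
A hedged sketch of the widened configure() call (illustrative shapes with M = 4, N = OFM = 32, K = 16; tensor data and CL initialisation are elided). The two new tensors each hold one S32 entry per output feature map:

    #include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLTensor.h"

    using namespace arm_compute;

    void configure_offset_contribution_output_stage_sketch()
    {
        CLTensor mm_result, vector_sum_col, vector_sum_row, bias, output;
        CLTensor output_multipliers, output_shifts;

        mm_result.allocator()->init(TensorInfo(TensorShape(32U, 4U), 1, DataType::S32));
        vector_sum_col.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32));
        vector_sum_row.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::S32));
        bias.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32));
        output.allocator()->init(TensorInfo(TensorShape(32U, 4U), 1, DataType::QASYMM8));
        // Per-channel quantization: one multiplier and one shift per OFM (N = 32 here).
        output_multipliers.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32));
        output_shifts.allocator()->init(TensorInfo(TensorShape(32U), 1, DataType::S32));

        GEMMLowpOutputStageInfo stage{};
        stage.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        stage.gemmlowp_offset          = 10;  // illustrative output zero-point
        stage.gemmlowp_min_bound       = 0;
        stage.gemmlowp_max_bound       = 255;
        stage.is_quantized_per_channel = true;

        CLGEMMLowpOffsetContributionOutputStageKernel kernel;
        kernel.configure(&mm_result, &vector_sum_col, &vector_sum_row, &bias, &output,
                         /* k */ 16, /* a_offset */ -3, /* b_offset */ 2, stage,
                         &output_multipliers, &output_shifts);
    }
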
diff --git a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
index 26ab210..937f6a9 100644
--- a/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMReshapeRHSMatrixKernel.h
@@ -48,7 +48,7 @@
     CLGEMMReshapeRHSMatrixKernel &operator=(CLGEMMReshapeRHSMatrixKernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input    Input tensor. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in]  input    Input tensor. Data types supported: All
      * @param[out] output   Output tensor. Data type supported: same as @p input
      * @param[in]  rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
      *                      information to reshape the input tensor. Only the following values are supported:
@@ -61,7 +61,7 @@
     void configure(const ICLTensor *input, ICLTensor *output, const GEMMRHSMatrixInfo &rhs_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMReshapeRHSMatrixKernel
      *
-     * @param[in] input    Input tensor info. Data types supported: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
+     * @param[in] input    Input tensor info. Data types supported: All
      * @param[in] output   Output tensor info which stores the interleaved matrix. Data type supported: same as @p input.
      * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
      *                     information to reshape the input tensor. Only the following values are supported:
diff --git a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
index bdc5792..59740b9 100644
--- a/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
+++ b/arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -69,9 +69,9 @@
     /** Set the input and output of the kernel.
      *
      * @param[in]  input      The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                        and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: QASYMM8/F16/F32
+     *                        and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
      * @param[in]  biases     The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                        dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+     *                        dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32. For quantized types this must be nullptr.
      *                        @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
      * @param[out] output     The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
      *                        Data types supported: Same as @p input
@@ -82,9 +82,9 @@
     /** Static function to check if given info will lead to a valid configuration of @ref CLWeightsReshapeKernel
      *
      * @param[in] input      The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
-     *                       and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM,  num_patches] if unshared. Data types supported: QASYMM8/F16/F32
+     *                       and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
      * @param[in] biases     The shared biases tensor to append.  Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
-     *                       dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
+     *                       dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32. For quantized types this must be nullptr.
      *                       @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
      * @param[in] output     The output tensor. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
      *                       Data types supported: Same as @p input
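
In practice the tightened biases contract just means quantized callers pass no biases at all; a brief kernel-level sketch (hypothetical helper name):

    #include "arm_compute/core/CL/kernels/CLWeightsReshapeKernel.h"

    using namespace arm_compute;

    void reshape_quantized_weights_sketch(const ICLTensor *weights, ICLTensor *reshaped)
    {
        CLWeightsReshapeKernel reshape;
        // Bias appending is only supported for F16/F32 weights; for quantized
        // weights (QASYMM8/QSYMM8_PER_CHANNEL) the biases argument must be nullptr.
        reshape.configure(weights, /* biases */ nullptr, reshaped);
    }
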
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index 851292f..38d7897 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1883,6 +1883,7 @@
     int                     gemmlowp_max_bound{ 0 };               /**< GEMMLowp max value used to saturate down the output result before converting back to QASYMM8 */
     std::vector<int32_t>    gemmlowp_multipliers{};                /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
     std::vector<int32_t>    gemmlowp_shifts{};                     /**< GEMMLowp output stage multiplier used for quantizing to QASYMM8 */
+    bool                    is_quantized_per_channel{ false };     /**< GEMMLowp quantized per-channel flag */
 };
 
 /** GEMM LHS (Left Hand Side) matrix information */
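A minimal sketch of filling the extended struct for a per-channel output stage (the helper and all values are illustrative, not library API):

    #include "arm_compute/core/Types.h"
    #include <cstdint>
    #include <utility>
    #include <vector>

    using namespace arm_compute;

    // Hypothetical helper: bundles per-channel multipliers/shifts into the struct.
    GEMMLowpOutputStageInfo make_per_channel_output_stage(std::vector<int32_t> multipliers,
                                                          std::vector<int32_t> shifts)
    {
        GEMMLowpOutputStageInfo info{};
        info.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        info.gemmlowp_offset          = 10;  // illustrative output zero-point
        info.gemmlowp_min_bound       = 0;
        info.gemmlowp_max_bound       = 255;
        info.gemmlowp_multipliers     = std::move(multipliers); // one entry per OFM
        info.gemmlowp_shifts          = std::move(shifts);      // one entry per OFM
        info.is_quantized_per_channel = info.gemmlowp_multipliers.size() > 1;
        return info;
    }
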
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index 6b6cb00..0bf6ff5 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -84,15 +84,21 @@
  * per-channel, multipliers and shifts will end up being the same for each
  * channel.
  *
- * @param[in]  input                  Input tensor.
- * @param[in]  weights                Weights tensor.
- * @param[in]  output                 Output tensor.
+ * @param[in]  input                  Input tensor info.
+ * @param[in]  weights                Weights tensor info.
+ * @param[in]  output                 Output tensor info.
+ * @param[in]  idx_ofms               Dimension index to get OFMs from the weights tensor.
  * @param[out] output_multipliers_ptr Pointer to the buffer where to store per-channel multipliers.
  * @param[out] output_shifts_ptr      Pointer to the buffer where to store per-channel shifts.
  *
  * @return min and max values for the quantized data type
  */
-void compute_quantized_multipliers_and_shifts(const ITensor *input, const ITensor *weights, const ITensor *output, int32_t *output_multipliers_ptr, int32_t *output_shifts_ptr);
+void compute_quantized_multipliers_and_shifts(const ITensorInfo *input,
+                                              const ITensorInfo *weights,
+                                              const ITensorInfo *output,
+                                              unsigned int       idx_ofms,
+                                              int32_t           *output_multipliers_ptr,
+                                              int32_t           *output_shifts_ptr);
 } // namespace quantization
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_IO_FILE_HANDLER_H__ */
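
A hedged sketch of calling the updated helper through a hypothetical wrapper; the buffers are sized from the weights' OFM dimension, which for [kernel_x, kernel_y, IFM, OFM] weights is the BATCHES dimension index:

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
    #include <cstdint>
    #include <vector>

    using namespace arm_compute;

    // Hypothetical wrapper: resizes the output buffers, then fills them.
    void fill_output_stage_buffers_sketch(const ITensorInfo *input, const ITensorInfo *weights,
                                          const ITensorInfo *output,
                                          std::vector<int32_t> &multipliers,
                                          std::vector<int32_t> &shifts)
    {
        const unsigned int idx_ofms = get_data_layout_dimension_index(weights->data_layout(),
                                                                      DataLayoutDimension::BATCHES);
        multipliers.resize(weights->dimension(idx_ofms));
        shifts.resize(weights->dimension(idx_ofms));
        quantization::compute_quantized_multipliers_and_shifts(input, weights, output, idx_ofms,
                                                               multipliers.data(), shifts.data());
    }
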
diff --git a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
index 04ce1cf..8dfb6c8 100644
--- a/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLConvolutionLayer.h
@@ -78,7 +78,8 @@
      * @param[in]  input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
      *                              while every optional dimension from 4 and above represent a batch of inputs.
      *                              Data types supported: QASYMM8/F16/F32.
-     * @param[in]  weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                              Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
      * @param[in]  biases           Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
      *                              Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
      * @param[out] output           Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
@@ -98,7 +99,8 @@
      * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
      *                             while every optional dimension from 4 and above represent a batch of inputs.
      *                             Data types supported: QASYMM8/F16/F32.
-     * @param[in] weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+     * @param[in] weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                             Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
      * @param[in] biases           Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported:Same as @p input.
      * @param[in] output           Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
      *                             Data types supported: Same as @p input.
@@ -120,7 +122,8 @@
      * @param[in] input            Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
      *                             while every optional dimension from 4 and above represent a batch of inputs.
      *                             Data types supported: QASYMM8/F16/F32.
-     * @param[in] weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported:Same as @p input.
+     * @param[in] weights          Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                             Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
      * @param[in] output           Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
      *                             Data types supported: Same as @p input.
      * @param[in] conv_info        Contains padding and stride information described in @ref PadStrideInfo.
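
For reference, a sketch of the newly supported combination: QASYMM8 input with QSYMM8_PER_CHANNEL weights carrying one scale per OFM (shapes, scales, and offsets are illustrative):

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
    #include <vector>

    using namespace arm_compute;

    void per_channel_convolution_sketch()
    {
        CLScheduler::get().default_init();

        CLTensor src, weights, biases, dst;
        src.allocator()->init(TensorInfo(TensorShape(32U, 32U, 16U), 1, DataType::QASYMM8,
                                         QuantizationInfo(0.25f, 10)));
        weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 8U), 1,
                                             DataType::QSYMM8_PER_CHANNEL,
                                             QuantizationInfo(std::vector<float>(8, 0.5f))));
        biases.allocator()->init(TensorInfo(TensorShape(8U), 1, DataType::S32));
        dst.allocator()->init(TensorInfo(TensorShape(32U, 32U, 8U), 1, DataType::QASYMM8,
                                         QuantizationInfo(0.5f, 5)));

        CLConvolutionLayer conv;
        conv.configure(&src, &weights, &biases, &dst, PadStrideInfo(1, 1, 1, 1)); // 3x3, stride 1, pad 1
    }
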
diff --git a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
index 017bf78..3392f11 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h
@@ -60,7 +60,7 @@
     /** Set the input and output tensors.
      *
      * @param[in]  weights    Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                        Data type supported: QASYMM8/F16/F32.
+     *                        Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32.
      * @param[in]  biases     Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
      * @param[out] output     Destination tensor. Data types supported: Same as @p weights.
      * @param[in]  num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
@@ -69,7 +69,7 @@
     /** Static function to check if given info will lead to a valid configuration of @ref CLConvolutionLayerReshapeWeights
      *
      * @param[in] weights    Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
-     *                       Data type supported: QASYMM8/F16/F32.
+     *                       Data type supported: QASYMM8/QSYMM8_PER_CHANNEL/F16/F32.
      * @param[in] biases     Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p weights.
      * @param[in] output     Destination tensor. Data types supported: Same as @p weights.
      * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
@@ -168,7 +168,8 @@
      * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
      *                          while every optional dimension from 4 and above represent a batch of inputs.
      *                          Data types supported: QASYMM8/F16/F32.
-     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                          Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
      * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
      *                          Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
      * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
@@ -187,7 +188,8 @@
      * @param[in]  input        Source tensor. 3 lower dimensions represent a single input [width, height, IFM],
      *                          while every optional dimension from 4 and above represent a batch of inputs.
      *                          Data types supported: QASYMM8/F16/F32.
-     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p input.
+     * @param[in]  weights      Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                          Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
      * @param[in]  biases       Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
      *                          Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
      * @param[out] output       Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
@@ -212,7 +214,7 @@
     /** Configures the appropriate matrix multiply routine
      *
      * @param[in]      input                 Input tensor. Data types supported: QASYMM8/F16/F32.
-     * @param[in]      weights               Weights tensor. Data type supported: Same as @p input.
+     * @param[in]      weights               Weights tensor. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
      * @param[in]      biases                Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
      *                                       Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
      * @param[in, out] output                Output tensor. Data types supported: Same as @p input,
@@ -225,12 +227,12 @@
                       const ActivationLayerInfo &act_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
      *
-     * @param[in] input                 Input tensor. Data types supported: QASYMM8/F16/F32.
-     * @param[in] weights               Weights tensor. Data type supported: Same as @p input.
-     * @param[in] output                Output tensor. Data types supported: Same as @p input,
-     *                                  except for input of QASYMM8 type where output should be of S32 type.
-     * @param[in] biases                Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     * @param[in] input                 Input tensor info. Data types supported: QASYMM8/F16/F32.
+     * @param[in] weights               Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8.
+     * @param[in] biases                Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
      *                                  Data type supported: Should match @p input data type, except for input of QASYMM8 type where biases should be of S32 type.
+     * @param[in] output                Output tensor info. Data types supported: Same as @p input,
+     *                                  except for input of QASYMM8 type where output should be of S32 type.
      * @param[in] gemmlowp_output_stage GEMMLowp output stage info
      * @param[in] gemm_3d_depth         Depth of GEMM 3D
      * @param[in] skip_im2col           Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout.
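
The same combination can be checked up front through the static validate() path; a sketch with illustrative tensor infos:

    #include "arm_compute/runtime/CL/functions/CLGEMMConvolutionLayer.h"
    #include <vector>

    using namespace arm_compute;

    bool per_channel_conv_supported_sketch()
    {
        const TensorInfo src(TensorShape(32U, 32U, 16U), 1, DataType::QASYMM8,
                             QuantizationInfo(0.25f, 10));
        const TensorInfo wei(TensorShape(3U, 3U, 16U, 8U), 1, DataType::QSYMM8_PER_CHANNEL,
                             QuantizationInfo(std::vector<float>(8, 0.5f)));
        const TensorInfo bia(TensorShape(8U), 1, DataType::S32);
        const TensorInfo dst(TensorShape(32U, 32U, 8U), 1, DataType::QASYMM8,
                             QuantizationInfo(0.5f, 5));

        const Status status = CLGEMMConvolutionLayer::validate(&src, &wei, &bia, &dst,
                                                               PadStrideInfo(1, 1, 1, 1));
        return bool(status);
    }
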
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
index 6aacbf6..b364653 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h
@@ -24,6 +24,7 @@
 #ifndef __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__
 #define __ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__
 
+#include "arm_compute/core/CL/kernels/CLDepthConvertLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyNativeKernel.h"
 #include "arm_compute/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h"
@@ -49,6 +50,7 @@
  *  -# @ref CLGEMMLowpMatrixBReductionKernel (if the offset of matrix A is not 0)
  *  -# @ref CLGEMMLowpOffsetContributionKernel (if gemm_info.gemmlowp_output_stage == NONE)
  *  -# @ref CLGEMMLowpOffsetContributionOutputStageKernel (if gemm_info.gemmlowp_output_stage != NONE)
+ *  -# @ref CLDepthConvertLayerKernel (if the data type of the weights is QSYMM8_PER_CHANNEL)
  *
 */
 class CLGEMMLowpMatrixMultiplyCore : public IFunction
@@ -84,10 +86,10 @@
     void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpMatrixMultiplyCore
      *
-     * @param[in] a         First input tensor  (Matrix A). Data type supported: QASYMM8.
-     * @param[in] b         Second input tensor (Matrix B). Data type supported: same as @p a
-     * @param[in] c         Third input tensor  (Matrix C). It can be a nullptr. Data type supported: S32
-     * @param[in] output    Output tensor. Data type supported: S32 or QASYMM8 if gemm_info.gemmlowp_output_stage != NONE
+     * @param[in] a         First input tensor info (Matrix A). Data type supported: QASYMM8.
+     * @param[in] b         Second input tensor info (Matrix B). Data type supported: same as @p a or QSYMM8_PER_CHANNEL if @p a is QASYMM8
+     * @param[in] c         Third input tensor info (Matrix C). It can be a nullptr. Data type supported: S32
+     * @param[in] output    Output tensor info. Data type supported: S32 or QASYMM8 if gemm_info.gemmlowp_output_stage != NONE
      * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
      *                      if the reshape of matrix B should be executed only for the first run
      *
@@ -100,7 +102,10 @@
     void prepare() override;
 
 private:
-    MemoryGroup                                   _memory_group;
+    MemoryGroup _memory_group;
+
+    // Kernels used
+    CLDepthConvertLayerKernel                     _weights_to_qasymm8;
     CLGEMMLowpMatrixMultiplyKernel                _mm_midgard_kernel;
     CLGEMMLowpMatrixMultiplyNativeKernel          _mm_native_kernel;
     CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel _mm_reshaped_only_rhs_kernel;
@@ -109,18 +114,29 @@
     CLGEMMLowpMatrixBReductionKernel              _mtx_b_reduction_kernel;
     CLGEMMLowpOffsetContributionKernel            _offset_contribution_kernel;
     CLGEMMLowpOffsetContributionOutputStageKernel _offset_contribution_output_stage_kernel;
-    CLTensor                                      _vector_sum_col;
-    CLTensor                                      _vector_sum_row;
-    CLTensor                                      _tmp_b;
-    CLTensor                                      _mm_result_s32;
-    const ICLTensor                              *_original_b;
-    int32_t                                       _a_offset;
-    int32_t                                       _b_offset;
-    bool                                          _is_gemm_reshaped;
-    bool                                          _is_midgard;
-    bool                                          _reshape_b_only_on_first_run;
-    bool                                          _is_prepared;
-    bool                                          _fuse_output_stage;
+
+    // Temporary tensors
+    CLTensor _qasymm8_weights;
+    CLTensor _vector_sum_col;
+    CLTensor _vector_sum_row;
+    CLTensor _tmp_b;
+    CLTensor _mm_result_s32;
+    CLTensor _gemm_output_stage_multipliers;
+    CLTensor _gemm_output_stage_shifts;
+
+    // Tensor pointers
+    const ICLTensor *_matrix_a;
+    const ICLTensor *_original_b;
+    const ICLTensor *_output;
+
+    int32_t _a_offset;
+    int32_t _b_offset;
+    bool    _is_gemm_reshaped;
+    bool    _is_midgard;
+    bool    _reshape_b_only_on_first_run;
+    bool    _is_prepared;
+    bool    _fuse_output_stage;
+    bool    _convert_to_qasymm8;
 };
-}
+} // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLGEMMLOWPMATRIXMULTIPLYCORE_H__ */
\ No newline at end of file
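
Finally, a hedged sketch of the fused path this change enables in the lowp core: matrix B may now arrive as QSYMM8_PER_CHANNEL and is converted to QASYMM8 internally by the new CLDepthConvertLayerKernel member, with the per-channel output stage carried in GEMMInfo (argument values are illustrative):

    #include "arm_compute/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.h"

    using namespace arm_compute;

    void configure_lowp_core_sketch(const ICLTensor *a,   // QASYMM8 activations
                                    const ICLTensor *b,   // QSYMM8_PER_CHANNEL weights
                                    ICLTensor       *dst) // QASYMM8
    {
        GEMMLowpOutputStageInfo stage{};
        stage.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
        stage.gemmlowp_min_bound       = 0;
        stage.gemmlowp_max_bound       = 255;
        stage.is_quantized_per_channel = true;

        // reshape_b_only_on_first_run = true: converted/reshaped weights are cached by prepare().
        const GEMMInfo gemm_info(false, false, true, 0, false, false, stage);

        CLGEMMLowpMatrixMultiplyCore mm;
        mm.configure(a, b, nullptr, dst, gemm_info);
    }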