COMPMID-2609: Enable quantization with multiplier greater than 1 on OpenCL

Change-Id: I050f1f84e214e61f7cbb0197a672b68a4940edae
Signed-off-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-on: https://review.mlplatform.org/c/2158
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
diff --git a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
index 0835875..ac1406b 100644
--- a/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/depthwise_convolution_quantized.cl
@@ -47,7 +47,7 @@
 
 #define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size)
 
-#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER))
+#if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
 
 #if defined(WEIGHTS_PROMOTED_TYPE)
 #define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size)
@@ -224,9 +224,6 @@
 #if defined(PER_CHANNEL_QUANTIZATION)
     const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel));
     const int output_shift      = *((__global int *)vector_offset(&output_shifts, channel));
-#else  // defined(PER_CHANNEL_QUANTIZATION)
-    const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0));
-    const int output_shift      = *((__global int *)vector_offset(&output_shifts, 0));
 #endif // defined(PER_CHANNEL_QUANTIZATION)
 
     int8 values0 = 0;
@@ -335,7 +332,17 @@
 
 #else // defined(REAL_MULTIPLIER)
 
-    values0                            = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res0_shift_lt0                = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    int8 res0_shift_gt0                = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    values0                            = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -351,7 +358,17 @@
 
 #else // defined(REAL_MULTIPLIER)
 
-    values1                           = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res1_shift_lt0      = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    int8 res1_shift_gt0      = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    values1                  = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -667,7 +684,17 @@
 
 #else // defined(REAL_MULTIPLIER)
 
-    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
+    values0             = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -684,7 +711,17 @@
 
 #else // defined(REAL_MULTIPLIER)
 
-    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+#if defined(PER_CHANNEL_QUANTIZATION)
+    int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
+    values1             = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_OFFSET < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -943,17 +980,23 @@
     acc = CONVERT(round(CONVERT(acc, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
 
 #else // defined(REAL_MULTIPLIER)
+
 #if defined(PER_CHANNEL_QUANTIZATION)
     Vector          output_multipliers = CONVERT_TO_VECTOR_STRUCT(output_multipliers);
     Vector          output_shifts      = CONVERT_TO_VECTOR_STRUCT(output_shifts);
     VEC_INT         output_multiplier  = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
     VEC_INT         output_shift       = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);
-#else  // defined(PER_CHANNEL_QUANTIZATION)
-    const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
-    const int output_shift      = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
-#endif // defined(PER_CHANNEL_QUANTIZATION)
 
-    acc                                = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);
+    VEC_INT res_shift_lt0              = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, output_multiplier, output_shift, VEC_SIZE);
+    VEC_INT res_shift_gt0              = asymm_mult_by_quant_multiplier_less_than_one(acc, output_multiplier, output_shift);
+    acc                                = select(res_shift_lt0, res_shift_gt0, output_shift >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    acc     = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+#else  // OUTPUT_SHIFT < 0
+    acc     = asymm_mult_by_quant_multiplier_less_than_one(acc, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // OUTPUT_SHIFT < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -1255,15 +1298,32 @@
     Vector          output_shifts      = CONVERT_TO_VECTOR_STRUCT(output_shifts);
     VEC_INT         output_multiplier  = VLOAD(VEC_SIZE)(0, (__global int *)output_multipliers.ptr);
     VEC_INT         output_shift       = VLOAD(VEC_SIZE)(0, (__global int *)output_shifts.ptr);
-#else  // defined(PER_CHANNEL_QUANTIZATION)
-    const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
-    const int output_shift      = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
-#endif // defined(PER_CHANNEL_QUANTIZATION)
 
-    acc0                     = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
-    acc1                     = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
-    acc2                     = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
-    acc3                     = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
+    res0_shift_lt0           = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE);
+    res1_shift_lt0           = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE);
+    res2_shift_lt0           = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE);
+    res3_shift_lt0           = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE);
+    res0_shift_gt0           = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
+    res1_shift_gt0           = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
+    res2_shift_gt0           = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
+    res3_shift_gt0           = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
+    acc0                     = select(res0_shift_lt0, res0_shift_gt0, output_shift >= 0);
+    acc1                     = select(res1_shift_lt0, res1_shift_gt0, output_shift >= 0);
+    acc2                     = select(res2_shift_lt0, res2_shift_gt0, output_shift >= 0);
+    acc3                     = select(res3_shift_lt0, res3_shift_gt0, output_shift >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+    acc0    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, output_multiplier, output_shift, VEC_SIZE);
+    acc1    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, output_multiplier, output_shift, VEC_SIZE);
+    acc2    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc2, output_multiplier, output_shift, VEC_SIZE);
+    acc3    = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc3, output_multiplier, output_shift, VEC_SIZE);
+#else  // OUTPUT_SHIFT < 0
+    acc0    = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
+    acc1    = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
+    acc2    = asymm_mult_by_quant_multiplier_less_than_one(acc2, output_multiplier, output_shift);
+    acc3    = asymm_mult_by_quant_multiplier_less_than_one(acc3, output_multiplier, output_shift);
+#endif // OUTPUT_SHIFT < 0
+#endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #endif // defined(REAL_MULTIPLIER)
 
@@ -1375,7 +1435,7 @@
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
 #else                                          // defined(DST_DEPTH)
-    int      z                        = get_global_id(2); // spatial coordinate y
+    int      z               = get_global_id(2); // spatial coordinate y
 #endif                                         // defined(DST_DEPTH)
 
     __global uchar *weights_addr = weights_ptr + weights_offset_first_element_in_bytes + x * weights_stride_y;
@@ -1383,7 +1443,7 @@
 #if defined(DST_DEPTH)
     __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE + b * src_stride_w;
 #else  /* defined(DST_DEPTH) */
-    __global uchar *src_addr          = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * VEC_SIZE;
 #endif /* defined(DST_DEPTH) */
 
     int  z_coord = 0;
@@ -1519,11 +1579,14 @@
     acc1 = CONVERT(round(CONVERT(acc1, VEC_FLOAT) * (VEC_FLOAT)REAL_MULTIPLIER), VEC_INT);
 
 #else // defined(REAL_MULTIPLIER)
-    const int       output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
-    const int       output_shift      = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
 
-    acc0                     = asymm_mult_by_quant_multiplier_less_than_one(acc0, output_multiplier, output_shift);
-    acc1                     = asymm_mult_by_quant_multiplier_less_than_one(acc1, output_multiplier, output_shift);
+#if OUTPUT_SHIFT < 0
+    acc0                     = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+    acc1                     = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, VEC_SIZE);
+#else  // OUTPUT_SHIFT < 0
+    acc0    = asymm_mult_by_quant_multiplier_less_than_one(acc0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+    acc1    = asymm_mult_by_quant_multiplier_less_than_one(acc1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT);
+#endif // OUTPUT_SHIFT < 0
 
 #endif // defined(REAL_MULTIPLIER)
     acc0 += (VEC_INT)OUTPUT_OFFSET;
@@ -1553,9 +1616,9 @@
 
 #endif // defined(WEIGHTS_PROMOTED_TYPE)
 
-#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && (defined(OUTPUT_OFFSET) || defined(REAL_MULTIPLIER))
+#endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
 
-#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET)
+#if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER)
 /** This function computes the depthwise convolution for NHWC data layout. This kernel assumes that the weights tensor is NOT reshaped
  *
  * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
@@ -1630,7 +1693,7 @@
     int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
     int b = get_global_id(2) / (int)DST_DEPTH; // batch
 #else                                          // defined(DST_DEPTH)
-    int       z                 = get_global_id(2); // spatial coordinate y
+    int z = get_global_id(2); // spatial coordinate y
 #endif                                         // defined(DST_DEPTH)
 
     __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) * (int)N0;
@@ -1646,12 +1709,6 @@
 #if defined(PER_CHANNEL_QUANTIZATION)
     __global uchar *out_mul_addr   = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
     __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x * sizeof(int) * (int)DEPTH_MULTIPLIER * (int)N0;
-
-    VEC_INT output_multiplier = (VEC_INT)0;
-    VEC_INT output_shift      = (VEC_INT)0;
-#else  // defined(PER_CHANNEL_QUANTIZATION)
-    const int output_multiplier = *((__global int *)(output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes));
-    const int output_shift      = *((__global int *)(output_shifts_ptr + output_shifts_offset_first_element_in_bytes));
 #endif // defined(PER_CHANNEL_QUANTIZATION)
 
 #if defined(DST_DEPTH)
@@ -1700,11 +1757,20 @@
 #endif // defined(HAS_BIAS)
 
 #if defined(PER_CHANNEL_QUANTIZATION)
-        output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));
-        output_shift      = VLOAD(N0)(0, (__global int *)(out_shift_addr));
+        VEC_INT output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));
+        VEC_INT output_shift      = VLOAD(N0)(0, (__global int *)(out_shift_addr));
+
+        VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, output_multiplier, output_shift, N0);
+        VEC_INT res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);
+        res                   = select(res_shift_lt0, res_shift_gt0, (VEC_INT)(output_shift) >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+#if OUTPUT_SHIFT < 0
+        res   = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
+#else  // OUTPUT_SHIFT < 0
+        res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
+#endif // OUTPUT_OFFSET < 0
 #endif // defined(PER_CHANNEL_QUANTIZATION)
 
-        res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);
         res += (VEC_INT)OUTPUT_OFFSET;
 
         VEC_TYPE(VEC_SIZE)
@@ -1726,5 +1792,5 @@
     }
 #endif // DEPTH_MULTIPLIER > 1
 }
-#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET)
+#endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER)
 #endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/direct_convolution_quantized.cl
index 1182428..37fd9a0 100644
--- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/direct_convolution_quantized.cl
@@ -25,7 +25,7 @@
 
 #undef CONVERT_SAT
 
-#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
 
 #if KERNEL_SIZE == 9
 
@@ -194,6 +194,8 @@
  * @note The convolution stride x must be passed at compile time using -DSTRIDE_X e.g. -DSTRIDE_X=1
  * @note The third dimensions of the weights tensors must be passed at compile time using -DWEIGHTS_DEPTH
  * @note If biases are used then -DHAS_BIAS has to be passed at compile time
+ * @note The output quantization multiplier must be passed at compile time using -DOUTPUT_MULTIPLIER e.g. -DOUTPUT_MULTIPLIER=1234
+ * @note The output quantization shift must be passed at compile time using -DOUTPUT_SHIFT e.g. -DOUTPUT_SHIFT=4
  *
  * @param[in]  src_ptr                               Pointer to the source tensor. Supported data types: QASYMM8
  * @param[in]  src_stride_x                          Stride of the source tensor in X dimension (in bytes)
@@ -227,8 +229,6 @@
  * @param[in]  input_offset                          Input offset quantization parameter
  * @param[in]  weight_offset                         Weights offset quantization parameter
  * @param[in]  output_offset                         Output offset quantization parameter
- * @param[in]  output_multiplier                     Output integer multiplier quantization parameter
- * @param[in]  output_shift                          Output integer shift quantization parameter
  */
 __kernel void direct_convolution_quantized(
     TENSOR3D_DECLARATION(src),
@@ -240,9 +240,7 @@
     unsigned int weights_stride_w,
     int          input_offset,
     int          weight_offset,
-    int          output_offset,
-    int          output_multiplier,
-    int          output_shift)
+    int          output_offset)
 {
     Image    src     = CONVERT_TO_IMAGE_STRUCT(src);
     Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
@@ -294,9 +292,13 @@
     pixels0 += (int8)(*bias_addr);
 #endif /* defined(HAS_BIAS) */
 
-    pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, output_multiplier, output_shift, 8);
+#if OUTPUT_SHIFT < 0
+    pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#else  // OUTPUT_SHIFT < 0
+    pixels0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(pixels0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
+#endif // OUTPUT_SHIFT < 0
     pixels0 = pixels0 + output_offset;
 
     vstore8(convert_uchar8_sat(pixels0), 0, (__global uchar *)dst.ptr);
 }
-#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH)
+#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 7a97fa6..fa08b14 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -1673,9 +1673,17 @@
     int4            result_multipliers_values = vload4(0, (__global int *)result_multipliers_addr);
     int4            result_shifts_values      = vload4(0, (__global int *)result_shifts_addr);
 
-    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
-#else  // !defined(PER_CHANNEL_QUANTIZATION)
+    int4 in_s32_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
+    int4 in_s32_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, result_multipliers_values, result_shifts_values, 4);
+    in_s32                = select(in_s32_shift_lt0, in_s32_shift_gt0, result_shifts_values >= 0);
+#else // defined(PER_CHANNEL_QUANTIZATION)
+
+#if RESULT_SHIFT < 0
+    in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#else  // RESULT_SHIFT >= 0
     in_s32 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(in_s32, RESULT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
+
 #endif // defined(PER_CHANNEL_QUANTIZATION)
 
     // Add the offset terms to GEMM's result
@@ -1768,7 +1776,11 @@
     // Multiply by result_mult_int and shift
     input_values *= RESULT_MULT_INT;
 
+#if RESULT_SHIFT < 0
+    input_values >>= -RESULT_SHIFT;
+#else  // RESULT_SHIFT >= 0
     input_values >>= RESULT_SHIFT;
+#endif // RESULT_SHIFT < 0
 
     uchar4 res = convert_uchar4_sat(input_values);
 
@@ -1850,7 +1862,11 @@
 #endif // defined(ADD_BIAS)
 
     // Multiply by result_mult_int and shift
+#if RESULT_SHIFT < 0
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#else  // RESULT_SHIFT >= 0
     input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
+#endif // RESULT_SHIFT < 0
 
     // Add the offset terms to GEMM's result
     input_values += (int4)RESULT_OFFSET_AFTER_SHIFT;
@@ -1937,7 +1953,7 @@
 
     // Multiply by result_mult_int and shift
 #if RESULT_SHIFT < 0
-    input_values = ASYMM_MULT(input_values * (1 << (-RESULT_SHIFT)), RESULT_FIXEDPOINT_MULTIPLIER, 4);
+    input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
 #else  // RESULT_SHIFT >= 0
     input_values = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(input_values, RESULT_FIXEDPOINT_MULTIPLIER, RESULT_SHIFT, 4);
 #endif // RESULT_SHIFT < 0
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index f7eff75..09409dc 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -369,6 +369,8 @@
 
 #define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
 #define ASYMM_MULT(a, b, size) asymm_mult##size(a, b)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
+    ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
 #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
     ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
 #define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)