COMPMID-1413 - Improve the performance of GEMMLowp with 8 bit dot product on OpenCL
COMPMID-1424 - Add dot product support for CLDepthwise QASYMM8 3x3 NHWC non-unit stride

With this patch we are able to improve the performance of MobileNet v1-qasymm8 by 37 %
Tried to use the dot product instruction in CLDepthwise QASYMM8 3x3 NHWC non-unit stride
but I have not seen any benefit (maybe because we have few arithemtic operation and we
do not have more load instructions). However Depthwise convolution has been improved by
30%

Change-Id: Id768a99c2e53a04276707e427af5d0ec93419ada
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/155082
Tested-by: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
index eb561fa..19cc649 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NCHWKernel.cpp
@@ -246,13 +246,12 @@
         int   output_shift      = 0;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
+        build_opts.add_option("-DREAL_MULTIPLIER=" + support::cpp11::to_string(multiplier));
         build_opts.add_option("-DCONV_STRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
         build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset));
         build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset));
         build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset));
         build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
-        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
-        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
 
         if(act_info.enabled())
         {
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
index d3bed87..93d96da 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3NHWCKernel.cpp
@@ -159,8 +159,9 @@
     ARM_COMPUTE_ERROR_ON(conv_stride_x < 1 || conv_stride_x > 2);
     ARM_COMPUTE_ERROR_ON(std::max(conv_info.pad_top(), conv_info.pad_bottom()) > 1);
 
-    const bool is_qasymm   = is_data_type_quantized_asymmetric(input->info()->data_type());
-    const bool is_stride_1 = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_qasymm         = is_data_type_quantized_asymmetric(input->info()->data_type());
+    const bool is_stride_1       = ((conv_info.stride().first == conv_info.stride().second) && (conv_info.stride().first == 1));
+    const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
 
     _input                              = input;
     _output                             = output;
@@ -169,7 +170,14 @@
     _conv_stride_y                      = conv_info.stride().second;
     _num_rows_processed_per_iteration   = is_stride_1 ? 2 : 1;
     _num_planes_processed_per_iteration = is_stride_1 ? 2 : 1;
-    _border_size                        = BorderSize(conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
+
+    // If QASYMM8 and the 8 bit dot product is available, force _num_planes_processed_per_iteration to 1
+    if(is_dot8_supported && is_qasymm)
+    {
+        _num_planes_processed_per_iteration = 1;
+    }
+
+    _border_size = BorderSize(is_qasymm && is_stride_1 ? 0 : conv_info.pad_left(), 0, std::max(std::max(conv_info.pad_right(), conv_info.pad_bottom()), conv_info.pad_top()), 0);
 
     const unsigned int num_elems_accessed_per_iteration = is_qasymm ? 4 : (8 / input->info()->element_size());
 
@@ -187,13 +195,12 @@
         int   output_shift      = 0;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
+        build_opts.add_option("-DREAL_MULTIPLIER=" + support::cpp11::to_string(multiplier));
         build_opts.add_option("-DSRC_DIM_1=" + support::cpp11::to_string(_input->info()->dimension(1)));
         build_opts.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-_input->info()->quantization_info().offset));
         build_opts.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-_weights->info()->quantization_info().offset));
         build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(_output->info()->quantization_info().offset));
         build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(9 * input->info()->quantization_info().offset * weights->info()->quantization_info().offset));
-        build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
-        build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
 
         if(act_info.enabled())
         {
@@ -240,9 +247,8 @@
     }
 
     // Create kernel
-    const bool  is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
-    std::string kernel_name       = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
-                                                                                                                          && is_stride_1 /* FIXME (COMPMID-1424) */) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
+    std::string kernel_name = std::string("depthwise_convolution_3x3") + (is_qasymm ? std::string("_quantized") + ((is_dot8_supported
+                                                                                                                    && is_stride_1) ? "_dot8" : "") : "") + "_nhwc" + (is_stride_1 ? "_stride1" : "");
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
diff --git a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
index ae54e77..f333c1b 100644
--- a/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
+++ b/src/core/CL/kernels/CLGEMMInterleave4x4Kernel.cpp
@@ -115,7 +115,7 @@
 {
 }
 
-void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d)
+void CLGEMMInterleave4x4Kernel::configure(const ICLTensor *input, ICLTensor *output, int mult_interleave4x4_height, bool reinterpret_input_as_3d, bool unroll_block)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -132,6 +132,7 @@
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
+    build_opts.add_option_if(unroll_block, "-DUNROLL_BLOCK");
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(1)));
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(input->info()->dimension(2)));
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index 99e1840..73b1d41 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -108,6 +108,7 @@
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, bool is_interleaved_transposed,
                                                         const GEMMReshapeInfo &reshape_info, ElementsProcessed &num_elements_processed)
 {
+    const bool    is_dot8_supported                   = dot8_supported(CLKernelLibrary::get().get_device());
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
     bool          reinterpret_input_as_3d             = reshape_info.reinterpret_input_as_3d();
@@ -126,7 +127,7 @@
     }
 
     // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)));
+    auto_init_if_empty(*output, input0->clone()->set_tensor_shape(compute_mm_shape(*input0, *input1, is_interleaved_transposed, reshape_info)).set_data_type(DataType::S32));
 
     TensorInfo tmp_info(*output);
 
@@ -173,8 +174,9 @@
     else
     {
         // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x
-        num_elems_processed_per_iteration_x = 4;
-        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), 5);
+        // Note: if the dot product instruction is available, the 8x2 tile has to be used
+        num_elems_processed_per_iteration_x = is_dot8_supported ? 8 : 4;
+        num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->dimension(1)), is_dot8_supported ? 2 : 4);
 
         // Note: bottom paddings are calculated manually as the output can be reinterpreted as 3D tensor
         // The only way to set properly the paddings, it is to set those explicitly through the AccessWindowStatic
@@ -270,6 +272,7 @@
         //        the correct step which is calculated as (16 * mult_transpose1xW_width) / 4)
 
         build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)));
+        build_opts.add_option("-DMULT_TRANSPOSE1XW_WIDTH=" + support::cpp11::to_string(mult_transpose1xW_width));
         build_opts.add_option("-DTRANSPOSE1XW_WIDTH_STEP=" + support::cpp11::to_string(4 * mult_transpose1xW_width));
         build_opts.add_option("-DMULT_INTERLEAVE4X4_HEIGHT=" + support::cpp11::to_string(mult_interleave4x4_height));
 
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
index 3888353..d348f2c 100644
--- a/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionKernel.cpp
@@ -46,11 +46,18 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
                           int32_t a_offset, int32_t b_offset)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
 
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+    }
+
     // If a_offset == 0, vector_sum_col can be a nullptr
     if(a_offset != 0)
     {
@@ -64,11 +71,11 @@
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
 
         // Check if input is a 3D reinterpretation
-        const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
 
         // Validate input
         ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
-        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row != nullptr && vector_sum_row->dimension(0) != mm_result->dimension(1));
+        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
 
         TensorShape output_shape = mm_result->tensor_shape();
         if(output_shape.num_dimensions() > 1)
@@ -96,7 +103,7 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias,
                                                         int32_t a_offset, int32_t b_offset)
 {
     constexpr unsigned int num_elems_processed_per_iteration = 4;
@@ -119,28 +126,37 @@
         window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
     }
 
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
     Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
 
 CLGEMMLowpOffsetContributionKernel::CLGEMMLowpOffsetContributionKernel()
-    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr)
+    : _vector_sum_col(nullptr), _vector_sum_row(nullptr), _mm_result(nullptr), _bias(nullptr)
 {
 }
 
-void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+void CLGEMMLowpOffsetContributionKernel::configure(ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, int32_t k, int32_t a_offset,
+                                                   int32_t b_offset)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
                                                   vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
                                                   vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+                                                  bias != nullptr ? bias->info() : nullptr,
                                                   a_offset, b_offset)); // NOLINT
 
     _vector_sum_col = vector_sum_col;
     _vector_sum_row = vector_sum_row;
     _mm_result      = mm_result;
+    _bias           = bias;
 
     // Check if input is a 3D reinterpretation
     const bool reinterpret_as_3d = vector_sum_row != nullptr
@@ -161,20 +177,24 @@
     build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
     build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
     build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+
+    std::string kernel_name("gemmlowp_offset_contribution");
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_offset_contribution", build_opts.options()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window(mm_result->info(),
                                                     vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
                                                     vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+                                                    bias != nullptr ? bias->info() : nullptr,
                                                     a_offset, b_offset); // NOLINT
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
     // Set config_id for enabling LWS tuning
-    _config_id = "gemmlowp_offset_contribution_";
+    _config_id = kernel_name + "_";
     _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
     _config_id += "_";
     _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
@@ -182,13 +202,14 @@
     _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
 }
 
-Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
+Status CLGEMMLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
                                                     int32_t a_offset, int32_t b_offset)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
                                                               vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
                                                               vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                                              bias != nullptr ? bias->clone().get() : nullptr,
                                                               a_offset, b_offset)
                                 .first); // NOLINT
 
@@ -214,6 +235,10 @@
     win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
     win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
+    Window biases_slice = slice;
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
     do
     {
         unsigned int idx = 0;
@@ -226,7 +251,11 @@
         {
             add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row);
         }
-        enqueue(queue, *this, slice);
+        if(_bias != nullptr)
+        {
+            add_1D_tensor_argument(idx, _bias, biases_slice);
+        }
+        enqueue(queue, *this, slice, lws_hint());
     }
     while(collapsed.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
new file mode 100644
index 0000000..83af0c6
--- /dev/null
+++ b/src/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.cpp
@@ -0,0 +1,301 @@
+/*
+ * Copyright (c) 2018 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLGEMMLowpOffsetContributionOutputStageKernel.h"
+
+#include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+#include "support/ToolchainSupport.h"
+
+#include <cstddef>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
+                          int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
+    ARM_COMPUTE_RETURN_ERROR_ON(bias == nullptr && a_offset == 0 && b_offset == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_max_bound > 255);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound < 0 || output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0));
+    }
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
+        ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
+    }
+
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    if(b_offset != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
+
+        // Check if input is a 3D reinterpretation
+        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+
+        // Validate input
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
+
+        TensorShape output_shape = mm_result->tensor_shape();
+        if(output_shape.num_dimensions() > 1)
+        {
+            const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
+
+            TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape();
+            vector_sum_row_shape.collapse_from(1);
+            output_shape.collapse_from(output_batch_idx);
+
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
+                                            "mm_result tensor must have the same number of batches of output tensor");
+
+            if(a_offset != 0)
+            {
+                TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
+                vector_sum_col_shape.collapse_from(1);
+
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+            }
+        }
+    }
+
+    if(output->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, ITensorInfo *bias, ITensorInfo *output,
+                                                        int32_t a_offset, int32_t b_offset)
+{
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+    bool                   window_changed                    = false;
+
+    // Auto initialize the output
+    auto_init_if_empty(*output, mm_result->clone()->set_data_type(DataType::QASYMM8));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration));
+
+    AccessWindowHorizontal mm_result_access(mm_result, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win, mm_result_access);
+
+    AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
+    window_changed = window_changed || update_window_and_padding(win, output_access);
+
+    if(a_offset != 0)
+    {
+        AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration);
+        window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
+    }
+    if(b_offset != 0)
+    {
+        AccessWindowStatic vector_sum_row_access(vector_sum_row, 0, 0, vector_sum_row->dimension(0), 0); // NOLINT
+        window_changed = window_changed || update_window_and_padding(win, vector_sum_row_access);
+    }
+
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias, 0, 0, ceil_to_multiple(bias->dimension(0), num_elems_processed_per_iteration), bias->tensor_shape()[1]);
+        window_changed = window_changed || update_window_and_padding(win, bias_access);
+    }
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
+
+CLGEMMLowpOffsetContributionOutputStageKernel::CLGEMMLowpOffsetContributionOutputStageKernel()
+    : _mm_result(nullptr), _vector_sum_col(nullptr), _vector_sum_row(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void CLGEMMLowpOffsetContributionOutputStageKernel::configure(const ICLTensor *mm_result, const ICLTensor *vector_sum_col, const ICLTensor *vector_sum_row, const ICLTensor *bias, ICLTensor *output,
+                                                              int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result->info(),
+                                                  vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
+                                                  vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+                                                  bias != nullptr ? bias->info() : nullptr,
+                                                  output->info(),
+                                                  a_offset, b_offset, output_stage)); // NOLINT
+
+    const int min = output_stage.gemmlowp_min_bound;
+    const int max = output_stage.gemmlowp_max_bound;
+
+    _vector_sum_col = vector_sum_col;
+    _vector_sum_row = vector_sum_row;
+    _mm_result      = mm_result;
+    _bias           = bias;
+    _output         = output;
+
+    // Check if input is a 3D reinterpretation
+    const bool reinterpret_as_3d = vector_sum_row != nullptr
+                                   && mm_result->info()->num_dimensions() > 1
+                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+
+    // Set the arguments to pass at compile time
+    CLBuildOptions build_opts;
+
+    // If a_offset == 0, vector_sum_col can be a nullptr
+    if(a_offset != 0)
+    {
+        build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
+        build_opts.add_option_if(vector_sum_col->info()->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
+    }
+    // If b_offset == 0, vector_sum_row can be a nullptr
+    build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
+    build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
+    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(1)));
+    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->info()->dimension(2)));
+    build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
+    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
+    build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multiplier));
+    build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shift));
+    build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max));
+
+    std::string kernel_name("gemmlowp_offset_contribution");
+
+    // Fuse output stage
+    if(output_stage.type != GEMMLowpOutputStageType::NONE)
+    {
+        kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type);
+    }
+    else
+    {
+        ARM_COMPUTE_ERROR("GEMMLowpOutputStage can not be NONE!");
+    }
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(mm_result->info(),
+                                                    vector_sum_col != nullptr ? vector_sum_col->info() : nullptr,
+                                                    vector_sum_row != nullptr ? vector_sum_row->info() : nullptr,
+                                                    bias != nullptr ? bias->info() : nullptr,
+                                                    output->info(),
+                                                    a_offset, b_offset); // NOLINT
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name + "_";
+    _config_id += support::cpp11::to_string(mm_result->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->info()->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(mm_result->info()->dimension(2));
+}
+
+Status CLGEMMLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
+                                                               const ITensorInfo *output,
+                                                               int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(mm_result->clone().get(),
+                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                                              bias != nullptr ? bias->clone().get() : nullptr,
+                                                              output->clone().get(),
+                                                              a_offset, b_offset)
+                                .first); // NOLINT
+
+    return Status{};
+}
+
+void CLGEMMLowpOffsetContributionOutputStageKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    // Set window for vector_sum_col
+    Window win_vector_sum_col = slice;
+    win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    // Set window for vector_sum_row
+    Window win_vector_sum_row = slice;
+    win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0));
+    win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0));
+    win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0));
+
+    Window biases_slice = slice;
+    biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
+    biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1));
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _mm_result, slice);
+        if(_vector_sum_col != nullptr)
+        {
+            add_2D_tensor_argument(idx, _vector_sum_col, win_vector_sum_col);
+        }
+        if(_vector_sum_row != nullptr)
+        {
+            add_2D_tensor_argument(idx, _vector_sum_row, win_vector_sum_row);
+        }
+        if(_bias != nullptr)
+        {
+            add_1D_tensor_argument(idx, _bias, biases_slice);
+        }
+        add_3D_tensor_argument(idx, _output, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index d403d67..38e0474 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -42,7 +42,7 @@
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                          int min, int max, unsigned int output_3d_depth)
+                          int min, int max)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
@@ -58,10 +58,8 @@
 
     if(output->total_size() != 0)
     {
-        const TensorShape output_shape       = arm_compute::misc::shape_calculator::compute_output_stage_shape(*input, output_3d_depth, true);
-        const TensorInfo  tensor_info_output = output->clone()->set_tensor_shape(output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
     }
 
     return Status{};
@@ -69,7 +67,7 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
 
     // Configure kernel window
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
@@ -103,15 +101,15 @@
 } // namespace arm_compute
 
 CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr), _reinterpret_as_3d(false)
+    : _input(nullptr), _bias(nullptr), _output(nullptr)
 {
 }
 
 Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                                           int min, int max, unsigned int output_3d_depth)
+                                                                           int min, int max)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max, output_3d_depth));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
                                                               (bias != nullptr) ? bias->clone().get() : nullptr,
                                                               output->clone().get())
@@ -122,22 +120,20 @@
 
 void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output,
                                                                           int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift,
-                                                                          int min, int max, unsigned int output_3d_depth)
+                                                                          int min, int max)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output auto inizialitation if not yet initialized
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_output_stage_shape(*input->info(), output_3d_depth, true);
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8).set_tensor_shape(output_shape));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(),
-                                                  min, max, output_3d_depth));
+                                                  min, max));
 
-    _input             = input;
-    _bias              = bias;
-    _output            = output;
-    _reinterpret_as_3d = output_3d_depth > 1;
+    _input  = input;
+    _bias   = bias;
+    _output = output;
 
     // Set the arguments to pass at compile time
     CLBuildOptions build_opts;
@@ -147,7 +143,6 @@
     build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min));
     build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max));
     build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-    build_opts.add_option_if(_reinterpret_as_3d, "-DDST_HEIGHT=" + support::cpp11::to_string(input->info()->tensor_shape().y() / output_3d_depth));
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down_fixedpoint", build_opts.options()));
@@ -177,32 +172,12 @@
         add_1D_tensor_argument(idx1, _bias, biases_slice);
     }
 
-    if(_reinterpret_as_3d)
+    do
     {
-        // Create output window
-        Window window_out;
-        window_out.use_tensor_dimensions(_output->info()->tensor_shape());
-        Window collapsed_out = window_out.collapse_if_possible(window_out, 3);
-        Window slice_out     = collapsed.first_slice_window_4D();
-
-        do
-        {
-            unsigned int idx = 0;
-            add_3D_tensor_argument(idx, _input, slice);
-            add_4D_tensor_argument(idx1, _output, slice_out);
-            enqueue(queue, *this, slice);
-        }
-        while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx1, _output, slice);
+        enqueue(queue, *this, slice);
     }
-    else
-    {
-        do
-        {
-            unsigned int idx = 0;
-            add_3D_tensor_argument(idx, _input, slice);
-            add_3D_tensor_argument(idx1, _output, slice);
-            enqueue(queue, *this, slice);
-        }
-        while(collapsed.slide_window_slice_3D(slice));
-    }
+    while(collapsed.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index 5789113..621bd2b 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -63,7 +63,7 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
 
     // Configure kernel window
     Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
diff --git a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
index 9cf5d1f..225c358 100644
--- a/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpReductionKernel.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CL/kernels/CLGEMMLowpReductionKernel.h"
 
 #include "arm_compute/core/AccessWindowStatic.h"
+#include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -59,7 +60,7 @@
 
     Window win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
 
-    AccessWindowStatic     input_access(input, 0, 0, ceil_to_multiple(input->dimension(0), 16), input->dimension(1));
+    AccessWindowStatic     input_access(input, 0, 0, input->dimension(0), input->dimension(1));
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
 
     bool window_changed = update_window_and_padding(win, input_access, output_access);
@@ -115,8 +116,12 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->info()->dimension(0)));
 
+    const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device());
+
+    std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : "");
+
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_matrix_a_reduction", build_opts.options()));
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     // Configure kernel window
     auto win_config = validate_and_configure_window_matrix_a_reduction(_input->info(), _output->info());