COMPMID-665 - NEON: Add QASYMM8 in-place Activation layer

- Added min and max arguments for QuantizeDownInt32ToUint8Scale in order
  to apply bounded ReLU

- Added support for int32_t biases

- Extended tests

Change-Id: I015dae17faa7284766b5435ca33bcf593c1b2b69
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/96512
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
index 04b8433..8c1bae9 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpOffsetContributionKernel.h
@@ -35,6 +35,13 @@
 * This kernel takes a final int32 accumulator value (the output of @ref NEGEMMLowpMatrixMultiplyKernel),
  * and adds to it the offset contribution of matrix A and matrix B in-place.
  *
+ * The final result is:
+ *
+ * mm_result[i][k] = mm_result[i][k] +
+ *                   (vector_sum_col[k] * a_offset) +
+ *                   (vector_sum_row[i] * b_offset) +
+ *                   (a_offset * b_offset * k)
+ *
  */
 class NEGEMMLowpOffsetContributionKernel : public INEKernel
 {
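
For reference, the in-place update documented above amounts to the following scalar computation (a hypothetical standalone sketch, not part of this patch; here j indexes output columns, k is the GEMM depth, and the assumption is that vector_sum_col holds the column sums of matrix B while vector_sum_row holds the row sums of matrix A):

    #include <cstdint>

    // Scalar model of the offset contribution: mm_result is an M x N row-major
    // accumulator produced by the matrix multiply kernel.
    void offset_contribution(int32_t *mm_result, const int32_t *vector_sum_col,
                             const int32_t *vector_sum_row, int M, int N, int k,
                             int32_t a_offset, int32_t b_offset)
    {
        for(int i = 0; i < M; ++i)
        {
            for(int j = 0; j < N; ++j)
            {
                mm_result[i * N + j] += (vector_sum_col[j] * a_offset)
                                      + (vector_sum_row[i] * b_offset)
                                      + (a_offset * b_offset * k);
            }
        }
    }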
diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
index 65f1042..4ec0e9d 100644
--- a/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
+++ b/arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h
@@ -36,7 +36,10 @@
  * The following computations will be performed by the kernel:
  *
  *  -# Add offset terms to final result
- *  -# Multiply each entry of result and round to nearest integer
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds
  *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
  *
  */
@@ -56,22 +59,44 @@
     /** Initialise the kernel's input and output.
      *
     * @param[in]  input           Input tensor. Data type supported: S32
+    * @param[in]  bias            Biases tensor. Only shared biases are supported; it can be nullptr if the biases addition is not required.
+    *                             The bias is a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
    * @param[out] output          Output tensor. Data type supported: QASYMM8
     * @param[in]  result_offset   Offset to be added to each element of the input matrix
    * @param[in]  result_mult_int Value to be multiplied by each element of the input matrix once the result_offset has been added
     * @param[in]  result_shift    Number of bits to shift right the result before converting back to QASYMM8
+    * @param[in]  min             (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+    * @param[in]  max             (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+    *                             Along with @p min, this value can be used to implement a "rectified linear unit" activation function.
      */
-    void configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift);
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-    const ITensor *_input;
-    ITensor       *_output;
-    int32_t        _result_offset;
-    int32_t        _result_mult_int;
-    int32_t        _result_shift;
+    /** Template function to run the NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <bool is_bounded_relu>
+    void run(const Window &window);
+
+    /** Common signature for all the specialised NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using QuantizeDownFunctionPtr = void (NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::*)(const Window &window);
+
+    QuantizeDownFunctionPtr _func;
+    const ITensor          *_input;
+    const ITensor          *_bias;
+    ITensor                *_output;
+    int                     _result_offset;
+    int                     _result_mult_int;
+    int                     _result_shift;
+    int                     _min;
+    int                     _max;
 };
 } // namespace arm_compute
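
The full quantize-down pipeline documented above can be modelled per element as below (a hypothetical scalar sketch mirroring the reference implementation further down in this patch; min == max disables the bounded-ReLU clamp, matching the kernel's convention):

    #include <algorithm>
    #include <cstdint>

    // Scalar model of NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel: one S32
    // accumulator in, one QASYMM8 value out. bias may be nullptr.
    uint8_t quantize_down(int32_t acc, const int32_t *bias, int32_t result_offset,
                          int32_t result_mult_int, int32_t result_shift,
                          int32_t min, int32_t max)
    {
        int32_t result = (acc + result_offset) * result_mult_int;
        if(bias != nullptr)
        {
            result += *bias; // shared bias element for this output column
        }
        result >>= result_shift;
        if(min != max) // bounded ReLU requested
        {
            result = std::max(min, std::min(max, result));
        }
        // Final saturation to the QASYMM8 range
        return static_cast<uint8_t>(std::max(0, std::min(255, result)));
    }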
 
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
index 8557ef4..a3db23a 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h
@@ -43,14 +43,18 @@
  *  NEGEMMLowpQuantizeDownInt32ToUint8Scale depends on 3 parameters: result_offset, result_mult_int, result_shift
  *  The final result is:
  *
- *  ((input[i][k] + result_offset) * result_mult_int + rounding) >> result_shift
+ *  ((input[i][k] + result_offset) * result_mult_int) >> result_shift
  *
- *  where rounding = (result_shift < 1) ? 0 : (1 << (result_shift - 1))
+ * If the bias tensor is provided, the final result is:
+ *
+ *  ((input[i][k] + result_offset) * result_mult_int + bias[k]) >> result_shift
  *
  *  This function calls the following NEON kernels:
  *
  * -# @ref NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel
  *
+ * @note The function also accepts two optional input arguments (min and max) which can be used to implement a bounded "rectified linear unit"
+ *       activation function on the final uint8 result
 */
 class NEGEMMLowpQuantizeDownInt32ToUint8Scale : public INESimpleFunction
 {
@@ -58,12 +62,17 @@
     /** Initialise the kernel's inputs, output
     *
     * @param[in]  input           Input tensor. It is the output of @ref NEGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+    * @param[in]  bias            Biases tensor. Only shared biases are supported; it can be nullptr if the addition of biases is not required.
+    *                             The bias is a 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
    * @param[out] output          Output tensor. Data type supported: QASYMM8
    * @param[in]  result_offset   Offset to be added to each element of the input matrix
    * @param[in]  result_mult_int Value to be multiplied by each element of the input matrix once the result_offset has been added
    * @param[in]  result_shift    Number of bits to shift right the result before converting back to QASYMM8
+    * @param[in]  min             (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+    * @param[in]  max             (Optional) Max value used to saturate up the output result before converting back to QASYMM8.
+    *                             Along with @p min, this value can be used to implement a "rectified linear unit" activation function.
     */
-    void configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift);
+    void configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min = 0, int max = 0);
 };
 }
 #endif /*__ARM_COMPUTE_NEGEMMLOWPOUTPUTSTAGE_H__ */
\ No newline at end of file
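
A hypothetical usage sketch of the extended interface (tensor initialisation and allocation are elided, and the parameter values are illustrative only):

    #include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
    #include "arm_compute/runtime/Tensor.h"

    using namespace arm_compute;

    // Quantize S32 GEMM accumulators down to QASYMM8 with a shared bias and a
    // bounded ReLU clamping the output to [32, 192].
    void run_output_stage(Tensor &gemm_output, Tensor &biases, Tensor &quantized_out)
    {
        NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
        output_stage.configure(&gemm_output,   // S32, e.g. from NEGEMMLowpMatrixMultiplyCore
                               &biases,        // 1D S32 tensor of shape [OFM]; pass nullptr to skip the bias
                               &quantized_out, // QASYMM8
                               /* result_offset   */ 2,
                               /* result_mult_int */ 1,
                               /* result_shift    */ 3,
                               /* min */ 32,
                               /* max */ 192); // leaving min == max == 0 disables the clamp
        output_stage.run();
    }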
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index aa3c280..26aaa2a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -23,10 +23,12 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
 
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
@@ -36,26 +38,173 @@
 
 using namespace arm_compute;
 
+namespace
+{
+inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
+{
+    // Add the offset terms to GEMM's result
+    in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
+    in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
+    in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
+    in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
+
+    // Multiply by result_mult_int
+    in_s32.val[0] = vmulq_n_s32(in_s32.val[0], result_mult_int);
+    in_s32.val[1] = vmulq_n_s32(in_s32.val[1], result_mult_int);
+    in_s32.val[2] = vmulq_n_s32(in_s32.val[2], result_mult_int);
+    in_s32.val[3] = vmulq_n_s32(in_s32.val[3], result_mult_int);
+}
+
+template <bool is_bounded_relu>
+inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8)
+{
+    const static int32x4_t zero_s32 = vdupq_n_s32(0);
+
+    // Shift final result (negative value shift right)
+    in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
+    in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
+    in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
+    in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
+
+    // Saturate negative values
+    in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
+    in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
+    in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
+    in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
+
+    // Convert S32 to S16
+    const int16x8x2_t in_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
+        }
+    };
+
+    // Convert S16 to U8
+    uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
+
+    if(is_bounded_relu)
+    {
+        out_u8 = vmaxq_u8(out_u8, min_u8);
+        out_u8 = vminq_u8(out_u8, max_u8);
+    }
+
+    return out_u8;
+}
+} // namespace
+
 namespace arm_compute
 {
 class Coordinates;
 } // namespace arm_compute
 
+template <bool is_bounded_relu>
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window)
+{
+    const int32x4_t  result_offset_s32 = vdupq_n_s32(_result_offset);
+    const int32x4_t  result_shift_s32  = vdupq_n_s32(-_result_shift);
+    const uint8x16_t min_u8            = vdupq_n_u8(static_cast<uint8_t>(_min));
+    const uint8x16_t max_u8            = vdupq_n_u8(static_cast<uint8_t>(_max));
+
+    ARM_COMPUTE_UNUSED(min_u8);
+    ARM_COMPUTE_UNUSED(max_u8);
+
+    Iterator in(_input, window);
+    Iterator out(_output, window);
+
+    if(_bias != nullptr)
+    {
+        Window win_biases;
+        win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
+        win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+        Iterator bias(_bias, win_biases);
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            int32x4x4_t in_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
+                }
+            };
+
+            const int32x4x4_t bias_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(bias.ptr()) + 12)
+                }
+            };
+
+            // Add the offset terms to GEMM's result and multiply by result_mult_int
+            scale_input(in_s32, result_offset_s32, _result_mult_int);
+
+            // Add the bias to GEMM's result
+            in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+            in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+            in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+            in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+            vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+        },
+        in, bias, out);
+    }
+    else
+    {
+        execute_window_loop(window, [&](const Coordinates & id)
+        {
+            int32x4x4_t in_s32 =
+            {
+                {
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
+                    vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
+                }
+            };
+
+            // Add the offset terms to GEMM's result and multiply by result_mult_int
+            scale_input(in_s32, result_offset_s32, _result_mult_int);
+
+            vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
+        },
+        in, out);
+    }
+}
+
 NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel()
-    : _input(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0)
+    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0), _min(0), _max(0)
 {
 }
 
-void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift)
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_ERROR_ON(max > 255);
+    ARM_COMPUTE_ERROR_ON(min < 0 || min > max);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0));
+    }
 
     _input           = input;
+    _bias            = bias;
     _output          = output;
     _result_offset   = result_offset;
     _result_mult_int = result_mult_int;
     _result_shift    = result_shift;
+    _min             = min;
+    _max             = max;
 
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
@@ -69,9 +218,22 @@
                               input_access,
                               output_result_access);
 
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]);
+
+        update_window_and_padding(win,
+                                  bias_access);
+    }
+
     output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
 
     INEKernel::configure(win);
+
+    // Check if we need to clamp the result using min and max
+    const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
+
+    _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
 }
 
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info)
@@ -80,62 +242,5 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const int32x4_t result_offset_s32 = vdupq_n_s32(_result_offset);
-    const int32x4_t result_shift_s32  = vdupq_n_s32(-_result_shift);
-    const int32x4_t zero_s32          = vdupq_n_s32(0);
-
-    Iterator in(_input, window);
-    Iterator out(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        int32x4x4_t in_s32 =
-        {
-            {
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
-            }
-        };
-
-        // Add the offset terms to GEMM's result
-        in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
-        in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
-        in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
-        in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
-
-        // Multiply by c_mult_int
-        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _result_mult_int);
-        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _result_mult_int);
-        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _result_mult_int);
-        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _result_mult_int);
-
-        // Shift final result (negative value shift right)
-        in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
-        in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
-        in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
-        in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
-
-        // Saturate negative values
-        in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
-        in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
-        in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
-        in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
-
-        // Convert S32 to S16
-        const int16x8x2_t in_s16 =
-        {
-            {
-                vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-                vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-            }
-        };
-
-        // Convert S16 to U8
-        const uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
-
-        vst1q_u8(out.ptr(), out_u8);
-    },
-    in, out);
+    (this->*_func)(window);
 }
\ No newline at end of file
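
Two implementation details above are worth calling out: vshlq_s32 is fed the negated shift amount because NEON's variable shift-left interprets negative lane values as an arithmetic shift right, and the bounded-ReLU decision is hoisted out of the inner loop by selecting a template specialisation through a member-function pointer at configure time. A minimal standalone sketch of that dispatch pattern (hypothetical, not part of this patch):

    #include <iostream>

    class Kernel
    {
    public:
        void configure(int min, int max)
        {
            // Clamping is a no-op when min == max (disabled) or when the bounds
            // already cover the full uint8 range.
            const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
            _func = is_bounded_relu ? &Kernel::run_impl<true> : &Kernel::run_impl<false>;
        }

        void run()
        {
            (this->*_func)();
        }

    private:
        template <bool is_bounded_relu>
        void run_impl()
        {
            std::cout << (is_bounded_relu ? "clamped path\n" : "plain path\n");
        }

        using FuncPtr = void (Kernel::*)();
        FuncPtr _func{ nullptr };
    };

    int main()
    {
        Kernel k;
        k.configure(0, 255); // full range: plain path
        k.run();
        k.configure(0, 100); // bounded ReLU: clamped path
        k.run();
    }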
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index d09827f..66cdf58 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -29,9 +29,9 @@
 
 using namespace arm_compute;
 
-void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift)
+void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
-    k->configure(input, output, result_offset, result_mult_int, result_shift);
+    k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
     _kernel = std::move(k);
 }
\ No newline at end of file
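
The function itself only builds the kernel and hands it to the base class; execution comes from INESimpleFunction. A simplified sketch of what that base class does at run time (from memory, with border handling elided, so treat the exact body as an assumption):

    // Assumed shape of INESimpleFunction::run(): dispatch the stored kernel
    // through the NEON scheduler, parallelising over the Y dimension.
    void INESimpleFunction::run()
    {
        NEScheduler::get().schedule(_kernel.get(), Window::DimY);
    }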
diff --git a/tests/validation/CPP/GEMMLowp.cpp b/tests/validation/CPP/GEMMLowp.cpp
index 8670a22..bf002cf 100644
--- a/tests/validation/CPP/GEMMLowp.cpp
+++ b/tests/validation/CPP/GEMMLowp.cpp
@@ -33,6 +33,36 @@
 {
 namespace reference
 {
+namespace
+{
+template <typename T>
+void quantize_down_int32_to_uint8_scale(const SimpleTensor<T> *in, const SimpleTensor<T> *bias, SimpleTensor<uint8_t> *dst, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
+                                        int32_t min, int32_t max)
+{
+    const int cols_in = in->shape().x();
+
+    for(int i = 0; i < in->num_elements(); ++i)
+    {
+        int32_t result = ((*in)[i] + result_offset) * result_mult_int;
+
+        if(bias != nullptr)
+        {
+            result += (*bias)[i % cols_in];
+        }
+
+        result >>= result_shift;
+
+        // Bounded ReLU
+        if(min != max)
+        {
+            result = std::max(min, std::min(max, result));
+        }
+
+        (*dst)[i] = static_cast<uint8_t>(std::max(0, std::min(255, result)));
+    }
+}
+} // namespace
+
 template <typename T>
 SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<T> &a, const SimpleTensor<T> &b, int32_t a_offset, int32_t b_offset)
 {
@@ -80,21 +110,31 @@
 }
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max)
 {
     SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
 
-    for(int i = 0; i < in.num_elements(); ++i)
-    {
-        const int32_t result = ((in[i] + result_offset) * result_mult_int) >> result_shift;
-        dst[i]               = static_cast<uint8_t>(std::max(0, std::min(255, result)));
-    }
+    quantize_down_int32_to_uint8_scale<T>(&in, nullptr, &dst, result_offset, result_mult_int, result_shift, min, max);
+
+    return dst;
+}
+
+template <typename T>
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
+                                                                  int32_t min, int32_t max)
+{
+    SimpleTensor<uint8_t> dst(in.shape(), DataType::QASYMM8);
+
+    quantize_down_int32_to_uint8_scale<T>(&in, &bias, &dst, result_offset, result_mult_int, result_shift, min, max);
 
     return dst;
 }
 
 template SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<uint8_t> &a, const SimpleTensor<uint8_t> &b, int32_t a_offset, int32_t b_offset);
-template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min,
+                                                                           int32_t max);
+template SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<int32_t> &a, const SimpleTensor<int32_t> &b, int32_t result_offset, int32_t result_mult_int,
+                                                                           int32_t result_shift, int32_t min, int32_t max);
 } // namespace reference
 } // namespace validation
 } // namespace test
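
Note how the reference broadcasts the shared bias: indexing a 1D [OFM] tensor with i % cols_in re-applies the same bias vector to every row of the flattened input. A hypothetical standalone sketch of that idiom:

    #include <cstdint>
    #include <vector>

    // Add a shared 1D bias of length cols_in to every row of a flattened
    // 2D accumulator of size rows * cols_in.
    void add_shared_bias(std::vector<int32_t> &acc, const std::vector<int32_t> &bias)
    {
        const int cols_in = static_cast<int>(bias.size());
        for(std::size_t i = 0; i < acc.size(); ++i)
        {
            acc[i] += bias[i % cols_in]; // element i lives in column i % cols_in
        }
    }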
diff --git a/tests/validation/CPP/GEMMLowp.h b/tests/validation/CPP/GEMMLowp.h
index cbed2206..ee33d8e 100644
--- a/tests/validation/CPP/GEMMLowp.h
+++ b/tests/validation/CPP/GEMMLowp.h
@@ -35,14 +35,17 @@
 {
 namespace reference
 {
+SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b);
+
 template <typename T>
 SimpleTensor<int32_t> gemmlowp_matrix_multiply_core(const SimpleTensor<T> &a, const SimpleTensor<T> &b, int32_t a_offset, int32_t b_offset);
 
 template <typename T>
-SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift);
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min = 0, int32_t max = 0);
 
-SimpleTensor<int32_t> gemmlowp(const SimpleTensor<int8_t> &a, const SimpleTensor<int8_t> &b);
-
+template <typename T>
+SimpleTensor<uint8_t> gemmlowp_quantize_down_int32_to_uint8_scale(const SimpleTensor<T> &in, const SimpleTensor<T> &bias, int32_t result_offset, int32_t result_mult_int, int32_t result_shift,
+                                                                  int32_t min = 0, int32_t max = 0);
 } // namespace reference
 } // namespace validation
 } // namespace test
diff --git a/tests/validation/NEON/GEMMLowp.cpp b/tests/validation/NEON/GEMMLowp.cpp
index ba91ced..078096a 100644
--- a/tests/validation/NEON/GEMMLowp.cpp
+++ b/tests/validation/NEON/GEMMLowp.cpp
@@ -131,34 +131,55 @@
 
 TEST_SUITE(QuantizeDownInt32ToUint8Scale)
 
+const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1, 2)
+                                                      * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0) * framework::dataset::make("max", 0)
+                                                      * framework::dataset::make("addBias", { false, true });
+
+const auto quantize_down_int32_to_uint8_scale_relu_cases = framework::dataset::make("result_offset", -2, 2) * framework::dataset::make("result_mult_int", 1, 2)
+                                                           * framework::dataset::make("result_shift", 2, 3) * framework::dataset::make("min", 0, 2)
+                                                           * framework::dataset::make("max", 171, 174) * framework::dataset::make("addBias", { false, true });
+
 using NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture = GEMMLowpQuantizeDownInt32ToUint8ScaleValidationFixture<Tensor, Accessor, NEGEMMLowpQuantizeDownInt32ToUint8Scale>;
 
-const auto quantize_down_int32_to_uint8_scale_cases = framework::dataset::make("result_offset", -4, 4) * framework::dataset::make("result_mult_int", 1, 3) * framework::dataset::make("result_shift", 2,
-                                                      4);
-
 DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(framework::dataset::concat(datasets::SmallShapes(), datasets::LargeShapes()), quantize_down_int32_to_uint8_scale_cases),
-               shape, result_offset, result_mult_int, result_shift)
+               shape, result_offset, result_mult_int, result_shift, min, max, add_bias)
 {
+    TensorShape shape_bias(shape[0]);
+
     // Create tensors
-    Tensor in  = create_tensor<Tensor>(shape, DataType::S32);
-    Tensor out = create_tensor<Tensor>(shape, DataType::QASYMM8);
+    Tensor in   = create_tensor<Tensor>(shape, DataType::S32);
+    Tensor bias = create_tensor<Tensor>(shape_bias, DataType::S32);
+    Tensor out  = create_tensor<Tensor>(shape, DataType::QASYMM8);
 
     ARM_COMPUTE_EXPECT(in.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(bias.info()->is_resizable(), framework::LogLevel::ERRORS);
     ARM_COMPUTE_EXPECT(out.info()->is_resizable(), framework::LogLevel::ERRORS);
 
     // Create and configure function
     NEGEMMLowpQuantizeDownInt32ToUint8Scale output_stage;
-    output_stage.configure(&in, &out, result_offset, result_mult_int, result_shift);
+    output_stage.configure(&in, add_bias ? &bias : nullptr, &out, result_offset, result_mult_int, result_shift, min, max);
 
-    // Validate valid region
+    // Validate valid regions of input and output
     const ValidRegion valid_region = shape_to_valid_region(shape);
     validate(in.info()->valid_region(), valid_region);
     validate(out.info()->valid_region(), valid_region);
 
+    // Validate valid region of bias
+    if(add_bias)
+    {
+        const ValidRegion valid_region_bias = shape_to_valid_region(shape_bias);
+        validate(bias.info()->valid_region(), valid_region_bias);
+    }
+
     // Validate padding
     const PaddingSize padding = PaddingCalculator(shape.x(), 16).required_padding();
     validate(in.info()->padding(), padding);
     validate(out.info()->padding(), padding);
+
+    if(add_bias)
+    {
+        validate(bias.info()->padding(), padding);
+    }
 }
 
 FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_cases))
@@ -173,8 +194,35 @@
     validate(Accessor(_target), _reference);
 }
 
-TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale
+TEST_SUITE(BoundedReLu)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
 
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // BoundedReLu
+
+TEST_SUITE(AddBias)
+FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::ALL, combine(datasets::SmallShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+
+FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpQuantizeDownInt32ToUint8ScaleFixture, framework::DatasetMode::NIGHTLY, combine(datasets::LargeShapes(), quantize_down_int32_to_uint8_scale_relu_cases))
+{
+    // Validate output
+    validate(Accessor(_target), _reference);
+}
+TEST_SUITE_END() // AddBias
+
+TEST_SUITE_END() // QuantizeDownInt32ToUint8Scale
 TEST_SUITE_END() // OutputStage
 
 TEST_SUITE_END() // GEMMLowp
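
For a sense of the coverage, the dataset algebra above combines parameter sets with '*'. Assuming make("name", start, end) yields the half-open integer range [start, end) and '*' is a cartesian product (both assumptions about the test framework), the non-ReLU cases enumerate as:

    #include <cstdio>

    int main()
    {
        int combinations = 0;
        for(int result_offset = -2; result_offset < 2; ++result_offset)
            for(int result_mult_int = 1; result_mult_int < 2; ++result_mult_int)
                for(int result_shift = 2; result_shift < 3; ++result_shift)
                    for(bool add_bias : { false, true })
                    {
                        std::printf("offset=%d mult=%d shift=%d min=0 max=0 bias=%d\n",
                                    result_offset, result_mult_int, result_shift,
                                    static_cast<int>(add_bias));
                        ++combinations;
                    }
        std::printf("%d combinations per shape\n", combinations); // 4 * 1 * 1 * 2 = 8
    }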
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index f9b0dbd..a99e932 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -122,10 +122,10 @@
 {
 public:
     template <typename...>
-    void setup(TensorShape shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
+    void setup(TensorShape shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias)
     {
-        _target    = compute_target(shape, result_offset, result_mult_int, result_shift);
-        _reference = compute_reference(shape, result_offset, result_mult_int, result_shift);
+        _target    = compute_target(shape, result_offset, result_mult_int, result_shift, min, max, add_bias);
+        _reference = compute_reference(shape, result_offset, result_mult_int, result_shift, min, max, add_bias);
     }
 
 protected:
@@ -136,43 +136,72 @@
         library->fill(tensor, distribution, i);
     }
 
-    TensorType compute_target(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
+    TensorType compute_target(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias)
     {
+        TensorShape shape_bias(shape[0]);
+
         // Create tensors
         TensorType a = create_tensor<TensorType>(shape, DataType::S32, 1);
-        TensorType b = create_tensor<TensorType>(shape, DataType::QASYMM8, 1);
+        TensorType b = create_tensor<TensorType>(shape_bias, DataType::S32, 1);
+        TensorType c = create_tensor<TensorType>(shape, DataType::QASYMM8, 1);
 
         // Create and configure function
         FunctionType output_stage;
-        output_stage.configure(&a, &b, result_offset, result_mult_int, result_shift);
+        output_stage.configure(&a, add_bias ? &b : nullptr, &c, result_offset, result_mult_int, result_shift, min, max);
 
         ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
 
         // Allocate tensors
         a.allocator()->allocate();
-        b.allocator()->allocate();
+        c.allocator()->allocate();
 
         ARM_COMPUTE_EXPECT(!a.info()->is_resizable(), framework::LogLevel::ERRORS);
-        ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!c.info()->is_resizable(), framework::LogLevel::ERRORS);
 
-        // Fill tensors
+        // Fill tensor
         fill(AccessorType(a), 0);
 
+        if(add_bias)
+        {
+            ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+            // Allocate bias tensor
+            b.allocator()->allocate();
+
+            ARM_COMPUTE_EXPECT(!b.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+            // Fill bias tensor
+            fill(AccessorType(b), 1);
+        }
+
         // Compute GEMM function
         output_stage.run();
-        return b;
+        return c;
     }
 
-    SimpleTensor<uint8_t> compute_reference(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift)
+    SimpleTensor<uint8_t> compute_reference(const TensorShape &shape, int32_t result_offset, int32_t result_mult_int, int32_t result_shift, int32_t min, int32_t max, bool add_bias)
     {
         // Create reference
+        TensorShape shape_bias(shape[0]);
+
         SimpleTensor<int32_t> a{ shape, DataType::S32, 1 };
+        SimpleTensor<int32_t> b{ shape_bias, DataType::S32, 1 };
 
         // Fill reference
         fill(a, 0);
 
-        return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(a, result_offset, result_mult_int, result_shift);
+        if(add_bias)
+        {
+            // Fill bias
+            fill(b, 1);
+
+            return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(a, b, result_offset, result_mult_int, result_shift, min, max);
+        }
+        else
+        {
+            return reference::gemmlowp_quantize_down_int32_to_uint8_scale<int32_t>(a, result_offset, result_mult_int, result_shift, min, max);
+        }
     }
 
     TensorType            _target{};