COMPMID-665 - NEON: Add QASYMM8 in place Activation layer

- Added min and max arguments for QuantizeDownInt32ToUint8Scale in order
  to apply bounded relu

- Added support for int32_t biases

- Extended tests

Change-Id: I015dae17faa7284766b5435ca33bcf593c1b2b69
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/96512
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index aa3c280..26aaa2a 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -23,10 +23,12 @@
  */
 #include "arm_compute/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.h"
 
+#include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 
@@ -36,26 +38,173 @@
 
 using namespace arm_compute;
 
+namespace
+{
+/** Adds the result offset to each of the four S32 vectors and multiplies the sum by result_mult_int.
+ *
+ * @param[in, out] in_s32            Sixteen S32 GEMM results (4 x int32x4_t), updated in place.
+ * @param[in]      result_offset_s32 Offset vector added to every input vector before the multiplication.
+ * @param[in]      result_mult_int   Scalar multiplier applied to every lane after the offset.
+ */
+inline void scale_input(int32x4x4_t &in_s32, int32x4_t result_offset_s32, int32_t result_mult_int)
+{
+    for(int32x4_t &acc : in_s32.val)
+    {
+        // Offset first, then scale: (acc + offset) * result_mult_int, lane by lane
+        acc = vmulq_n_s32(vaddq_s32(acc, result_offset_s32), result_mult_int);
+    }
+}
+
+/** Shifts, saturates and narrows sixteen S32 values to U8, optionally clamping the result.
+ *
+ * @tparam is_bounded_relu True to clamp the narrowed result between min_u8 and max_u8.
+ *
+ * @param[in, out] in_s32           Scaled S32 values (4 x int32x4_t); overwritten with the shifted, non-negative values.
+ * @param[in]      result_shift_s32 Per-lane shift passed to vshlq_s32 (negative values shift right).
+ * @param[in]      min_u8           Lower clamp bound, only applied when is_bounded_relu is true.
+ * @param[in]      max_u8           Upper clamp bound, only applied when is_bounded_relu is true.
+ *
+ * @return The sixteen quantized U8 values.
+ */
+template <bool    is_bounded_relu>
+inline uint8x16_t finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8)
+{
+    // Plain local, not a function-local static: a static here may add an init-guard check to every call
+    const int32x4_t zero_s32 = vdupq_n_s32(0);
+
+    for(int32x4_t &acc : in_s32.val)
+    {
+        // Shift final result (negative value shift right), then saturate negative values to zero
+        acc = vmaxq_s32(vshlq_s32(acc, result_shift_s32), zero_s32);
+    }
+
+    // Convert S32 to S16
+    const int16x8_t lo_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
+    const int16x8_t hi_s16 = vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]));
+
+    // Convert S16 to U8
+    uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(lo_s16), vqmovun_s16(hi_s16));
+
+    if(is_bounded_relu)
+    {
+        out_u8 = vminq_u8(vmaxq_u8(out_u8, min_u8), max_u8);
+    }
+
+    return out_u8;
+}
+} // namespace
+
 namespace arm_compute
 {
 class Coordinates;
 } // namespace arm_compute
 
+/** Quantizes the S32 input down to U8 over the given window, sixteen values per iteration.
+ *
+ * @tparam is_bounded_relu True to clamp the quantized output between _min and _max.
+ *
+ * @param[in] window Region on which to execute the kernel.
+ */
+template <bool is_bounded_relu>
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window)
+{
+    // Broadcast the scalar parameters once; the shift is negated so that vshlq_s32 shifts right
+    const int32x4_t  offset_s32 = vdupq_n_s32(_result_offset);
+    const int32x4_t  shift_s32  = vdupq_n_s32(-_result_shift);
+    const uint8x16_t lower_u8   = vdupq_n_u8(static_cast<uint8_t>(_min));
+    const uint8x16_t upper_u8   = vdupq_n_u8(static_cast<uint8_t>(_max));
+
+    ARM_COMPUTE_UNUSED(lower_u8);
+    ARM_COMPUTE_UNUSED(upper_u8);
+
+    // Reads sixteen consecutive S32 values starting at the given byte address
+    const auto load_s32x16 = [](const uint8_t *addr) -> int32x4x4_t
+    {
+        const auto        src = reinterpret_cast<const int32_t *>(addr);
+        const int32x4x4_t values =
+        {
+            {
+                vld1q_s32(src + 0),
+                vld1q_s32(src + 4),
+                vld1q_s32(src + 8),
+                vld1q_s32(src + 12)
+            }
+        };
+        return values;
+    };
+
+    Iterator in(_input, window);
+    Iterator out(_output, window);
+
+    // Without a bias only the offset/scale step and the final quantization are applied
+    if(_bias == nullptr)
+    {
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            int32x4x4_t acc_s32 = load_s32x16(in.ptr());
+
+            // Add the offset terms to GEMM's result and multiply by result_mult_int
+            scale_input(acc_s32, offset_s32, _result_mult_int);
+
+            vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(acc_s32, shift_s32, lower_u8, upper_u8));
+        },
+        in, out);
+    }
+    else
+    {
+        // The bias holds a single row: walk it along X only, mirroring the X range of the window
+        Window win_biases;
+        win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step()));
+        win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
+
+        Iterator bias(_bias, win_biases);
+        execute_window_loop(window, [&](const Coordinates &)
+        {
+            int32x4x4_t       acc_s32  = load_s32x16(in.ptr());
+            const int32x4x4_t bias_s32 = load_s32x16(bias.ptr());
+
+            // Add the offset terms to GEMM's result and multiply by result_mult_int
+            scale_input(acc_s32, offset_s32, _result_mult_int);
+
+            // Add the bias to GEMM's result
+            for(int i = 0; i < 4; ++i)
+            {
+                acc_s32.val[i] = vaddq_s32(acc_s32.val[i], bias_s32.val[i]);
+            }
+
+            vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(acc_s32, shift_s32, lower_u8, upper_u8));
+        },
+        in, bias, out);
+    }
+}
+
 NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel()
-    : _input(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0)
+    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr), _result_offset(0), _result_mult_int(0), _result_shift(0), _min(0), _max(0)
 {
 }
 
-void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift)
+void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
+    ARM_COMPUTE_ERROR_ON(max > 255);
+    ARM_COMPUTE_ERROR_ON(min < 0 || min > max);
+
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
+        ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != bias->info()->dimension(0));
+    }
 
     _input           = input;
+    _bias            = bias;
     _output          = output;
     _result_offset   = result_offset;
     _result_mult_int = result_mult_int;
     _result_shift    = result_shift;
+    _min             = min;
+    _max             = max;
 
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
@@ -69,9 +218,22 @@
                               input_access,
                               output_result_access);
 
+    if(bias != nullptr)
+    {
+        AccessWindowStatic bias_access(bias->info(), 0, 0, ceil_to_multiple(bias->info()->dimension(0), num_elems_processed_per_iteration), bias->info()->tensor_shape()[1]);
+
+        update_window_and_padding(win,
+                                  bias_access);
+    }
+
     output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape()));
 
     INEKernel::configure(win);
+
+    const bool is_bounded_relu = ((min != max) && !(min == 0 && max == 255));
+
+    // Check if we need to clamp the result using min and max
+    _func = is_bounded_relu ? &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<true> : &NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run<false>;
 }
 
 void NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel::run(const Window &window, const ThreadInfo &info)
@@ -80,62 +242,5 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const int32x4_t result_offset_s32 = vdupq_n_s32(_result_offset);
-    const int32x4_t result_shift_s32  = vdupq_n_s32(-_result_shift);
-    const int32x4_t zero_s32          = vdupq_n_s32(0);
-
-    Iterator in(_input, window);
-    Iterator out(_output, window);
-
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        int32x4x4_t in_s32 =
-        {
-            {
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 0),
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 4),
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 8),
-                vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + 12)
-            }
-        };
-
-        // Add the offset terms to GEMM's result
-        in_s32.val[0] = vaddq_s32(in_s32.val[0], result_offset_s32);
-        in_s32.val[1] = vaddq_s32(in_s32.val[1], result_offset_s32);
-        in_s32.val[2] = vaddq_s32(in_s32.val[2], result_offset_s32);
-        in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_s32);
-
-        // Multiply by c_mult_int
-        in_s32.val[0] = vmulq_n_s32(in_s32.val[0], _result_mult_int);
-        in_s32.val[1] = vmulq_n_s32(in_s32.val[1], _result_mult_int);
-        in_s32.val[2] = vmulq_n_s32(in_s32.val[2], _result_mult_int);
-        in_s32.val[3] = vmulq_n_s32(in_s32.val[3], _result_mult_int);
-
-        // Shift final result (negative value shift right)
-        in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
-        in_s32.val[1] = vshlq_s32(in_s32.val[1], result_shift_s32);
-        in_s32.val[2] = vshlq_s32(in_s32.val[2], result_shift_s32);
-        in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
-
-        // Saturate negative values
-        in_s32.val[0] = vmaxq_s32(in_s32.val[0], zero_s32);
-        in_s32.val[1] = vmaxq_s32(in_s32.val[1], zero_s32);
-        in_s32.val[2] = vmaxq_s32(in_s32.val[2], zero_s32);
-        in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
-
-        // Convert S32 to S16
-        const int16x8x2_t in_s16 =
-        {
-            {
-                vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-                vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-            }
-        };
-
-        // Convert S16 to U8
-        const uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
-
-        vst1q_u8(out.ptr(), out_u8);
-    },
-    in, out);
+    (this->*_func)(window);
 }
\ No newline at end of file
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index d09827f..66cdf58 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -29,9 +29,9 @@
 
 using namespace arm_compute;
 
-void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, ITensor *output, int result_offset, int result_mult_int, int result_shift)
+void NEGEMMLowpQuantizeDownInt32ToUint8Scale::configure(const ITensor *input, const ITensor *bias, ITensor *output, int result_offset, int result_mult_int, int result_shift, int min, int max)
 {
     auto k = arm_compute::support::cpp14::make_unique<NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel>();
-    k->configure(input, output, result_offset, result_mult_int, result_shift);
+    k->configure(input, bias, output, result_offset, result_mult_int, result_shift, min, max);
     _kernel = std::move(k);
 }
\ No newline at end of file