COMPMID-706 - Add GEMMLowp output stage for scaling by a fixed point number

DoD:
- Implement NEON kernel for quantizing down the gemmlowp result. The
  result should be scaled by a fixed point number
- Implement OpenCL kernel for quantizing down the gemmlowp result. The
  result should be scaled by a fixed point number
- Add test for validating the result

Required for:
- Integration of GEMMLowp in Android NN
- Convolution quantized
- Fully connected quantized

Change-Id: Ia963d25d695471e963961fb49a5600e78374ac4f
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/110981
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
diff --git a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
index 670b11f..edd6a9f 100644
--- a/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEGEMMLowpQuantizeDownInt32ToUint8ScaleKernel.cpp
@@ -78,7 +78,7 @@
         window_changed = window_changed || update_window_and_padding(win, bias_access);
     }
 
-    output_result_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->tensor_shape()));
+    output_result_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
 
     Error err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Error{};
     return std::make_pair(err, win);
@@ -186,15 +186,15 @@
                 }
             };
 
-            // Add the offset terms to GEMM's result and multiply by result_mult_int
-            scale_input(in_s32, result_offset_s32, _result_mult_int);
-
             // Add the bias to GEMM's result
             in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
             in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
             in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
             in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
 
+            // Add the offset terms to GEMM's result and multiply by result_mult_int
+            scale_input(in_s32, result_offset_s32, _result_mult_int);
+
             vst1q_u8(out.ptr(), finalize_quantization<is_bounded_relu>(in_s32, result_shift_s32, min_u8, max_u8));
         },
         in, bias, out);
@@ -231,6 +231,10 @@
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+
+    // Output auto inizialitation if not yet initialized
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
+
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
                                                   (bias != nullptr) ? bias->info() : nullptr,
                                                   output->info(),