Fix Intermittent Neon™ ReduceMean QASYMM8 Mismatch

- Dividing the scale by the number of elements causes accuracy loss, due to the limited precision of the float datatype and the subsequent truncation to int.
- Adds round-to-nearest after the division on aarch64 to mitigate this (see the sketch below).
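
For context, a minimal standalone sketch (hypothetical quantization parameters and
element count, not the configuration from the failing test) of how the truncating
cast can end up one low when the exact mean is an integer:

    #include <cmath>
    #include <iostream>

    int main()
    {
        // Hypothetical QASYMM8 parameters; offsets are zero here, so the B term drops out.
        const float in_scale     = 0.2f;
        const float out_scale    = 0.1f;
        const float num_elements = 6.0f; // reduction length along the axis
        const int   sum_q        = 15;   // integer sum of the quantized inputs

        // Same shape as the kernel's coefficient: A = in_scale / (out_scale * num_elements)
        const float A      = in_scale / (out_scale * num_elements);
        const float mean_f = A * static_cast<float>(sum_q); // exact answer is 5; the float result lands just below it

        std::cout << static_cast<int>(mean_f) << "\n"; // truncation:       4
        std::cout << std::lround(mean_f)      << "\n"; // round-to-nearest: 5
        return 0;
    }

Converting the float result with round-to-nearest (vcvta in the vector path,
support::cpp11::round in the leftover loop) absorbs this error, which is what the
kernel and the reference now do on aarch64.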

Resolves: [COMPMID-5839]
Signed-off-by: Mohammed Suhail Munshi <MohammedSuhail.Munshi@arm.com>
Change-Id: I54ef0f7e56f39da1fa5f30378f551b5ca419a61d
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/492456
Tested-by: bsgcomp <bsgcomp@arm.com>
Comments-Addressed: bsgcomp <bsgcomp@arm.com>
Reviewed-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9110
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Benchmark: Arm Jenkins <bsgcomp@arm.com>
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index e0f43ab..19955af 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2022 Arm Limited.
+ * Copyright (c) 2017-2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -625,13 +625,13 @@
         Iterator input(in, in_win_no_pad);
         Iterator output(out, out_window);
 
-        const float in_offset = static_cast<float>(iq_info.offset);
+        const auto  in_offset = static_cast<float>(iq_info.offset);
         const float in_scale  = iq_info.scale;
 
-        const float out_offset = static_cast<float>(oq_info.offset);
+        const auto  out_offset = static_cast<float>(oq_info.offset);
         const float out_scale  = oq_info.scale;
 
-        const float num_elements = static_cast<float>(in_info.dimension(0));
+        const auto num_elements = static_cast<float>(in_info.dimension(0));
 
         const float A = in_scale / (out_scale * num_elements);
         const float B = out_offset - (in_scale * in_offset) / (out_scale);
@@ -1382,10 +1382,17 @@
                         vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
                         vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
 
-                        vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
-                        vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
-                        vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
-                        vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
+#ifdef __aarch64__
+                        vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
+                        vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
+                        vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
+                        vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
+#else  // defined(__aarch64__)
+                        vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
+                        vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
+                        vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
+                        vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
+#endif // __aarch64__
 
                         const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
                         const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
@@ -1521,7 +1528,12 @@
                 {
                     case ReductionOperation::MEAN_SUM:
                     {
+                        // Apply previously calculated coefficients (with rounding on aarch64)
+#ifdef  __aarch64__
+                        const int32_t res                        = arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
+#else   // defined(__aarch64__)
                         const int32_t res                        = A * (static_cast<float>(res_value_q)) + B;
+#endif  // __aarch64__
                         *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
                         break;
                     }
diff --git a/tests/validation/fixtures/ReduceMeanFixture.h b/tests/validation/fixtures/ReduceMeanFixture.h
index 304630e..13354ee 100644
--- a/tests/validation/fixtures/ReduceMeanFixture.h
+++ b/tests/validation/fixtures/ReduceMeanFixture.h
@@ -124,7 +124,13 @@
         {
             TensorShape output_shape = i == 0 ? src_shape : out.shape();
             output_shape.set(axis[i], 1);
-            out = reference::reduction_operation<T, T>(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM, quantization_info_output);
+            bool is_opencl = false;
+
+#ifdef ARM_COMPUTE_OPENCL_ENABLED
+            is_opencl = std::is_same<CLTensor, TensorType>::value; // Round towards zero on OpenCL to match the kernel
+#endif /* ARM_COMPUTE_OPENCL_ENABLED */
+            out = reference::reduction_operation<T, T>(i == 0 ? src : out, output_shape, axis[i], ReductionOperation::MEAN_SUM, quantization_info_output, is_opencl ? RoundingPolicy::TO_ZERO : RoundingPolicy::TO_NEAREST_UP);
+
         }
 
         if(!keep_dims)
diff --git a/tests/validation/reference/ReductionOperation.cpp b/tests/validation/reference/ReductionOperation.cpp
index ffb79f8..e2890af 100644
--- a/tests/validation/reference/ReductionOperation.cpp
+++ b/tests/validation/reference/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2020, 2023 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "ReductionOperation.h"
-
 #include "tests/validation/Helpers.h"
 
 #include <algorithm>
@@ -39,7 +38,7 @@
 namespace
 {
 template <typename T, typename OT>
-OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, int stride)
+OT reduce_operation(const T *ptr, int reduce_elements, ReductionOperation op, int stride, RoundingPolicy policy)
 {
     using type = typename std::remove_cv<OT>::type;
     T res;
@@ -99,7 +98,14 @@
         }
         if(op == ReductionOperation::MEAN_SUM && reduce_elements > 0)
         {
-            int_res /= reduce_elements;
+            // Only use rounding on aarch64 to be consistent with the kernel
+#ifdef __aarch64__
+            // Divide in float, round to nearest, then implicitly cast back to int
+            int_res = round(static_cast<float>(int_res) / static_cast<float>(reduce_elements), policy);
+#else  // defined(__aarch64__)
+            ARM_COMPUTE_UNUSED(policy);
+            int_res /= reduce_elements; // Legacy compatibility
+#endif // __aarch64__
         }
         res = static_cast<type>(int_res);
     }
@@ -175,7 +181,7 @@
 } // namespace
 
 template <typename T, typename OT>
-SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op)
+SimpleTensor<OT> compute_reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, RoundingPolicy policy)
 {
     // Create reference
     const bool         is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
@@ -197,7 +203,7 @@
                 const T *src_row_ptr = src.data() + du * reduce_elems;
                 dst[du]              = is_arg_min_max ?
                                        reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, 1) :
-                                       reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, 1);
+                                       reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, 1, policy);
             }
         }
         break;
@@ -213,7 +219,7 @@
                     const T *src_row_ptr = src.data() + in_offset;
                     dst[out_offset]       = is_arg_min_max ?
                                             reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width) :
-                                            reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width);
+                                            reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width, policy);
                 }
             }
         }
@@ -232,7 +238,7 @@
                         const T *src_row_ptr = src.data() + in_offset;
                         dst[out_offset]       = is_arg_min_max ?
                                                 reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height) :
-                                                reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height);
+                                                reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height, policy);
                     }
                 }
             }
@@ -254,7 +260,7 @@
                             const T *src_row_ptr = src.data() + in_offset;
                             dst[out_offset]       = is_arg_min_max ?
                                                     reduce_operation_arg_min_max<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth) :
-                                                    reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth);
+                                                    reduce_operation<T, OT>(src_row_ptr, reduce_elems, op, src_width * src_height * src_depth, policy);
                         }
                     }
                 }
@@ -269,21 +275,21 @@
 }
 
 template <typename T, typename OT>
-SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output)
+SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output, RoundingPolicy policy)
 {
     ARM_COMPUTE_UNUSED(quantization_info_output);
-    return compute_reduction_operation<T, OT>(src, dst_shape, axis, op);
+    return compute_reduction_operation<T, OT>(src, dst_shape, axis, op, policy);
 }
 
 template <>
-SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output)
+SimpleTensor<uint8_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output, RoundingPolicy policy)
 {
     if(src.data_type() == DataType::QASYMM8)
     {
         // If the operation is MEAN_SUM, we can directly use the uint8 implementation without taking into account scale and offset
         if(op == ReductionOperation::MEAN_SUM && src.quantization_info() == quantization_info_output)
         {
-            return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op);
+            return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op, policy);
         }
         else
         {
@@ -294,19 +300,19 @@
     }
     else
     {
-        return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op);
+        return compute_reduction_operation<uint8_t, uint8_t>(src, dst_shape, axis, op, policy);
     }
 }
 
 template <>
-SimpleTensor<int8_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output)
+SimpleTensor<int8_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op, QuantizationInfo quantization_info_output, RoundingPolicy policy)
 {
     if(src.data_type() == DataType::QASYMM8_SIGNED)
     {
         // If the operation is MEAN_SUM, we can directly use the int8 implementation without taking into account scale and offset
         if(op == ReductionOperation::MEAN_SUM && src.quantization_info() == quantization_info_output)
         {
-            return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op);
+            return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op, policy);
         }
         else
         {
@@ -317,25 +323,25 @@
     }
     else
     {
-        return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op);
+        return compute_reduction_operation<int8_t, int8_t>(src, dst_shape, axis, op, policy);
     }
 }
 
 template SimpleTensor<float> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                 QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                 QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<half> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<float> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<int32_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<half> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<uint8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 template SimpleTensor<int32_t> reduction_operation(const SimpleTensor<int8_t> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                                   QuantizationInfo quantization_info_output = QuantizationInfo());
+                                                   QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 
 } // namespace reference
 } // namespace validation
diff --git a/tests/validation/reference/ReductionOperation.h b/tests/validation/reference/ReductionOperation.h
index 9c9e721..dd97778 100644
--- a/tests/validation/reference/ReductionOperation.h
+++ b/tests/validation/reference/ReductionOperation.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_TEST_REDUCTION_OPERATION_H
 
 #include "tests/SimpleTensor.h"
+#include "arm_compute/core/Rounding.h"
 #include "tests/validation/Helpers.h"
 
 namespace arm_compute
@@ -37,7 +38,7 @@
 {
 template <typename T, typename OT>
 SimpleTensor<OT> reduction_operation(const SimpleTensor<T> &src, const TensorShape &dst_shape, unsigned int axis, ReductionOperation op,
-                                     QuantizationInfo quantization_info_output = QuantizationInfo());
+                                     QuantizationInfo quantization_info_output = QuantizationInfo(), RoundingPolicy policy = RoundingPolicy::TO_ZERO);
 } // namespace reference
 } // namespace validation
 } // namespace test