COMPMID-2762: Add support for QASYMM8_SIGNED in CLReductionOperation and CLReduceMean

Change-Id: Ib6babd9ad80c57cf21c2f0ee2aab404221088595
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/2670
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index 5a7c712..6377dba 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -399,15 +399,31 @@
 
 QUANTIZE_IMPL(uchar, 1)
 QUANTIZE_IMPL(char, 1)
+QUANTIZE_IMPL(uint, 1)
+QUANTIZE_IMPL(int, 1)
 QUANTIZE_IMPL(uchar, 4)
 QUANTIZE_IMPL(ushort, 4)
 QUANTIZE_IMPL(short, 4)
+QUANTIZE_IMPL(uchar, 16)
+QUANTIZE_IMPL(char, 16)
+QUANTIZE_IMPL(ushort, 16)
+QUANTIZE_IMPL(short, 16)
+QUANTIZE_IMPL(uint, 16)
+QUANTIZE_IMPL(int, 16)
 
 DEQUANTIZE_IMPL(uchar, 1)
 DEQUANTIZE_IMPL(char, 1)
+DEQUANTIZE_IMPL(uint, 1)
+DEQUANTIZE_IMPL(int, 1)
 DEQUANTIZE_IMPL(uchar, 4)
 DEQUANTIZE_IMPL(ushort, 4)
 DEQUANTIZE_IMPL(short, 4)
+DEQUANTIZE_IMPL(uchar, 16)
+DEQUANTIZE_IMPL(char, 16)
+DEQUANTIZE_IMPL(ushort, 16)
+DEQUANTIZE_IMPL(short, 16)
+DEQUANTIZE_IMPL(uint, 16)
+DEQUANTIZE_IMPL(int, 16)
 
 ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(1)
 ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(2)
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl
index 0c39334..a5fd0b3 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/reduction_operation.cl
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "helpers.h"
+#include "helpers_asymm.h"
 
 #if defined(FLOAT_DATA_TYPE)
 #define ISGREATER(x, y) isgreater(x, y)
@@ -182,28 +183,50 @@
     Vector src    = CONVERT_TO_VECTOR_STRUCT(src);
     Vector output = CONVERT_TO_VECTOR_STRUCT(output);
 
-    DATA_TYPE_PROMOTED res = *((__global DATA_TYPE *)vector_offset(&src, 0));
+    DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&src, 0)), DATA_TYPE_PROMOTED);
+
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    float res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
 
     for(unsigned int x = 1; x < WIDTH; ++x)
     {
-        DATA_TYPE_PROMOTED in = *((__global DATA_TYPE *)vector_offset(&src, x));
+        DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&src, x)), DATA_TYPE_PROMOTED);
 #if defined(MIN)
         res = select(res, in, CONVERT(ISLESS(in, res), COND_DATA_TYPE));
 #elif defined(MAX)
-        res                         = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
-#else  // !(defined(MAX) || defined(MIN))
+        res                             = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
+#elif defined(PROD)
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#else  // !(defined(OFFSET) && defined(SCALE))
+        res *= in;
+#endif //  defined(OFFSET) && defined(SCALE)
+#else  // defined(SUM))
         res += in;
-#endif // defined(MAX) || defined(MIN)
+#endif // defined(MAX) || defined(MIN) || defined(PROD)
     }
 
     // Store result
 #if defined(MEAN)
     res /= WIDTH;
 #endif // defined(MEAN)
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (WIDTH - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
 #if defined(MIN) || defined(MAX)
     *((__global DATA_TYPE_PROMOTED *)output.ptr) = res;
-#else  // defined(MIN) || defined(MAX)
-    *((__global uchar *)output.ptr) = convert_uchar(res);
+#else  // !(defined(MIN) || defined(MAX))
+    *((__global DATA_TYPE *)output.ptr) = CONVERT_SAT(res, DATA_TYPE);
 #endif // defined(MIN) || defined(MAX)
 }
 #endif // defined(WIDTH)
@@ -237,6 +260,11 @@
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
     res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
 
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
 #if defined(SUM_SQUARE)
     res *= res;
 #endif // defined(SUM_SQUARE)
@@ -248,24 +276,41 @@
 #if defined(MIN)
         res = select(res, in, ISLESS(in, res));
 #elif defined(MAX)
-        res                         = select(res, in, ISGREATER(in, res));
+        res                             = select(res, in, ISGREATER(in, res));
 #else // !(defined(MAX) || defined(MIN))
 #if defined(SUM_SQUARE)
         in *= in;
 #endif // defined(SUM_SQUARE)
 #if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#else  // !(defined(OFFSET) && defined(SCALE))
         res *= in;
+#endif //  defined(OFFSET) && defined(SCALE)
+
 #else  // !defined(PROD)
         res += in;
 #endif // defined(PROD)
 #endif // defined(MAX) || defined(MIN)
     }
 
-    // Store result
 #if defined(MEAN)
     res /= HEIGHT;
 #endif // defined(MEAN)
-    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (HEIGHT - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
 }
 #endif // defined(HEIGHT)
 
@@ -302,6 +347,11 @@
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
     res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
 
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
 #if defined(COMPLEX)
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
     res1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
@@ -323,14 +373,20 @@
 #if defined(MIN)
         res = select(res, in, ISLESS(in, res));
 #elif defined(MAX)
-        res                         = select(res, in, ISGREATER(in, res));
+        res                             = select(res, in, ISGREATER(in, res));
 #else // !(defined(MAX) || defined(MIN))
 #if defined(SUM_SQUARE)
         in *= in;
 #endif // defined(SUM_SQUARE)
 #if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#else  // !(defined(OFFSET) && defined(SCALE))
         res *= in;
-#else //!defined(PROD)
+#endif //  defined(OFFSET) && defined(SCALE)
+
+#else // !defined(PROD)
         res += in;
 #if defined(COMPLEX)
         res1 += in1;
@@ -339,11 +395,22 @@
 #endif // defined(MAX) || defined(MIN)
     }
 
-    // Store result
 #if defined(MEAN)
     res /= DEPTH;
 #endif // defined(MEAN)
-    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (DEPTH - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
 #if defined(COMPLEX)
     vstore16(CONVERT(res1, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)tensor3D_offset(&output, 8, 0, 0));
 #endif // defined(COMPLEX)
@@ -388,6 +455,11 @@
     VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
     res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
 
+    // Convert input into F32 in order to perform quantized multiplication
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
 #if defined(SUM_SQUARE)
     res *= res;
 #endif // defined(SUM_SQUARE)
@@ -400,23 +472,40 @@
 #if defined(MIN)
         res = select(res, in, ISLESS(in, res));
 #elif defined(MAX)
-        res                         = select(res, in, ISGREATER(in, res));
+        res                             = select(res, in, ISGREATER(in, res));
 #else // !(defined(MAX) || defined(MIN))
 #if defined(SUM_SQUARE)
         in *= in;
 #endif // defined(SUM_SQUARE)
 #if defined(PROD)
+
+#if defined(OFFSET) && defined(SCALE)
+        res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#else  // !(defined(OFFSET) && defined(SCALE))
         res *= in;
-#else  //!defined(PROD)
+#endif //  defined(OFFSET) && defined(SCALE)
+
+#else  // !defined(PROD)
         res += in;
 #endif //defined(PROD)
 #endif // defined(MAX) || defined(MIN)
     }
 
-    // Store result
 #if defined(MEAN)
     res /= BATCH;
 #endif // defined(MEAN)
-    vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
+
+    // Subtract the offsets in case of quantized SUM
+#if defined(SUM) && defined(OFFSET) && defined(SCALE)
+    res -= (BATCH - 1) * OFFSET;
+#endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
+
+    // Re-quantize
+#if defined(PROD) && defined(OFFSET) && defined(SCALE)
+    res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
+#endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
+
+    // Store result
+    vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
 }
 #endif /* defined(BATCH) && defined(DEPTH) */
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index 91ee83e..a2a5f2b 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,7 +50,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     if(input->num_channels() == 1)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
     }
     else
     {
@@ -59,8 +59,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-    ARM_COMPUTE_RETURN_ERROR_ON(op == ReductionOperation::MEAN_SUM && axis == 0 && width == 0 && input->data_type() != DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN, "Not supported reduction operation, use CLArgMinMaxLayer");
+    ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (width == 0) && (input->data_type() != DataType::QASYMM8) && (input->data_type() != DataType::QASYMM8_SIGNED));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer");
 
     if(output->total_size() != 0)
     {
@@ -147,21 +147,30 @@
 
     // Set build options
     CLBuildOptions build_opts;
-    std::string    data_type_promoted = get_cl_type_from_data_type(input->info()->data_type());
-    if(is_data_type_quantized(input->info()->data_type()))
+    DataType       data_type = input->info()->data_type();
+    std::string    data_type_promoted{};
+
+    if(is_data_type_quantized(data_type))
     {
-        data_type_promoted = "uint";
+        data_type_promoted = get_cl_dot8_acc_type_from_data_type(data_type);
+    }
+    else
+    {
+        data_type_promoted = get_cl_type_from_data_type(data_type);
     }
 
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DDATA_TYPE_PROMOTED=" + data_type_promoted);
-    build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
+    build_opts.add_option_if(is_data_type_float(data_type), "-DFLOAT_DATA_TYPE");
     build_opts.add_option_if(op == ReductionOperation::SUM_SQUARE, "-DSUM_SQUARE");
     build_opts.add_option_if(op == ReductionOperation::MEAN_SUM, "-DMEAN");
+    build_opts.add_option_if(op == ReductionOperation::SUM, "-DSUM");
     build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
     build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
     build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
     build_opts.add_option_if(input->info()->num_channels() == 2, "-DCOMPLEX");
+    build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
+    build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
 
     switch(op)
     {
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index c5de43d..9920617 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -28,9 +28,7 @@
 #include "arm_compute/core/CL/kernels/CLReductionOperationKernel.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/utils/helpers/tensor_transform.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/CL/CLScheduler.h"
 #include "support/ToolchainSupport.h"
 
 namespace arm_compute
@@ -42,7 +40,7 @@
     ARM_COMPUTE_UNUSED(keep_dims);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index 2f9a386..e04982a 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -271,6 +271,11 @@
                         pixelValue = PixelValue(255, input->info()->data_type(), input->info()->quantization_info());
                         break;
                     }
+                    case DataType::QASYMM8_SIGNED:
+                    {
+                        pixelValue = PixelValue(127, input->info()->data_type(), input->info()->quantization_info());
+                        break;
+                    }
                     default:
                     {
                         ARM_COMPUTE_ERROR("Unsupported DataType");
@@ -298,6 +303,11 @@
                         pixelValue = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
                         break;
                     }
+                    case DataType::QASYMM8_SIGNED:
+                    {
+                        pixelValue = PixelValue(-128, input->info()->data_type(), input->info()->quantization_info());
+                        break;
+                    }
                     default:
                     {
                         ARM_COMPUTE_ERROR("Unsupported DataType");