COMPMID-1226 Extend CLMeanStdDev to support FP32 / FP16

- Extend support for FP16 in CLReduction.
- For F16/F32 MeanStdDev we perform one reduction operation for the mean
and one for the stddev, and compute the final result on the host CPU.
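
A minimal sketch of the host-side finalization, assuming the two reductions
yield the sum and the sum of squares of the pixels (the function and parameter
names below are illustrative, not the library's API):

    #include <cmath>
    #include <cstddef>

    // Given the sum and sum-of-squares from the two reduction passes,
    // the mean and (population) standard deviation follow from:
    //   mean   = sum / N
    //   stddev = sqrt(sum_sq / N - mean^2)
    void finalize_mean_stddev(float sum, float sum_sq, std::size_t num_pixels,
                              float &mean, float &stddev)
    {
        mean   = sum / static_cast<float>(num_pixels);
        stddev = std::sqrt(sum_sq / static_cast<float>(num_pixels) - mean * mean);
    }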

Change-Id: Iad2099f26c0ba7969737d22f00c6c275634d875c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/135870
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/CL/kernels/CLMeanStdDevKernel.cpp b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
index fc8764d..bd31131 100644
--- a/src/core/CL/kernels/CLMeanStdDevKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
 
+#include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLValidate.h"
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -49,14 +50,24 @@
     return _border_size;
 }
 
+Status CLMeanStdDevKernel::validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
+{
+    ARM_COMPUTE_UNUSED(mean);
+    ARM_COMPUTE_UNUSED(stddev);
+    ARM_COMPUTE_UNUSED(global_sum);
+    ARM_COMPUTE_UNUSED(global_sum_squared);
+    ARM_COMPUTE_RETURN_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED();
+    ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+
+    return Status{};
+}
+
 void CLMeanStdDevKernel::configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev, cl::Buffer *global_sum_squared)
 {
-    ARM_COMPUTE_ERROR_ON_INT64_BASE_ATOMICS_UNSUPPORTED();
-    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
-    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
-    ARM_COMPUTE_ERROR_ON(nullptr == mean);
-    ARM_COMPUTE_ERROR_ON(nullptr == global_sum);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, global_sum);
     ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared);
+    ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevKernel::validate(input->info(), mean, global_sum, stddev, global_sum_squared));
 
     _input              = input;
     _mean               = mean;
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index d64f0d8..95967fa 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -44,7 +44,7 @@
     ARM_COMPUTE_UNUSED(op);
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
@@ -69,7 +69,7 @@
     const unsigned int num_elems_processed_per_iteration = 16;
 
     Window             win          = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
-    const unsigned int border_width = ((input->dimension(0) % 128) != 0) ? 128 - input->dimension(0) % 128 : 0; // TODO (COMPMID-1143): Fix padding (possible value 127!)
+    const unsigned int border_width = ((input->dimension(0) % num_elems_processed_per_iteration) != 0) ? num_elems_processed_per_iteration - input->dimension(0) % num_elems_processed_per_iteration : 0;
 
     AccessWindowStatic     input_access(input, 0, 0, input->dimension(0) + border_width, 1);
     AccessWindowHorizontal output_access(output, 0, 1);