COMPMID-1226 Extend CLMeanStdDev to support FP32 / FP16

- Extend support for FP16 in CLReduction.
- For F16/F32 MeanStdDev we perform one reduction operation for the mean
and one for the stddev, then calculate the final result on the host CPU.

Change-Id: Iad2099f26c0ba7969737d22f00c6c275634d875c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/135870
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h b/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
index a6898fd..46e266e 100644
--- a/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
+++ b/arm_compute/core/CL/kernels/CLMeanStdDevKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -59,6 +59,17 @@
      * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values (Buffer size: 1 cl_ulong).
      */
     void configure(const ICLImage *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevKernel.
+     *
+     * @param[in] input              Input image info. Data types supported: U8.
+     * @param[in] mean               Output average pixel value.
+     * @param[in] global_sum         Keeps global sum of pixel values.
+     * @param[in] stddev             (Optional) Output standard deviation of pixel values.
+     * @param[in] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, float *mean, cl::Buffer *global_sum, float *stddev = nullptr, cl::Buffer *global_sum_squared = nullptr);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
index 56f75e5..60e2f08 100644
--- a/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
+++ b/arm_compute/core/CL/kernels/CLReductionOperationKernel.h
@@ -50,7 +50,7 @@
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: F32. Data layouts supported: NCHW.
+     * @param[in]  input  Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW.
      * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
      *                    Output will have the same number of dimensions as input.
      * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0
@@ -60,7 +60,7 @@
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
      *
-     * @param[in] input  Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
+     * @param[in] input  Source tensor info. Data types supported: F16/F32. Data layouts supported: NCHW.
      * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
      *                   Output will have the same number of dimensions as input.
      * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0
diff --git a/arm_compute/core/Validate.h b/arm_compute/core/Validate.h
index 1646ebe..918c8e5 100644
--- a/arm_compute/core/Validate.h
+++ b/arm_compute/core/Validate.h
@@ -787,6 +787,19 @@
  */
 arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line,
                                            const ITensor *tensor);
+
+/** Return an error if the tensor info is not 2D.
+ *
+ * @param[in] function Function in which the error occurred.
+ * @param[in] file     Name of the file where the error occurred.
+ * @param[in] line     Line on which the error occurred.
+ * @param[in] tensor   Tensor info to validate.
+ *
+ * @return Status
+ */
+arm_compute::Status error_on_tensor_not_2d(const char *function, const char *file, const int line,
+                                           const ITensorInfo *tensor);
+
 #define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t) \
     ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_tensor_not_2d(__func__, __FILE__, __LINE__, t))
 #define ARM_COMPUTE_RETURN_ERROR_ON_TENSOR_NOT_2D(t) \
diff --git a/arm_compute/runtime/CL/functions/CLMeanStdDev.h b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
index 7622138..2e46563 100644
--- a/arm_compute/runtime/CL/functions/CLMeanStdDev.h
+++ b/arm_compute/runtime/CL/functions/CLMeanStdDev.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016, 2017 ARM Limited.
+ * Copyright (c) 2016-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,7 +27,10 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h"
 #include "arm_compute/core/CL/kernels/CLMeanStdDevKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/functions/CLReductionOperation.h"
 #include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
 
 namespace arm_compute
 {
@@ -36,23 +39,56 @@
 {
 public:
     /** Default Constructor. */
-    CLMeanStdDev();
+    CLMeanStdDev(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMeanStdDev(const CLMeanStdDev &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLMeanStdDev &operator=(const CLMeanStdDev &) = delete;
+    /** Allow instances of this class to be moved */
+    CLMeanStdDev(CLMeanStdDev &&) = default;
+    /** Allow instances of this class to be moved */
+    CLMeanStdDev &operator=(CLMeanStdDev &&) = default;
+    /** Default destructor */
+    ~CLMeanStdDev() = default;
     /** Initialise the kernel's inputs and outputs.
      *
-     * @param[in, out] input  Input image. Data types supported: U8. (Written to only for border filling)
+     * @param[in, out] input  Input image. Data types supported: U8/F16/F32. (Written to only for border filling)
      * @param[out]     mean   Output average pixel value.
-     * @param[out]     stddev (Optional)Output standard deviation of pixel values.
+     * @param[out]     stddev (Optional) Output standard deviation of pixel values.
      */
     void configure(ICLImage *input, float *mean, float *stddev = nullptr);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDev
+     *
+     * @param[in] input  Input image info. Data types supported: U8/F16/F32.
+     * @param[in] mean   Output average pixel value.
+     * @param[in] stddev (Optional) Output standard deviation of pixel values.
+     *
+     * @return a status
+     */
+    static Status validate(ITensorInfo *input, float *mean, float *stddev = nullptr);
 
     // Inherited methods overridden:
     void run() override;
 
 private:
-    CLMeanStdDevKernel _mean_stddev_kernel; /**< Kernel that standard deviation calculation. */
-    CLFillBorderKernel _fill_border_kernel; /**< Kernel that fills the border with zeroes. */
-    cl::Buffer         _global_sum;         /**< Variable that holds the global sum among calls in order to ease reduction */
-    cl::Buffer         _global_sum_squared; /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
+    template <typename T>
+    void run_float();
+    void run_int();
+
+    CLMemoryGroup        _memory_group;               /**< Function's memory group */
+    DataType             _data_type;                  /**< Input data type. */
+    unsigned int         _num_pixels;                 /**< Number of image's pixels. */
+    bool                 _run_stddev;                 /**< Flag for knowing if we should run stddev reduction function. */
+    CLReductionOperation _reduction_operation_mean;   /**< Reduction operation function for computing mean value. */
+    CLReductionOperation _reduction_operation_stddev; /**< Reduction operation function for computing standard deviation. */
+    CLTensor             _reduction_output_mean;      /**< Reduction operation output tensor for mean value. */
+    CLTensor             _reduction_output_stddev;    /**< Reduction operation output tensor for standard deviation value. */
+    float               *_mean;                       /**< Pointer that holds the mean value. */
+    float               *_stddev;                     /**< Pointer that holds the standard deviation value. */
+    CLMeanStdDevKernel   _mean_stddev_kernel;         /**< Kernel that performs the standard deviation calculation. */
+    CLFillBorderKernel   _fill_border_kernel;         /**< Kernel that fills the border with zeroes. */
+    cl::Buffer           _global_sum;                 /**< Variable that holds the global sum among calls in order to ease reduction */
+    cl::Buffer           _global_sum_squared;         /**< Variable that holds the global sum of squared values among calls in order to ease reduction */
 };
 }
 #endif /*__ARM_COMPUTE_CLMEANSTDDEV_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index b8108b5..d862aff 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -53,7 +53,7 @@
 
     /** Set the input and output tensors.
      *
-     * @param[in]  input  Source tensor. Data types supported: F32. Data layouts supported: NCHW.
+     * @param[in]  input  Source tensor. Data types supported: F16/F32. Data layouts supported: NCHW.
      * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
      * @param[in]  axis   Axis along which to reduce. Supported reduction axis : 0
      * @param[in]  op     Reduction operation to perform.
@@ -62,7 +62,7 @@
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperation.
      *
-     * @param[in] input  Source tensor info. Data types supported: F32. Data layouts supported: NCHW.
+     * @param[in] input  Source tensor info. Data types supported: F16/F32. Data layouts supported: NCHW.
      * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
      * @param[in] axis   Axis along which to reduce. Supported reduction axis : 0
      * @param[in] op     Reduction operation to perform.