Remove OpenCL padding: CLReductionOperationKernel

Change the parallel implementation across the X axis: every thread now computes one row
Add missing test for MEAN_SUM
Make reduction on any axis != 0 work with num_channels > 1

Resolve COMPMID-3917

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: Ib0f99540104e3c253bcd1ea637833db533f5e76e
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5522
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/arm_compute/runtime/CL/functions/CLReductionOperation.h b/arm_compute/runtime/CL/functions/CLReductionOperation.h
index 3fbcee6..58164fd 100644
--- a/arm_compute/runtime/CL/functions/CLReductionOperation.h
+++ b/arm_compute/runtime/CL/functions/CLReductionOperation.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -36,7 +36,6 @@
 {
 // Forward declarations
 class CLCompileContext;
-class CLFillBorderKernel;
 class CLReductionOperationKernel;
 class ICLTensor;
 
@@ -99,15 +98,12 @@
 private:
     ICLTensor *configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output);
 
-    MemoryGroup                                              _memory_group;
-    std::vector<CLTensor>                                    _results_vector;
-    std::vector<std::unique_ptr<CLReductionOperationKernel>> _reduction_kernels_vector;
-    std::vector<std::unique_ptr<CLFillBorderKernel>>         _border_handlers_vector;
-    CLReshapeLayer                                           _reshape;
-    unsigned int                                             _num_of_stages;
-    unsigned int                                             _reduction_axis;
-    bool                                                     _is_serial;
-    bool                                                     _is_reshape_required;
+    MemoryGroup                                 _memory_group;
+    CLTensor                                    _unreshaped_output;
+    std::unique_ptr<CLReductionOperationKernel> _reduction_kernel;
+    CLReshapeLayer                              _reshape;
+    unsigned int                                _reduction_axis;
+    bool                                        _is_reshape_required;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CLREDUCTIONOPERATION_H */
\ No newline at end of file