Remove OpenCL padding: CLReductionOperationKernel

Change the parallel implementation across the X, now every thread computes one row
Add missing test for MEAN_SUM
Make reduction on any axis != 0 work with num_channels > 1

Resolve COMPMID-3917

Signed-off-by: Giorgio Arena <giorgio.arena@arm.com>
Change-Id: Ib0f99540104e3c253bcd1ea637833db533f5e76e
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5522
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/validation/CL/ReductionOperation.cpp b/tests/validation/CL/ReductionOperation.cpp
index 31c5a97..beb5838 100644
--- a/tests/validation/CL/ReductionOperation.cpp
+++ b/tests/validation/CL/ReductionOperation.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -50,10 +50,11 @@
 /** Tolerance for quantized operations */
 RelativeTolerance<float> tolerance_qasymm8(1);
 
-const auto ReductionOperationsSumProd = framework::dataset::make("ReductionOperationsSumProd",
+const auto ReductionOperationsSumProdMean = framework::dataset::make("ReductionOperationsSumProdMean",
 {
     ReductionOperation::SUM,
     ReductionOperation::PROD,
+    ReductionOperation::MEAN_SUM
 
 });
 const auto ReductionOperationsMinMax = framework::dataset::make("ReductionMinMax",
@@ -109,15 +110,16 @@
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
 FIXTURE_DATA_TEST_CASE(RunSmall4D, CLReductionOperationFixture<half>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProd,
-                                       ReductionOperationsMinMax)),
+                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
+                                       concat(ReductionOperationsSumProdMean,
+                                              ReductionOperationsMinMax)),
                                KeepDimensions))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f16);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture<half>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProd,
+                       combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F16)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProdMean,
                                        ReductionOperationsMinMax)),
                                KeepDimensions))
 {
@@ -127,15 +129,16 @@
 TEST_SUITE_END() // F16
 TEST_SUITE(FP32)
 FIXTURE_DATA_TEST_CASE(RunSmall4D, CLReductionOperationFixture<float>, framework::DatasetMode::PRECOMMIT,
-                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProd,
-                                       ReductionOperationsMinMax)),
+                       combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
+                                       concat(ReductionOperationsSumProdMean,
+                                              ReductionOperationsMinMax)),
                                KeepDimensions))
 {
     // Validate output
     validate(CLAccessor(_target), _reference, tolerance_f32);
 }
 FIXTURE_DATA_TEST_CASE(RunLarge, CLReductionOperationFixture<float>, framework::DatasetMode::NIGHTLY,
-                       combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProd,
+                       combine(combine(combine(combine(datasets::LargeShapes(), framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("Axis", { 0, 1, 2, 3 })), concat(ReductionOperationsSumProdMean,
                                        ReductionOperationsMinMax)),
                                KeepDimensions))
 {
@@ -152,7 +155,7 @@
 TEST_SUITE(QASYMM8)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLReductionOperationQuantizedFixture<uint8_t>, framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                               ReductionOperationsSumProd),
+                                               ReductionOperationsSumProdMean),
                                        framework::dataset::make("QuantizationInfo", QuantizationInfo(1.f / 64, 2))),
                                KeepDimensions))
 {
@@ -172,7 +175,7 @@
 TEST_SUITE(QASYMM8_SIGNED)
 FIXTURE_DATA_TEST_CASE(RunSmall, CLReductionOperationQuantizedFixture<int8_t>, framework::DatasetMode::ALL,
                        combine(combine(combine(combine(combine(datasets::Small4DShapes(), framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), framework::dataset::make("Axis", { 0, 1, 2, 3 })),
-                                               ReductionOperationsSumProd),
+                                               ReductionOperationsSumProdMean),
                                        framework::dataset::make("QuantizationInfo", QuantizationInfo(1.f / 64, 2))),
                                KeepDimensions))
 {