Remove OpenCL padding CLTransposeKernel

By handling more general NxM blocks (where M and N can be 1,2,4,8,16)
instead of only 4x4, 8x8, 16x16 and managing corner left values with
partial stores

Resolves: COMPMID-3923

Change-Id: I49b1a560c8325e00e061bd04edcf55034d04dcd8
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4780
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
diff --git a/tests/validation/CL/Transpose.cpp b/tests/validation/CL/Transpose.cpp
index 876bf29..9435340 100644
--- a/tests/validation/CL/Transpose.cpp
+++ b/tests/validation/CL/Transpose.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -47,19 +47,15 @@
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(
-    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::U8),  // Input not a multiple of 8
-                                            TensorInfo(TensorShape(21U, 13U), 1, DataType::U16), // Invalid shape
-                                            TensorInfo(TensorShape(20U, 13U), 1, DataType::U32), // Window shrink
+    framework::dataset::make("InputInfo", { TensorInfo(TensorShape(21U, 13U), 1, DataType::U16), // Invalid shape
                                             TensorInfo(TensorShape(20U, 13U), 1, DataType::U8),  // Wrong data type
                                             TensorInfo(TensorShape(20U, 16U), 1, DataType::U32), // Valid
                                           }),
-    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(13U, 21U), 1, DataType::U8),
-                                            TensorInfo(TensorShape(21U, 13U), 1, DataType::U16),
-                                            TensorInfo(TensorShape(13U, 20U), 1, DataType::U32),
+    framework::dataset::make("OutputInfo",{ TensorInfo(TensorShape(13U, 20U), 1, DataType::U32),
                                             TensorInfo(TensorShape(31U, 20U), 1, DataType::U16),
                                             TensorInfo(TensorShape(16U, 20U), 1, DataType::U32),
                                            })),
-    framework::dataset::make("Expected", { false, false, false, false, true })),
+    framework::dataset::make("Expected", { false, false, true })),
     a_info, output_info, expected)
 {
     // Lock tensors
@@ -102,7 +98,9 @@
 TEST_SUITE_END() // U16
 
 TEST_SUITE(U32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLTransposeFixture<uint32_t>, framework::DatasetMode::PRECOMMIT, combine(concat(datasets::Small1DShapes(), datasets::Small2DShapes()),
+FIXTURE_DATA_TEST_CASE(RunSmall, CLTransposeFixture<uint32_t>, framework::DatasetMode::PRECOMMIT, combine(concat(concat(framework::dataset::make("Shape", { TensorShape{ 1U, 5U }, TensorShape{ 4U, 5U }, TensorShape{ 3, 12 } }),
+                                                                                                                        datasets::Small1DShapes()),
+                                                                                                                 datasets::Small2DShapes()),
                                                                                                           framework::dataset::make("DataType", DataType::U32)))
 {
     // Validate output