Implementation of Permute CL kernel to handle all permutations

This patch will add a generic permute cl-kernel to handle
all permutations available for tensors having rank upto 4.

Change-Id: I50eb555d9d45d5ad5f7fa9b0a3862dd17551d458
Signed-off-by: shubham <shub98.gupta@samsung.com>
Reviewed-on: https://review.mlplatform.org/449
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 2bc2d06..905a34a 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -368,9 +368,7 @@
     { "NV21_to_RGBA8888_bt709", "color_convert.cl" },
     { "NV21_to_YUV444_bt709", "color_convert.cl" },
     { "output_stage_quantized", "direct_convolution_1x1_3x3_5x5_quantized.cl" },
-    { "permute_201", "permute.cl" },
-    { "permute_120", "permute.cl" },
-    { "permute_3201", "permute.cl" },
+    { "permute", "permute.cl" },
     { "pixelwise_mul_float", "pixelwise_mul_float.cl" },
     { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
     { "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/permute.cl
index 03fc15e..77f03f7 100644
--- a/src/core/CL/cl_kernels/permute.cl
+++ b/src/core/CL/cl_kernels/permute.cl
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,11 +23,12 @@
  */
 #include "helpers.h"
 
-#if defined(DATA_TYPE) && defined(DEPTH_IN)
-/** Perform a DCHW -> DHWC permute operation on an input tensor.
+#if defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
+/**Perform a permute operation on an input tensor of Shape DCHW.
  *
  * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
  * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
+ * @attention Permutation vector is passed as a preprocessor arguement using -DP1, -DP2, -DP3 and -DP4=int, e.g. -DP1=2, -DP2=1, -DP3=0 and -DP4=3.
  *
  * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
  * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
@@ -48,81 +49,26 @@
  * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
  * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
  */
-__kernel void permute_201(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
+__kernel void permute(TENSOR4D_DECLARATION(input),
+                      TENSOR4D_DECLARATION(output))
+
 {
     Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
     Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
 
-    *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
+    int out_index[4] = { 0 };
+    int in_index[4]  = { 0 };
+
+    in_index[0] = get_global_id(0);            // W
+    in_index[1] = get_global_id(1);            // H
+    in_index[2] = get_global_id(2) % DEPTH_IN; // C
+    in_index[3] = get_global_id(2) / DEPTH_IN; // B
+
+    out_index[0] = in_index[P1];
+    out_index[1] = in_index[P2];
+    out_index[2] = in_index[P3];
+    out_index[3] = in_index[P4];
+
+    *((__global DATA_TYPE *)tensor4D_offset(&out, out_index[0], out_index[1], out_index[2], out_index[3])) = *((__global DATA_TYPE *)in.ptr);
 }
-
-/** Perform a DCHW -> DWCH permute operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute_120(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, get_global_id(1), (get_global_id(2) % DEPTH_IN), get_global_id(0), (get_global_id(2) / DEPTH_IN))) = *((__global DATA_TYPE *)in.ptr);
-}
-
-/** Perform a DCHW -> HWCD permute operation on an input tensor.
- *
- * @attention Data type can be passed using the -DDATA_TYPE compile flag, e.g. -DDATA_TYPE=float
- * @attention Input tensor depth should be given as a preprocessor argument using -DDEPTH_IN=size. e.g. -DDEPTH_IN=16
- *
- * @param[in]  input_ptr                            Pointer to the source image. Supported data types: U8/S8/QASYMM8/U16/S16/F16/U32/S32/F32
- * @param[in]  input_stride_x                       Stride of the source image in X dimension (in bytes)
- * @param[in]  input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  input_stride_y                       Stride of the source image in Y dimension (in bytes)
- * @param[in]  input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  input_offset_first_element_in_bytes  The offset of the first element in the source image
- * @param[out] output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in]  output_stride_x                      Stride of the destination image in X dimension (in bytes)
- * @param[in]  output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in]  output_stride_y                      Stride of the destination image in Y dimension (in bytes)
- * @param[in]  output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in]  output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
- * @param[in]  output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in]  output_stride_w                      Stride of the source tensor in W dimension (in bytes)
- * @param[in]  output_step_w                        output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in]  output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void permute_3201(
-    TENSOR4D_DECLARATION(input),
-    TENSOR4D_DECLARATION(output))
-{
-    Tensor4D in  = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH_IN);
-    Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
-    *((__global DATA_TYPE *)tensor4D_offset(&out, (get_global_id(2) / DEPTH_IN), (get_global_id(2) % DEPTH_IN), get_global_id(0), get_global_id(1))) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(DEPTH_IN)
+#endif // defined(DATA_TYPE) && defined(DEPTH_IN) && defined(P1) && defined(P2) && defined(P3) && defined(P4)
diff --git a/src/core/CL/kernels/CLPermuteKernel.cpp b/src/core/CL/kernels/CLPermuteKernel.cpp
index a9a2c5c..6c44199 100644
--- a/src/core/CL/kernels/CLPermuteKernel.cpp
+++ b/src/core/CL/kernels/CLPermuteKernel.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -56,16 +56,20 @@
                                                          DataType::U16, DataType::S16,
                                                          DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((perm != PermutationVector{ 2U, 0U, 1U })
-                                    && (perm != PermutationVector{ 1U, 2U, 0U })
-                                    && (perm != PermutationVector{ 3U, 2U, 0U, 1U }),
-                                    "Only [2, 0, 1], [1, 2, 0] and [3, 2, 0, 1] permutation is supported");
 
-    const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() < 1 || input->num_dimensions() > 4,
+                                    "Permutation upto 4-D input tensor is supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
+                                    "Permutation vector size should be less than or equal to 4");
+    for(const auto &p : perm)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
+    }
 
     // Validate configured output
     if(output->total_size() != 0)
     {
+        const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -87,30 +91,16 @@
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     // Create kernel
-    std::set<std::string> build_opts;
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
+    build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
+    // New positions of  width(W), height(H), channel(C) and batch(D) based on permutation vector
+    build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
+    build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1));
+    build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2));
+    build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ? perm[3] : 3));
 
-    build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.emplace("-DDEPTH_IN=" + support::cpp11::to_string(input->info()->dimension(2)));
-
-    // Run [2, 0, 1] permute
-    if(_perm == PermutationVector{ 2U, 0U, 1U })
-    {
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_201", build_opts));
-    }
-    // Run [1, 2, 0] permute
-    else if(_perm == PermutationVector{ 1U, 2U, 0U })
-    {
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_120", build_opts));
-    }
-    // Run [3, 2, 0, 1] permute
-    else if(_perm == PermutationVector{ 3U, 2U, 0U, 1U })
-    {
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute_3201", build_opts));
-    }
-    else
-    {
-        ARM_COMPUTE_ERROR("Not supported.");
-    }
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("permute", build_opts.options()));
 
     // Configure  kernel window
     Window win = calculate_max_window(*input->info(), Steps());
diff --git a/tests/validation/CL/Permute.cpp b/tests/validation/CL/Permute.cpp
index a75b8cf..e1908ab 100644
--- a/tests/validation/CL/Permute.cpp
+++ b/tests/validation/CL/Permute.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018 ARM Limited.
+ * Copyright (c) 2018-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -42,16 +42,29 @@
 {
 namespace
 {
-const auto PermuteVectors = framework::dataset::make("PermutationVector",
+const auto PermuteVectors3 = framework::dataset::make("PermutationVector",
 {
     PermutationVector(2U, 0U, 1U),
     PermutationVector(1U, 2U, 0U),
-    PermutationVector(3U, 2U, 0U, 1U)
+    PermutationVector(0U, 1U, 2U),
+    PermutationVector(0U, 2U, 1U),
+    PermutationVector(1U, 0U, 2U),
+    PermutationVector(2U, 1U, 0U),
 });
+const auto PermuteVectors4 = framework::dataset::make("PermutationVector",
+{
+    PermutationVector(3U, 2U, 0U, 1U),
+    PermutationVector(3U, 2U, 1U, 0U),
+    PermutationVector(2U, 3U, 1U, 0U),
+    PermutationVector(1U, 3U, 2U, 0U),
+    PermutationVector(3U, 1U, 2U, 0U),
+    PermutationVector(3U, 0U, 2U, 1U),
+    PermutationVector(0U, 3U, 2U, 1U)
+});
+const auto PermuteVectors         = concat(PermuteVectors3, PermuteVectors4);
 const auto PermuteInputLayout     = framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC });
 const auto PermuteParametersSmall = concat(concat(datasets::Small2DShapes(), datasets::Small3DShapes()), datasets::Small4DShapes()) * PermuteInputLayout * PermuteVectors;
 const auto PermuteParametersLarge = datasets::Large4DShapes() * PermuteInputLayout * PermuteVectors;
-
 } // namespace
 TEST_SUITE(CL)
 TEST_SUITE(Permute)
@@ -59,47 +72,49 @@
 // *INDENT-OFF*
 // clang-format off
 DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
-                                                framework::dataset::make("InputInfo",{
-                                                                                        TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
-                                                                                        TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
-                                                                                        TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
-                                                                                        TensorInfo(TensorShape(1U, 7U), 1, DataType::U8),              // invalid input size
-                                                                                        TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // valid
-                                                                                        TensorInfo(TensorShape(27U, 13U, 37U, 2U), 1, DataType::F32),  // valid
-                                                                                        TensorInfo(TensorShape(27U, 13U, 37U, 2U), 1, DataType::F32),  // valid
-                                                                                        TensorInfo(TensorShape(128U, 64U, 21U, 2U), 1, DataType::QASYMM8), // permutation not supported
-                                                                                        TensorInfo(TensorShape(128U, 64U, 21U, 2U), 1, DataType::F32), // permutation not supported
-                                                                                        TensorInfo(TensorShape(128U, 64U, 21U, 2U), 1, DataType::F32), // permutation not supported
-                                                                                        TensorInfo(TensorShape(128U, 64U, 21U, 2U), 1, DataType::U16), // permutation not supported
-                                                                                    }),
-                                                framework::dataset::make("OutputInfo", {
-                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),
-                                                                                        TensorInfo(TensorShape(5U, 5U, 7U, 3U), 1, DataType::U16),
-                                                                                        TensorInfo(TensorShape(7U, 7U, 7U, 3U), 1, DataType::U16),
-                                                                                        TensorInfo(TensorShape(5U, 7U), 1, DataType::U8),
-                                                                                        TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),
-                                                                                        TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(2U, 37U, 27U, 13U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(128U, 64U, 21U, 2U), 1, DataType::QASYMM8),
-                                                                                        TensorInfo(TensorShape(128U, 64U, 21U, 2U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(21U, 64U, 2U, 128U), 1, DataType::F32),
-                                                                                        TensorInfo(TensorShape(2U, 21U, 64U, 128U), 1, DataType::U16),
-                                                                                    })),
-                                                framework::dataset::make("PermutationVector", {
-                                                                                                PermutationVector(2U, 1U, 0U),
-                                                                                                PermutationVector(2U, 2U, 1U),
-                                                                                                PermutationVector(1U, 1U, 1U),
-                                                                                                PermutationVector(2U, 0U, 1U),
-                                                                                                PermutationVector(2U, 0U, 1U),
-                                                                                                PermutationVector(1U, 2U, 0U),
-                                                                                                PermutationVector(3U, 2U, 0U, 1U),
-                                                                                                PermutationVector(2U, 3U, 1U, 0U),
-                                                                                                PermutationVector(1U, 1U, 1U, 1U),
-                                                                                                PermutationVector(2U, 1U, 3U, 0U),
-                                                                                                PermutationVector(3U, 2U, 1U, 0U),
-                                                                                    })),
-                                                framework::dataset::make("Expected", { false, false, false, false, true, true, true, false, false, false, false })),
-                                            input_info, output_info, perm_vect, expected)
+        framework::dataset::make("InputInfo",{
+                TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // valid
+                TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
+                TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
+                TensorInfo(TensorShape(1U, 7U), 1, DataType::U8),              // invalid input size
+                TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // valid
+                TensorInfo(TensorShape(27U, 13U, 37U, 2U), 1, DataType::F32),  // valid
+                TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),     // permutation not supported
+                TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::S16),     // valid
+                TensorInfo(TensorShape(27U, 13U, 37U, 2U), 1, DataType::F32),  // permutation not supported
+                TensorInfo(TensorShape(27U, 13U, 37U, 2U), 1, DataType::F32),  // valid
+                TensorInfo(TensorShape(27U, 13U, 37U, 2U), 1, DataType::F32)   // permutation not supported
+
+        }),
+        framework::dataset::make("OutputInfo", {
+                TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),
+                TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),
+                TensorInfo(TensorShape(7U, 7U, 5U, 3U), 1, DataType::U16),
+                TensorInfo(TensorShape(5U, 7U), 1, DataType::U8),
+                TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),
+                TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32),
+                TensorInfo(TensorShape(5U, 7U, 7U, 3U), 1, DataType::U16),
+                TensorInfo(TensorShape(3U, 5U, 7U, 7U), 1, DataType::S16),
+                TensorInfo(TensorShape(13U, 37U, 27U, 2U), 1, DataType::F32),
+                TensorInfo(TensorShape(37U, 2U, 13U, 27U), 1, DataType::F32),
+                TensorInfo(TensorShape(37U, 2U, 13U, 27U), 1, DataType::F32)
+
+        })),
+        framework::dataset::make("PermutationVector", {
+                PermutationVector(2U, 1U, 0U),
+                PermutationVector(2U, 2U, 1U),
+                PermutationVector(1U, 1U, 1U),
+                PermutationVector(2U, 0U, 1U),
+                PermutationVector(2U, 0U, 1U),
+                PermutationVector(1U, 2U, 0U),
+                PermutationVector(3U, 2U, 0U, 1U),
+                PermutationVector(3U, 2U, 0U, 1U),
+                PermutationVector(2U, 3U, 1U, 0U),
+                PermutationVector(2U, 3U, 1U, 0U),
+                PermutationVector(0U, 0U, 0U, 1000U)
+        })),
+        framework::dataset::make("Expected", { true, false, false, false, true, true, false, true, false, true, false })),
+        input_info, output_info, perm_vect, expected)
 {
     ARM_COMPUTE_EXPECT(bool(CLPermute::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), perm_vect)) == expected, framework::LogLevel::ERRORS);
 }
@@ -129,50 +144,61 @@
     validate(dst.info()->valid_region(), valid_region);
 }
 
+#ifndef DOXYGEN_SKIP_THIS
+
 template <typename T>
 using CLPermuteFixture = PermuteValidationFixture<CLTensor, CLAccessor, CLPermute, T>;
 
 TEST_SUITE(U8)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPermuteFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(PermuteParametersSmall, framework::dataset::make("DataType", DataType::U8)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPermuteFixture<uint8_t>, framework::DatasetMode::PRECOMMIT,
+                       PermuteParametersSmall * framework::dataset::make("DataType", DataType::U8))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLPermuteFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(PermuteParametersLarge, framework::dataset::make("DataType", DataType::U8)))
+
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPermuteFixture<uint8_t>, framework::DatasetMode::NIGHTLY,
+                       PermuteParametersLarge * framework::dataset::make("DataType", DataType::U8))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // U8
 
 TEST_SUITE(U16)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPermuteFixture<uint16_t>, framework::DatasetMode::PRECOMMIT, combine(PermuteParametersSmall, framework::dataset::make("DataType", DataType::U16)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPermuteFixture<uint16_t>, framework::DatasetMode::PRECOMMIT,
+                       PermuteParametersSmall * framework::dataset::make("DataType", DataType::U16))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLPermuteFixture<uint16_t>, framework::DatasetMode::NIGHTLY, combine(PermuteParametersLarge, framework::dataset::make("DataType", DataType::U16)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPermuteFixture<uint16_t>, framework::DatasetMode::NIGHTLY,
+                       PermuteParametersLarge * framework::dataset::make("DataType", DataType::U16))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // U16
 
 TEST_SUITE(U32)
-FIXTURE_DATA_TEST_CASE(RunSmall, CLPermuteFixture<uint32_t>, framework::DatasetMode::PRECOMMIT, combine(PermuteParametersSmall, framework::dataset::make("DataType", DataType::U32)))
+FIXTURE_DATA_TEST_CASE(RunSmall, CLPermuteFixture<uint32_t>, framework::DatasetMode::PRECOMMIT,
+                       PermuteParametersSmall * framework::dataset::make("DataType", DataType::U32))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, CLPermuteFixture<uint32_t>, framework::DatasetMode::NIGHTLY, combine(PermuteParametersLarge, framework::dataset::make("DataType", DataType::U32)))
+FIXTURE_DATA_TEST_CASE(RunLarge, CLPermuteFixture<uint32_t>, framework::DatasetMode::NIGHTLY,
+                       PermuteParametersLarge * framework::dataset::make("DataType", DataType::U32))
 {
     // Validate output
     validate(CLAccessor(_target), _reference);
 }
-TEST_SUITE_END()
+TEST_SUITE_END() // U32
 
-TEST_SUITE_END()
-TEST_SUITE_END()
+#endif /* DOXYGEN_SKIP_THIS */
+
+TEST_SUITE_END() // Permute
+TEST_SUITE_END() // CL
 } // namespace validation
 } // namespace test
 } // namespace arm_compute