Mixed data-layout testing on high priority operators

Change data layouts after the configure in validation tests for:
  - Scale
  - Pooling
  - FullyConnected
  - DepthwiseConvolution
  - DirectConvolution
  - FFTConvolution
  - WinogradConvolution
  - GEMMConvolution (Indirect GEMM included)
Extending fixtures

Fixes for new mixed data layout tests

Resolves: COMPMID-4162
Change-Id: I2f2eb2075f7e24ab3872249d88cadb57b82c5dde
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5326
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 09f9974..98b76c7 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -1217,7 +1217,7 @@
 
 NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel()
     : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0),
-      _num_elems_written_per_iteration(0)
+      _num_elems_written_per_iteration(0), _data_layout()
 {
 }
 
@@ -1234,13 +1234,14 @@
     _weights     = weights;
     _output      = output;
     _conv_info   = conv_info;
-    _kernel_size = weights->info()->dimension(get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::WIDTH));
+    _data_layout = _input->info()->data_layout();
+    _kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
 
     const unsigned int conv_pad_left   = conv_info.pad_left();
     const unsigned int conv_pad_top    = conv_info.pad_top();
     const unsigned int conv_pad_right  = conv_info.pad_right();
     const unsigned int conv_pad_bottom = conv_info.pad_bottom();
-    if(_input->info()->data_layout() == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
     }
@@ -1294,9 +1295,9 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr);
 
-    const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_weights->info()->data_layout(), DataLayoutDimension::WIDTH));
+    const int kernel_size = _weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
 
-    if(_input->info()->data_layout() == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         switch(kernel_size)
         {
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
index 258def7..259eb68 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.h
@@ -103,6 +103,7 @@
     unsigned int   _num_weight_elems_read_per_row;
     unsigned int   _num_elems_read_per_iteration;
     unsigned int   _num_elems_written_per_iteration;
+    DataLayout     _data_layout;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H */
diff --git a/src/core/cpu/kernels/pooling/neon/fp16.cpp b/src/core/cpu/kernels/pooling/neon/fp16.cpp
index 314be37..1ecceaf 100644
--- a/src/core/cpu/kernels/pooling/neon/fp16.cpp
+++ b/src/core/cpu/kernels/pooling/neon/fp16.cpp
@@ -93,7 +93,7 @@
             // Store result
             vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
 
-            const uint32_t   offset_base    = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y);
+            const uint32_t   offset_base    = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
             const uint32_t   offset_x0      = (uint32_t)offset_base / sizeof(float16_t) + x_off;
             const uint32_t   offset_x1      = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
             const uint32_t   offset_x2      = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * src->info()->tensor_shape()[1];
@@ -132,7 +132,7 @@
             // Store result
             *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
 
-            const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y);
+            const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
             const uint32_t offset_x0   = (uint32_t)offset_base / sizeof(float16_t) + x_off;
             const uint32_t offset_x1   = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_right;
             const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_right * src->info()->tensor_shape()[1];
diff --git a/src/core/cpu/kernels/pooling/neon/fp32.cpp b/src/core/cpu/kernels/pooling/neon/fp32.cpp
index e319047..a2bd4a6 100644
--- a/src/core/cpu/kernels/pooling/neon/fp32.cpp
+++ b/src/core/cpu/kernels/pooling/neon/fp32.cpp
@@ -95,7 +95,7 @@
             // Store result
             vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
 
-            const uint32_t   offset_base  = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y);
+            const uint32_t   offset_base  = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
             const uint32_t   offset_x0    = (uint32_t)offset_base / sizeof(float) + x_off;
             const uint32_t   offset_x1    = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
             const uint32_t   offset_x2    = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * src->info()->tensor_shape()[1];
@@ -124,7 +124,7 @@
             // Store result
             *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
 
-            const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y);
+            const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
             const uint32_t offset_x0   = (uint32_t)offset_base / sizeof(float) + x_off;
             const uint32_t offset_x1   = (uint32_t)offset_x0 + in_stride_y / sizeof(float) - pad_right;
             const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float) - pad_right * src->info()->tensor_shape()[1];
diff --git a/src/core/cpu/kernels/pooling/neon/list.h b/src/core/cpu/kernels/pooling/neon/list.h
index 3435ee6..bec1536 100644
--- a/src/core/cpu/kernels/pooling/neon/list.h
+++ b/src/core/cpu/kernels/pooling/neon/list.h
@@ -59,7 +59,7 @@
 #undef DECLARE_POOLING_KERNEL
 
 template <typename T>
-inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y)
+inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout)
 {
     const int pad_left    = info.padding().left;
     const int pad_right   = info.padding().right;
@@ -70,7 +70,7 @@
     const int pad_horiz   = pad_left + pad_right;
     const int pad_vert    = pad_top + pad_bottom;
 
-    if(info.data_layout() == DataLayout::NCHW)
+    if(data_layout == DataLayout::NCHW)
     {
         const uint32_t offset_base = padded_offset
                                      - sizeof(T) * pad_horiz * id.y() * pool_stride_y                                            /* subtract padding elems per row */
diff --git a/src/core/cpu/kernels/pooling/neon/nchw/all.cpp b/src/core/cpu/kernels/pooling/neon/nchw/all.cpp
index 47ac7b4..80eac68 100644
--- a/src/core/cpu/kernels/pooling/neon/nchw/all.cpp
+++ b/src/core/cpu/kernels/pooling/neon/nchw/all.cpp
@@ -150,7 +150,7 @@
         *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
 
         // Calculate max data indice, which will be used in max unpool.
-        const uint32_t   offset_base              = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y);
+        const uint32_t   offset_base              = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
         const uint32_t   offset_top               = (uint32_t)(offset_base / sizeof(T));
         const uint32_t   offset_bottom            = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
         const uint32x2_t voffset_top              = { offset_top, offset_top + 1u };
diff --git a/src/core/cpu/kernels/scale/sve/qasymm8.cpp b/src/core/cpu/kernels/scale/sve/qasymm8.cpp
index c475ad6..c041f14 100644
--- a/src/core/cpu/kernels/scale/sve/qasymm8.cpp
+++ b/src/core/cpu/kernels/scale/sve/qasymm8.cpp
@@ -89,10 +89,9 @@
                                 BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
                                 bool align_corners, const Window &window)
 {
-    // Get data layout and width/height indices
-    const DataLayout data_layout = src->info()->data_layout();
-    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    // Data layout is NHWC
+    const int        idx_width   = 1;
+    const int        idx_height  = 2;
 
     // Compute the ratio between source height and destination height
     const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners);
diff --git a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp
index b39b75a..9df4301 100644
--- a/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp
+++ b/src/core/cpu/kernels/scale/sve/qasymm8_signed.cpp
@@ -89,10 +89,9 @@
                                        BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
                                        bool align_corners, const Window &window)
 {
-    // Get data layout and width/height indices
-    const DataLayout data_layout = src->info()->data_layout();
-    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    // Data layout is NHWC
+    const int        idx_width   = 1;
+    const int        idx_height  = 2;
 
     // Compute the ratio between source height and destination height
     const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), align_corners);
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index 8d2c81b..5ed8aa9 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -324,7 +324,7 @@
         {
             _output_multipliers.map();
             _output_shifts.map();
-            const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
+            const unsigned int idx_ofms = _needs_permute ? 2 : 0;
             quantization::compute_quantized_multipliers_and_shifts(_input->info(),
                                                                    _original_weights->info(),
                                                                    _output->info(),
@@ -529,7 +529,7 @@
         {
             _output_multipliers.map();
             _output_shifts.map();
-            const unsigned int idx_ofms = get_data_layout_dimension_index(_output->info()->data_layout(), DataLayoutDimension::CHANNEL);
+            const unsigned int idx_ofms = _needs_permute ? 2 : 0;
             quantization::compute_quantized_multipliers_and_shifts(_input->info(),
                                                                    _original_weights->info(),
                                                                    _output->info(),
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index dc3bbbe..941cb21 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -303,7 +303,7 @@
 NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
     : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
       _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
-      _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false)
+      _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false), _data_layout()
 {
 }
 
@@ -314,10 +314,10 @@
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
 
     // Get indices for the width and height
-    const DataLayout   data_layout = input->info()->data_layout();
-    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+    _data_layout                   = input->info()->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
 
     const Size2D   input_dims  = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
     const Size2D   kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
@@ -537,7 +537,7 @@
     const unsigned int max_num_threads = NEScheduler::get().num_threads();
 
     // Configure the kernel to transform the input tensor from NCHW -> NHWC
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         _memory_group.manage(&_input_nhwc);
         _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
@@ -554,7 +554,7 @@
     TensorInfo   input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
     _input_workspace.allocator()->init(input_workspace_info);
     _input_workspace.allocator()->allocate();
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         _input_nhwc.allocator()->allocate();
     }
@@ -570,7 +570,7 @@
 
     // Configure output transform function
     // The biases tensor has not been allocated at this point in time, the output transform will add the biases to the final result in the run() method
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         _memory_group.manage(&_output_nhwc);
         output_to_use = &_output_nhwc;
@@ -595,7 +595,7 @@
     _output_transformed.allocator()->allocate();
 
     // Reorder the convoluted output to ACL's ordering NCHW
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
         _output_nhwc.allocator()->allocate();
@@ -615,13 +615,11 @@
 
 void NEWinogradConvolutionLayer::run()
 {
-    const DataLayout data_layout = _input->info()->data_layout();
-
     prepare();
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
         _permute_input.run();
@@ -636,7 +634,7 @@
     // Transform output tensor to the spatial domain
     NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);
 
-    if(data_layout == DataLayout::NCHW)
+    if(_data_layout == DataLayout::NCHW)
     {
         // Reorder the convoluted output to ACL's ordering NCHW
         _permute_output.run();