COMPMID-802 Add NHWC data format support for NEON im2col.

Change-Id: I86e678179106a2b83d1c6a7cfe562df91b0f9eb2
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/124000
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Pablo Tello <pablo.tello@arm.com>
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp
index 348722c..5e165a6 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp
@@ -53,27 +53,26 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::QASYMM8 && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
 
+    TensorShape expected_output_shape;
     if(is_flatten) /* Called by FlattenLayer */
     {
-        size_t flatten_shape = input->tensor_shape().x() * input->tensor_shape().y() * input->tensor_shape().z();
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != flatten_shape);
+        expected_output_shape = misc::shape_calculator::compute_im2col_flatten_shape(input);
     }
     else if(!is_fully_connected) /* Called by ConvolutionLayer */
     {
-        std::pair<unsigned int, unsigned int> out_dims = scaled_dimensions(input->dimension(0), input->dimension(1), kernel_dims.width, kernel_dims.height, conv_info, dilation);
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != (input->dimension(2) * kernel_dims.area() + (has_bias ? 1 : 0)));
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != (out_dims.first * out_dims.second));
-        ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(2) != 1);
+        expected_output_shape = misc::shape_calculator::compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation);
     }
     else /* Called by FullyConnectedLayer */
     {
         const int num_batch_dimensions = std::max(0, static_cast<int>(output->tensor_shape().num_dimensions()) - 1);
         const int num_input_dimensions = input->tensor_shape().num_dimensions() - num_batch_dimensions;
 
-        TensorInfo expected_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_im2col_shape(input, num_input_dimensions));
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+        expected_output_shape = misc::shape_calculator::compute_im2col_fc_shape(input, num_input_dimensions);
     }
 
+    TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
+
     return Status{};
 }
 
@@ -194,12 +193,17 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const int kernel_depth   = _input->info()->dimension(2);
-    const int input_w        = _input->info()->dimension(0);
-    const int input_h        = _input->info()->dimension(1);
-    const int input_stride_x = _input->info()->strides_in_bytes().x();
-    const int input_stride_y = _input->info()->strides_in_bytes().y();
-    const int input_stride_z = _input->info()->strides_in_bytes().z();
+    const DataLayout   data_layout = _input->info()->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
+    const int kernel_depth   = _input->info()->dimension(channel_idx);
+    const int input_w        = _input->info()->dimension(width_idx);
+    const int input_h        = _input->info()->dimension(height_idx);
+    const int input_stride_x = _input->info()->strides_in_bytes()[width_idx];
+    const int input_stride_y = _input->info()->strides_in_bytes()[height_idx];
+    const int input_stride_z = _input->info()->strides_in_bytes()[channel_idx];
     const int offset         = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().offset : 0;
 
     int pad_left = 0;
@@ -222,9 +226,9 @@
 
     // Setup output window
     Window window_out(window);
-    window_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _output->info()->strides_in_bytes().y() / _output->info()->element_size()));
-    window_out.set(Window::DimY, Window::Dimension(window.y().start() * _convolved_dims.first, window.y().end() * _convolved_dims.first, _convolved_dims.first));
-    window_out.set(Window::DimZ, Window::Dimension(0, 1, 1));
+    window_out.set(width_idx, Window::Dimension(0, _output->info()->dimension(width_idx), _output->info()->strides_in_bytes()[width_idx + 1] / _output->info()->strides_in_bytes()[width_idx]));
+    window_out.set(height_idx, Window::Dimension(window[height_idx].start() * _convolved_dims.first, window[height_idx].end() * _convolved_dims.first, _convolved_dims.first));
+    window_out.set(channel_idx, Window::Dimension(0, 1, 1));
 
     // Create iterators
     Iterator in(_input, window_in);
@@ -232,8 +236,8 @@
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
-        const int top_left_x = id.x() * stride_x + start_x;
-        const int top_left_y = id.y() * stride_y + start_y;
+        const int top_left_x = id[width_idx] * stride_x + start_x;
+        const int top_left_y = id[height_idx] * stride_y + start_y;
 
         // Get pointers
         const uint8_t *const input_ptr  = in.ptr();
@@ -327,13 +331,18 @@
     ARM_COMPUTE_UNUSED(is_fully_connected, is_flatten);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten, dilation));
 
+    const DataLayout   data_layout = input->info()->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+
     _input          = input;
     _output         = output;
     _conv_info      = conv_info;
     _kernel_width   = kernel_dims.width;
     _kernel_height  = kernel_dims.height;
     _dilation       = dilation;
-    _convolved_dims = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1),
+    _convolved_dims = scaled_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
                                         _kernel_width, _kernel_height,
                                         _conv_info, _dilation);
     _has_bias = has_bias;
@@ -402,9 +411,9 @@
                 ARM_COMPUTE_ERROR("Data type not supported");
                 break;
         }
-        window.set(Window::DimX, Window::Dimension(0, _convolved_dims.first, 1));
-        window.set(Window::DimY, Window::Dimension(0, _convolved_dims.second, 1));
-        window.set(Window::DimZ, Window::Dimension(0, 1, 1));
+        window.set(width_idx, Window::Dimension(0, _convolved_dims.first, 1));
+        window.set(height_idx, Window::Dimension(0, _convolved_dims.second, 1));
+        window.set(channel_idx, Window::Dimension(0, 1, 1));
     }
 
     // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 676706f..5dd1f00 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -114,7 +114,7 @@
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for im2col
-    TensorShape shape_im2col = compute_im2col_shape(input->info());
+    TensorShape shape_im2col = compute_im2col_fc_shape(input->info());
     _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
 
     // Configure im2col kernel
@@ -244,7 +244,7 @@
     bool            is_quantized     = is_data_type_quantized_asymmetric(input->data_type());
     const GPUTarget gpu_target       = CLScheduler::get().target();
 
-    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input)));
+    const ITensorInfo &im2col_input     = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input)));
     const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
     const ITensorInfo &gemmlowp_output  = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_data_type(DataType::S32));
 
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index b310ad3..958d081 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -188,7 +188,7 @@
 
     if(_linearize_input)
     {
-        _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_shape(input->info(), num_input_dimensions)));
+        _im2col_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_im2col_fc_shape(input->info(), num_input_dimensions)));
 
         // Configure im2col kernel
         _memory_group.manage(&_im2col_output);
@@ -288,7 +288,7 @@
 
     if(linearize_input)
     {
-        im2col_output->set_tensor_shape(compute_im2col_shape(input, num_input_dimensions));
+        im2col_output->set_tensor_shape(compute_im2col_fc_shape(input, num_input_dimensions));
 
         ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, im2col_output.get(), Size2D(1, 1), PadStrideInfo(1, 1, 0, 0), false, true));
 
diff --git a/src/runtime/NEON/functions/NEIm2Col.cpp b/src/runtime/NEON/functions/NEIm2Col.cpp
index b962db9..6b95cb0 100644
--- a/src/runtime/NEON/functions/NEIm2Col.cpp
+++ b/src/runtime/NEON/functions/NEIm2Col.cpp
@@ -23,19 +23,30 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEIm2Col.h"
 
-#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "support/ToolchainSupport.h"
 
 using namespace arm_compute;
 
-void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
+NEIm2Col::NEIm2Col()
+    : _kernel(), _y_dim(1)
 {
-    auto k = arm_compute::support::cpp14::make_unique<NEIm2ColKernel>();
-    k->configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected);
-    _kernel = std::move(k);
 }
 
-Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected)
+void NEIm2Col::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
 {
-    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected);
+    _y_dim = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+
+    _kernel.configure(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+}
+
+Status NEIm2Col::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, bool is_fully_connected, bool is_flatten)
+{
+    return NEIm2ColKernel::validate(input, output, kernel_dims, conv_info, has_bias, is_fully_connected, is_flatten);
+}
+
+void NEIm2Col::run()
+{
+    NEScheduler::get().schedule(&_kernel, _y_dim);
 }