COMPMID-1470 Add auto-init of the output in NECol2Im

The output of NECol2Im is already auto-initialized.
This patch calls ShapeCalculator instead of computing the shape inside the kernel, adds validate_and_configure_window, and standardizes the way convolved dims are passed (previously NEON used Size2D while CL passed a pair of unsigned int values; both implementations now use Size2D).

Change-Id: I795696e1b6532f57847c3186c1b532c09f5a25da
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/145345
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele DiGiorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/CL/kernels/CLCol2ImKernel.h b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
index 2a18ae0..948b412 100644
--- a/arm_compute/core/CL/kernels/CLCol2ImKernel.h
+++ b/arm_compute/core/CL/kernels/CLCol2ImKernel.h
@@ -71,7 +71,7 @@
      * @param[in]  convolved_dims Output convolved dimensions.
      * @param[in]  num_groups     (Optional) Number of groups when performing a grouped convolution
      */
-    void configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups = 1);
+    void configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CLCol2ImKernel
      *
      * @param[in] input          The input tensor to convert. Data types supported: QASYMM8/F16/F32
@@ -82,7 +82,7 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups = 1);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups = 1);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -90,7 +90,7 @@
 public:
     const ICLTensor *_input;
     ICLTensor       *_output;
-    std::pair<unsigned int, unsigned int> _convolved_dims;
+    Size2D           _convolved_dims;
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLCOL2IMKERNEL_H__ */
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index c40e711..09f558d 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -141,18 +141,18 @@
 
     return shape_vector_sum_row;
 }
-inline TensorShape compute_col2im_shape(const ITensorInfo &input, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups = 1)
+inline TensorShape compute_col2im_shape(const ITensorInfo &input, const Size2D &convolved_dims, bool batch_size_on_z, unsigned int num_groups = 1)
 {
     ARM_COMPUTE_ERROR_ON(num_groups == 0);
-    ARM_COMPUTE_ERROR_ON(input.tensor_shape()[1] != (convolved_dims.first * convolved_dims.second));
+    ARM_COMPUTE_ERROR_ON(input.tensor_shape()[1] != (convolved_dims.area()));
     ARM_COMPUTE_ERROR_ON((num_groups > 1) && input.tensor_shape()[2] != num_groups);
 
     TensorShape col2im_shape{ input.tensor_shape() };
-    col2im_shape.set(0, convolved_dims.first);
-    col2im_shape.set(1, convolved_dims.second);
+    col2im_shape.set(0, convolved_dims.width);
+    col2im_shape.set(1, convolved_dims.height);
     col2im_shape.set(2, input.tensor_shape()[0] * num_groups);
 
-    const unsigned int batch_idx = (num_groups == 1) ? 2 : 3;
+    const unsigned int batch_idx = (batch_size_on_z && num_groups == 1) ? 2 : 3;
     col2im_shape.set(3, input.tensor_shape()[batch_idx]);
 
     return col2im_shape;
diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp
index 40032f9..74bbb9b 100644
--- a/src/core/CL/kernels/CLCol2ImKernel.cpp
+++ b/src/core/CL/kernels/CLCol2ImKernel.cpp
@@ -40,7 +40,7 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -49,7 +49,7 @@
     // Checks performed when output is configured
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, num_groups));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, true, num_groups));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW");
@@ -58,11 +58,11 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, num_groups)).set_data_layout(DataLayout::NCHW));
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW));
 
     const unsigned int num_elems_read_per_iteration = 8;
 
@@ -87,7 +87,7 @@
 {
 }
 
-void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &convolved_dims, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -105,7 +105,7 @@
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input->info()->element_size()));
     build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(input->info()->dimension(0)));
-    build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.first));
+    build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width));
     build_opts.add_option_if(num_groups > 1, "-DGROUPING");
 
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("col2im", build_opts.options()));
@@ -130,7 +130,7 @@
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
 }
 
-Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups)
+Status CLCol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims, num_groups));
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
index bb8e758..d6517ac 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.cpp
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -29,26 +29,17 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include <arm_neon.h>
 #include <cstddef>
 #include <cstdint>
 
 using namespace arm_compute;
+using namespace misc::shape_calculator;
 
 namespace
 {
-TensorShape get_output_shape(const ITensorInfo *input, const Size2D &convolved_dims)
-{
-    TensorShape output_shape = input->tensor_shape();
-    output_shape.set(0, convolved_dims.width);
-    output_shape.set(1, convolved_dims.height);
-    output_shape.set(2, input->tensor_shape()[0]);
-    output_shape.set(3, input->tensor_shape()[3]); // For NEON the batch size is on the fourth dimension of the input tensor
-
-    return output_shape;
-}
-
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
 {
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use NEON FP16 instructions.
@@ -60,12 +51,28 @@
     // Validate configured output
     if(output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, convolved_dims));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_col2im_shape(*input, convolved_dims, false));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
     return Status{};
 }
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &convolved_dims)
+{
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_col2im_shape(*input, convolved_dims, false)));
+
+    // Configure kernel window
+    Window win = calculate_max_window(*input, Steps());
+
+    // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
+    Coordinates coord;
+    coord.set_num_dimensions(output->num_dimensions());
+    output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
 } // namespace
 
 template <typename T>
@@ -102,11 +109,6 @@
 void NECol2ImKernel::configure(const ITensor *input, ITensor *output, const Size2D &convolved_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(get_output_shape(input->info(), convolved_dims)));
-
-    // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), convolved_dims));
 
     _input          = input;
@@ -130,19 +132,15 @@
     }
 
     // Configure kernel window
-    Window win = calculate_max_window(*input->info(), Steps());
-
-    // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped
-    Coordinates coord;
-    coord.set_num_dimensions(output->info()->num_dimensions());
-    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
-
-    INEKernel::configure(win);
+    auto win_config = validate_and_configure_window(input->info(), output->info(), convolved_dims);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    INEKernel::configure(win_config.second);
 }
 
 Status NECol2ImKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &convolved_dims)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, convolved_dims));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), convolved_dims).first);
     return Status{};
 }
 
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 782fe71..c9daea4 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -285,7 +285,7 @@
         if(input->info()->data_layout() == DataLayout::NCHW)
         {
             // Configure and tune Col2Im
-            _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, std::make_pair(conv_w, conv_h), num_groups);
+            _col2im_kernel.configure(_is_quantized ? gemm_output_staged_to_use : gemm_output_to_use, output, Size2D(conv_w, conv_h), num_groups);
             CLScheduler::get().tune_kernel_static(_col2im_kernel);
         }
         else
@@ -443,7 +443,7 @@
         {
             ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(is_quantized ? gemm_output_staged_to_use : gemm_output_to_use,
                                                                  output,
-                                                                 std::make_pair(conv_w, conv_h), num_groups));
+                                                                 Size2D(conv_w, conv_h), num_groups));
         }
     }
 
diff --git a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
index 40bf032..5c6bef9 100644
--- a/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLLocallyConnectedLayer.cpp
@@ -122,7 +122,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLIm2ColKernel::validate(input, &input_im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, has_bias));
     ARM_COMPUTE_RETURN_ON_ERROR(CLWeightsReshapeKernel::validate(weights, biases, &weights_reshaped_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLLocallyConnectedMatrixMultiplyKernel::validate(&input_im2col_reshaped_info, &weights_reshaped_info, &gemm_output_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, std::make_pair(conv_w, conv_h)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCol2ImKernel::validate(&gemm_output_info, output, Size2D(conv_w, conv_h)));
 
     return Status{};
 }
@@ -163,7 +163,7 @@
     _input_im2col_kernel.configure(input, &_input_im2col_reshaped, Size2D(kernel_width, kernel_height), conv_info, _has_bias);
     _weights_reshape_kernel.configure(weights, biases, &_weights_reshaped);
     _mm_kernel.configure(&_input_im2col_reshaped, &_weights_reshaped, &_gemm_output);
-    _output_col2im_kernel.configure(&_gemm_output, output, std::make_pair(conv_w, conv_h));
+    _output_col2im_kernel.configure(&_gemm_output, output, Size2D(conv_w, conv_h));
 
     // Allocate intermediate tensors
     _input_im2col_reshaped.allocator()->allocate();
diff --git a/src/runtime/CL/tuners/BifrostTuner.cpp b/src/runtime/CL/tuners/BifrostTuner.cpp
index 2d52f33..59d73b4 100644
--- a/src/runtime/CL/tuners/BifrostTuner.cpp
+++ b/src/runtime/CL/tuners/BifrostTuner.cpp
@@ -134,7 +134,7 @@
     // via exhaustive autotuning over 30 representative tensor shapes.
     if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::G76))
     {
-        if((k._convolved_dims.first == 7) || (k._convolved_dims.first == 14))
+        if((k._convolved_dims.width == 7) || (k._convolved_dims.width == 14))
         {
             lws_hint = cl::NDRange(1, 7, 1);
         }
diff --git a/tests/validation/CL/Col2Im.cpp b/tests/validation/CL/Col2Im.cpp
index 6f1163c..1fea5c1 100644
--- a/tests/validation/CL/Col2Im.cpp
+++ b/tests/validation/CL/Col2Im.cpp
@@ -63,14 +63,14 @@
                framework::dataset::make("Expected", { false, false, false, true })),
                input_info, output_info, convolved_width, convolved_height, num_groups, expected)
 {
-    bool status = bool(CLCol2Im::validate(&input_info, &output_info, std::make_pair(convolved_width, convolved_height), num_groups));
+    bool status = bool(CLCol2Im::validate(&input_info, &output_info, Size2D(convolved_width, convolved_height), num_groups));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
 // *INDENT-ON*
 
 template <typename T>
-using CLCol2ImFixture = Col2ImValidationFixture<CLTensor, CLAccessor, CLCol2Im, T>;
+using CLCol2ImFixture = Col2ImValidationFixture<CLTensor, CLAccessor, CLCol2Im, T, true>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
diff --git a/tests/validation/fixtures/Col2ImFixture.h b/tests/validation/fixtures/Col2ImFixture.h
index ddc78a5..5488f8a 100644
--- a/tests/validation/fixtures/Col2ImFixture.h
+++ b/tests/validation/fixtures/Col2ImFixture.h
@@ -44,16 +44,16 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool batch_size_on_z>
 class Col2ImValidationFixture : public framework::Fixture
 {
 public:
     template <typename...>
     void setup(TensorShape input_shape, const unsigned int convolved_width, unsigned int convolved_height, unsigned int num_groups, DataType data_type)
     {
-        const std::pair<unsigned int, unsigned int> convolved_dims(convolved_width, convolved_height);
+        const Size2D convolved_dims(convolved_width, convolved_height);
 
-        const TensorShape output_shape = compute_col2im_shape(TensorInfo(input_shape, 1, data_type), convolved_dims, num_groups);
+        const TensorShape output_shape = compute_col2im_shape(TensorInfo(input_shape, 1, data_type), convolved_dims, batch_size_on_z, num_groups);
 
         _target    = compute_target(input_shape, output_shape, convolved_dims, num_groups, data_type);
         _reference = compute_reference(input_shape, output_shape, num_groups, data_type);
@@ -66,7 +66,7 @@
         library->fill_tensor_uniform(tensor, seed);
     }
 
-    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, std::pair<unsigned int, unsigned int> convolved_dims, unsigned int num_groups, DataType data_type)
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, const Size2D &convolved_dims, unsigned int num_groups, DataType data_type)
     {
         // Create tensors
         TensorType src = create_tensor<TensorType>(input_shape, data_type);