COMPMID-1009 Support 4x4 output tile for Winograd Filter Transform on OpenCL.

Change-Id: I68c6453e0f192de659582404f109a89616b9fbb9
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/124811
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index 655b82b..5a03332 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -44,17 +44,18 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &output_tile)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != 3);
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != input->dimension(1));
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_tile != Size2D(2U, 2U) && output_tile != Size2D(4U, 4U));
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input));
+        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, output_tile));
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -63,8 +64,9 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &output_tile)
 {
+    ARM_COMPUTE_UNUSED(output_tile);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     constexpr unsigned int num_elems_processed_per_iteration_x = 3;
@@ -90,35 +92,36 @@
 {
 }
 
-void CLWinogradFilterTransformKernel::configure(const ICLTensor *input, ICLTensor *output)
+void CLWinogradFilterTransformKernel::configure(const ICLTensor *input, ICLTensor *output, const Size2D &output_tile)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Output tensor auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input->info())));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input->info(), output_tile)));
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), output_tile));
 
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(2)));
 
     // Create kernel
-    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("winograd_filter_transform_2x2_3x3_nchw", build_opts.options()));
+    std::string kernel_name = std::string("winograd_filter_transform_") + output_tile.to_string() + std::string("_3x3_nchw");
+    _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
 
     _input  = input;
     _output = output;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info());
+    auto win_config = validate_and_configure_window(input->info(), output->info(), output_tile);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure(win_config.second);
 }
 
-Status CLWinogradFilterTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output)
+Status CLWinogradFilterTransformKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &output_tile)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, output_tile));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), output_tile).first);
 
     return Status{};
 }