COMPMID-1026 - Add support for 4x4 output tile in CLWinogradConvolutionLayer

The performance achieved can be found at the following confluence page:
https://confluence.arm.com/display/MLENG/GEMM-based+convolution+vs+Winograd-based+convolution+on+OpenCL

Change-Id: I4b690cfdd4eb4ff0cd17b14fdd49ccaa1d1dc85c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/127729
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 0a0de7a..805a594 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -286,17 +286,23 @@
     else // The input tensors have not been reshaped
     {
         build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)));
+        build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
 
         // Create kernels according to the architecture, data type and input size.
         if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && is_data_type_float(data_type))
         {
-            kernel_name = "gemm_mm_floating_point_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
-            // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
-            // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
-            // FC6 and FC7 of AlexNet and VGG-16).
-            if(input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1 && data_type == DataType::F32)
+            kernel_name = "gemm_mm_floating_point";
+
+            if(input0->info()->num_dimensions() != 1)
             {
-                kernel_name += "_1000";
+                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost";
+            }
+            else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32)
+            {
+                // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and
+                // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g.
+                // FC6 and FC7 of AlexNet and VGG-16).
+                kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000";
             }
 
             // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels
@@ -309,7 +315,6 @@
         }
         else // (MIDGARD and F32) or (F16)
         {
-            build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
             kernel_name = "gemm_mm_floating_point";
         }
         build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y()));
diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
index d3a33c0..41b3ac5 100644
--- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp
@@ -55,9 +55,11 @@
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U));
-    ARM_COMPUTE_RETURN_ERROR_ON(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U) && output_tile_size != Size2D(4U, 4U));
-    ARM_COMPUTE_RETURN_ERROR_ON(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd filter transform only supports 3x3 and 5x5 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U)
+                                    && output_tile_size != Size2D(4U, 4U),
+                                    "Winograd filter transform only supports 2x2 or 4x4 output tile for 3x3 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd filter transform only supports 4x4 output tile for 5x5 kernels");
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
 
diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
index a47590d..febd22b 100644
--- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp
@@ -47,7 +47,9 @@
     const Size2D        kernel_size      = winograd_info.kernel_size;
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd input transform only supports 3x3 and 5x5 kernels");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U), "Winograd input transform only supports 2x2 output tile for 3x3 kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U)
+                                    && output_tile_size != Size2D(4U, 4U),
+                                    "Winograd input transform only supports 2x2 or 4x4 output tile for 3x3 kernels");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd input transform only supports 4x4 output tile for 5x5 kernels");
     ARM_COMPUTE_UNUSED(conv_info);
     ARM_COMPUTE_UNUSED(output_tile_size);
@@ -111,7 +113,6 @@
     const int num_elements_y = input->info()->dimension(1) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom();
 
     // Check if we need to extend the right or bottom border
-    // FIXME: This actually is not needed. Added just for validating the result;
     const unsigned int extra_border_right  = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1);
     const unsigned int extra_border_bottom = ((num_elements_y % output_tile_size.height) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.height - 1);
 
@@ -137,19 +138,13 @@
     std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
 
     // Check optimized kernel if output_dims == 2x2
-    if(output_tile_size.width == 2 && output_tile_size.height == 2)
+    if(output_tile_size == Size2D(2U, 2U))
     {
-        if((_input->info()->dimension(2) % 2) != 0)
-        {
-            _step_z = 1;
-        }
-        else
-        {
-            _step_z   = 2;
-            _lws_hint = cl::NDRange(1, 1, 8);
-        }
+        _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2;
     }
 
+    _lws_hint = cl::NDRange(1, 1, 8);
+
     // Append stepz and data layout
     kernel_name += "_stepz";
     kernel_name += support::cpp11::to_string(_step_z);
diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
index 8ee1a82..c5d2528 100644
--- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
+++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp
@@ -58,6 +58,7 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Only 3x3 and 5x5 kernels are supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(2U, 2U) && input->dimension(2) != 16, "Wrong number of batches");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 36, "Wrong number of batches");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 64, "Wrong number of batches");
 
     // Compute number of elements to process in the X and Y direction
@@ -67,7 +68,6 @@
     const int num_tiles_y    = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height));
 
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles_x * num_tiles_y)));
-    ARM_COMPUTE_UNUSED(output_tile_size);
 
     if(bias != nullptr)
     {
@@ -207,4 +207,4 @@
         enqueue(queue, *this, slice, _lws_hint);
     }
     while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
-}
\ No newline at end of file
+}