COMPMID-1201 - Implementing Winograd Convolution Layer 1x3 and 3x1 kernels on OpenCL

Change-Id: I39667bab49daa4da009694163274a59fd3574c73
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/137595
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Giorgio Arena <giorgio.arena@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
diff --git a/arm_compute/core/CL/CLHelpers.h b/arm_compute/core/CL/CLHelpers.h
index 1054f9a..3b025cc 100644
--- a/arm_compute/core/CL/CLHelpers.h
+++ b/arm_compute/core/CL/CLHelpers.h
@@ -109,5 +109,15 @@
  * @return True if the extension is supported
  */
 bool dot8_supported(const cl::Device &device);
+
+/** Check whether a Winograd configuration (defined by the output tile, the kernel size and the data layout) is supported on OpenCL
+ *
+ * @param[in] output_tile Output tile size of the Winograd filtering algorithm
+ * @param[in] kernel_size Kernel size of the Winograd filtering algorithm
+ * @param[in] data_layout Data layout of the input tensor
+ *
+ * @return True if the configuration is supported, false otherwise
+ */
+bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout);
 }
 #endif /* __ARM_COMPUTE_CLHELPERS_H__ */
diff --git a/arm_compute/core/Helpers.h b/arm_compute/core/Helpers.h
index 7d922ae..a3cbfb9 100644
--- a/arm_compute/core/Helpers.h
+++ b/arm_compute/core/Helpers.h
@@ -111,6 +111,28 @@
 };
 }
 
+/** Calculate the number of output tiles required by the Winograd convolution layer. This utility function can be used by the Winograd input transform
+ *  to know the number of tiles along the x and y directions. The result is clamped so that it never exceeds the input dimensions.
+ *
+ * @param[in] in_dims          Spatial dimensions (width and height) of the input tensor of the convolution layer
+ * @param[in] kernel_size      Kernel size (width and height)
+ * @param[in] output_tile_size Size (width and height) of a single output tile
+ * @param[in] conv_info        Convolution info (i.e. pad, stride,...). Only the left, right, top and bottom padding are read by this function
+ *
+ * @return The number of output tiles of size "output_tile_size" along the x (width) and y (height) directions
+ */
+inline Size2D compute_winograd_convolution_tiles(const Size2D &in_dims, const Size2D &kernel_size, const Size2D &output_tile_size, const PadStrideInfo &conv_info)
+{
+    int num_tiles_x = std::ceil((in_dims.width - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>(output_tile_size.width));
+    int num_tiles_y = std::ceil((in_dims.height - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>(output_tile_size.height));
+
+    // Clamp to the input extent: with a 1D convolution, padding given on a dimension where kernel and output tile have size 1 would otherwise yield more tiles than input elements
+    num_tiles_x = std::min(num_tiles_x, static_cast<int>(in_dims.width));
+    num_tiles_y = std::min(num_tiles_y, static_cast<int>(in_dims.height));
+
+    return Size2D(num_tiles_x, num_tiles_y);
+}
+
 /** Computes bilinear interpolation using the pointer to the top-left pixel and the pixel's distance between
  * the real coordinates and the smallest following integer coordinates. Input must be in single channel format.
  *
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 115cbe6..2213876 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -255,12 +255,14 @@
     const size_t idx_h = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::HEIGHT);
     const size_t idx_c = get_data_layout_dimension_index(input.data_layout(), DataLayoutDimension::CHANNEL);
 
-    // Compute height
-    const unsigned int num_tiles_x = std::ceil((input.tensor_shape()[idx_w] - (kernel_size.width - 1) + conv_info.pad_left() + conv_info.pad_right()) / static_cast<float>(output_tile_size.width));
-    const unsigned int num_tiles_y = std::ceil((input.tensor_shape()[idx_h] - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom()) / static_cast<float>(output_tile_size.height));
+    // Compute the number of output tiles of size "output_tile_size" along the x and y directions
+    const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(input.tensor_shape()[idx_w], input.tensor_shape()[idx_h]),
+                                                                kernel_size,
+                                                                output_tile_size,
+                                                                conv_info);
 
     const unsigned int width  = input.tensor_shape()[idx_c];
-    const unsigned int height = num_tiles_x * num_tiles_y;
+    const unsigned int height = num_tiles.area();
     const unsigned int depth  = input_tile_size.area();
 
     TensorShape output_shape{ input.tensor_shape() };