MLBEDSW-7648: Fix bug with filter padding in conv2d

* Fix bug that caused filter padding not to be added proportionally
  to the hardware padding added to the IFM.
* Update the needed_total_padding function, which calculates hardware
  padding, to also handle cases in which the IFM width is not
  divisible by the stride width.
* Update the supported ops constraint on conv2d strides to mark ops
  with stride width > 3 and an IFM width not divisible by the
  optimization resize factor as unsupported (illustrated below).
* Update the unit tests that verify whether ops are reported as
  supported.
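
  For illustration, this is roughly how the new constraint plays out
  for a stride width of 20 (values mirror the updated unit tests; the
  helper is calc_resize_factor, now shared via utils.py):

      # IFM width 8: no usable resize factor, so the post-optimization
      # stride stays 20 (> 3) and the op is rejected.
      calc_resize_factor(ifm_width=8, stride_x=20)   # -> (1, 20)
      # IFM width 40: the whole stride becomes the resize factor,
      # leaving a post-optimization stride of 1, so the op is accepted.
      calc_resize_factor(ifm_width=40, stride_x=20)  # -> (20, 1)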

Change-Id: I62f14cca890b779ca787a9603fa37c873ad522f8
Signed-off-by: Raul Farkas <raul.farkas@arm.com>
diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md
index 0d42d9c..947b585 100644
--- a/SUPPORTED_OPS.md
+++ b/SUPPORTED_OPS.md
@@ -19,7 +19,7 @@
 # Supported Ops
 
 This file was automatically generated by Vela using the `--supported-ops-report` parameter.  
-Vela version: `3.8.1.dev3+gc66541d.d20230613`
+Vela version: `3.8.1.dev3+gc66541d`
 
 This file complies with
 [**Gitiles Markdown syntax**](https://github.com/google/gitiles/blob/master/Documentation/markdown.md)
@@ -155,7 +155,9 @@
 
 - Stride values for both width and height must be integer types
 - Dilation factor values for both width and height must be integer types
-- Stride width must be greater than or equal to 1 and stride height must be between 1 and 3
+- Stride width must be greater than or equal to 1.  
+        For stride widths greater than 3, the post-optimization stride needs to be less than or equal to 3.  
+        Stride height must be between 1 and 3.
 - Dilated kernel height must be in the range [1, 64]
 - Product of dilated kernel width and height must be in the range [1, 4096]
 - Weight tensor must be 8-bit
diff --git a/ethosu/vela/graph_optimiser_util.py b/ethosu/vela/graph_optimiser_util.py
index da3fe13..220ba1a 100644
--- a/ethosu/vela/graph_optimiser_util.py
+++ b/ethosu/vela/graph_optimiser_util.py
@@ -185,10 +185,11 @@
 
 
 def needed_total_padding(input_size, stride, filter_size):
-    out_size = (input_size + stride - 1) // stride
-    needed_input = (out_size - 1) * stride + filter_size
-    total_padding = max(0, needed_input - input_size)
-    return total_padding
+    """Compute hardware padding."""
+    if input_size % stride == 0:
+        return max(filter_size - stride, 0)
+
+    return max(filter_size - (input_size % stride), 0)
 
 
 # Set input/output tensor equivalence to the same id for memory operations
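
For reference, a quick sanity check of the rewritten helper above
(a minimal sketch; the values are illustrative only and chosen to hit
both branches):

    # IFM width divisible by the stride: pad = max(filter - stride, 0)
    assert needed_total_padding(input_size=40, stride=2, filter_size=3) == 1
    # IFM width not divisible: pad = max(filter - (width % stride), 0)
    assert needed_total_padding(input_size=133, stride=14, filter_size=7) == 0
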
diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py
index 4aca00d..cbad171 100644
--- a/ethosu/vela/test/test_tflite_supported_operators.py
+++ b/ethosu/vela/test/test_tflite_supported_operators.py
@@ -106,23 +106,24 @@
 
 
 @pytest.mark.parametrize(
-    "stride_w, stride_h, supported",
+    "ifm_shape, stride_w, stride_h, supported",
     [
-        [0, 20, False],
-        [20, 0, False],
-        [4, 3, True],
-        [4, 5, False],
-        [4, 9, False],
-        [3, 3, True],
-        [1, 1, True],
-        [20, 2, True],
-        [6, 3, True],
-        [8, 1, True],
+        [[1, 8, 8, 8], 0, 20, False],
+        [[1, 8, 8, 8], 20, 0, False],
+        [[1, 8, 8, 8], 4, 3, True],
+        [[1, 8, 8, 8], 4, 5, False],
+        [[1, 8, 8, 8], 4, 9, False],
+        [[1, 8, 8, 8], 3, 3, True],
+        [[1, 8, 8, 8], 1, 1, True],
+        [[1, 8, 8, 8], 20, 2, False],
+        [[1, 8, 40, 8], 20, 2, True],
+        [[1, 8, 40, 8], 6, 3, True],
+        [[1, 8, 40, 8], 8, 1, True],
     ],
 )
-def test_constraint_stride_range(stride_w: int, stride_h: int, supported: bool):
+def test_constraint_stride_range(ifm_shape: list[int], stride_w: int, stride_h: int, supported: bool):
     # Stride width and height must lie within a certain range
-    op = testutil.create_op_with_quant_tensors(Op.Conv2DBias, [1, 8, 8, 8], [1, 8, 8, 8], [1, 1, 1, 1])
+    op = testutil.create_op_with_quant_tensors(Op.Conv2DBias, ifm_shape, [1, 8, 8, 8], [1, 1, 1, 1])
     op.attrs = {"stride_w": stride_w, "stride_h": stride_h}
     assert support.is_operator_supported(op) == supported
 
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 99ac24e..76383a4 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -73,6 +73,7 @@
 from .tensor import Tensor
 from .tensor import TensorPurpose
 from .tflite_mapping import optype_to_builtintype
+from .utils import calc_resize_factor
 
 passthrough_nodes = (Op.Identity,)
 
@@ -970,29 +971,6 @@
     if op.op_index != 0 and stride_x < 4:
         return op
 
-    def calc_resize_factor(ifm_width: int, stride_x: int) -> tuple[int, int]:
-        """Compute resize factor for strided Conv2D optimization"""
-        # Define strides that are supported by HW
-        hw_supported_strides = (2, 3)
-        resize_factor = stride_x
-
-        if ifm_width % resize_factor != 0:
-            # In case it is not divisible, check if the resize factor is
-            # divisible by any of the hw_supported_strides. If it is, re-compute
-            # the resize factor to be the value that leads us to
-            # reach a hw supported stride.
-            # E.g.: IFM width = 133, stride = 14, filter width = 7 can be
-            #       optimised to IFM width = 19, stride = 2, filter width = 7 using
-            #       a resize factor of 7. The final stride is 2 which is
-            #       supported by the hardware.
-            supported_final_strides = (x for x in hw_supported_strides if resize_factor % x == 0)
-            new_resize_factor = resize_factor // next(supported_final_strides, 1)
-            resize_factor = new_resize_factor if resize_factor != new_resize_factor else 1
-
-        optimised_stride = stride_x // resize_factor
-
-        return resize_factor, optimised_stride
-
     resize_factor, final_stride = calc_resize_factor(ifm_shape.width, stride_x)
 
     def calc_filter_padding(
@@ -1001,6 +979,7 @@
         post_op_stride: int,
         opt_resize_factor: int,
         filter_width: int,
+        ifm_width: int,
     ) -> tuple[int, int, int, int]:
         """Calculate zero padding to be added to the filter.
 
@@ -1018,6 +997,8 @@
             a stride of 2 after the optimization
         filter_width : int
             Width of the filter before optimization.
+        ifm_width : int
+            Width of the IFM before optimization.
 
         Returns
         -------
@@ -1027,15 +1008,40 @@
         padding_size = 0
         padding = (0, 0, 0, 0)
         if ifm_padding_type and ifm_padding_type != Padding.VALID:
-            padding_size = (ifm_current_padding_x + post_op_stride) * opt_resize_factor - filter_width
-            # Distribute padding between left and right side of the filter
-            padding_left = padding_size // 2
+            # Compute padding size for the filter that guarantees that HW padding added to IFM matches
+            # before and after the optimization is performed
+            expected_filter_size = 0
+            pre_opt_stride = post_op_stride * opt_resize_factor
+            post_opt_ifm_width = ifm_width // opt_resize_factor
+            # Compute the total expected filter size post optimization that ensures that the same HW padding
+            # is added to IFM.
+            # There are two ways of calculating required filter size depending on whether IFM width is divisible
+            # by stride width or not. These approaches match the cases used to calculate HW padding in
+            # needed_total_padding method.
+            if ifm_width % pre_opt_stride == 0:
+                expected_filter_size = ifm_current_padding_x + post_op_stride
+            else:
+                expected_filter_size = ifm_current_padding_x + (post_opt_ifm_width % post_op_stride)
+            # Compute padding size from expected filter size
+            padding_size = expected_filter_size * opt_resize_factor - filter_width
+
+            if ifm_current_padding_x == 0:
+                # If no HW padding is added to IFM, divide filter padding between left and right following
+                # the same strategy as the reference.
+                padding_left = padding_size // 2
+            else:
+                # If HW padding is added to IFM, split padding for the filter so that left padding and right padding
+                # are proportional to left and right HW padding.
+                left_hw_padding = ifm_current_padding_x // 2
+                # Compute filter padding
+                padding_left = padding_size // ifm_current_padding_x * left_hw_padding
             padding = (0, padding_left, 0, padding_size - padding_left)
 
         # Check if filter width is divisible by the stride width (required for optimization)
-        # If padding was already added above, the filter width is already divisible by
-        # resize factor, so this should be skipped.
-        if padding_size == 0 and filter_width % opt_resize_factor != 0:
+        # If filter width is not divisible by stride width and no HW padding is added to IFM, compute
+        # filter padding required for the filter width to be divisible by the stride width and apply it as right
+        # padding.
+        if filter_width % opt_resize_factor != 0 and (padding_size == 0 or ifm_current_padding_x == 0):
             padding_size = opt_resize_factor - (filter_width % opt_resize_factor)
             # Add padding zeros to the right
             padding = (0, 0, 0, padding_size)
@@ -1056,7 +1062,7 @@
         curr_padding_x = needed_total_padding(ifm_shape.width, stride_x, k_w)
         # Compute the padding needed on the filter for the optimisation
         _, left_filter_padding, _, right_filter_padding = calc_filter_padding(
-            padding_type, curr_padding_x, final_stride, resize_factor, k_w
+            padding_type, curr_padding_x, final_stride, resize_factor, k_w, ifm_shape.width
         )
         total_horizontal_padding = left_filter_padding + right_filter_padding
         # If IFM padding is enabled, check if pre-opt and post-opt padding is
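
The proportional split added above can be summarised by a small
standalone sketch (split_filter_padding is a hypothetical name used
only for illustration; it mirrors the padding_left computation in
calc_filter_padding):

    def split_filter_padding(padding_size: int, ifm_hw_padding: int) -> tuple[int, int]:
        """Split filter padding left/right in proportion to the IFM HW padding."""
        if ifm_hw_padding == 0:
            # No HW padding on the IFM: even split, as in the reference
            left = padding_size // 2
        else:
            # HW padding splits as (total // 2, rest); give the filter
            # padding the same left-hand share
            left_hw_padding = ifm_hw_padding // 2
            left = padding_size // ifm_hw_padding * left_hw_padding
        return left, padding_size - left

    assert split_filter_padding(12, 6) == (6, 6)  # symmetric HW padding -> symmetric split
    assert split_filter_padding(10, 5) == (4, 6)  # HW padding 2/3 -> filter padding 4/6
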
diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py
index 0dfdc66..25b6897 100644
--- a/ethosu/vela/tflite_supported_operators.py
+++ b/ethosu/vela/tflite_supported_operators.py
@@ -29,6 +29,7 @@
 from .tensor import check_quantized_tens_scaling_equal
 from .tflite_mapping import BUILTIN_OPERATOR_UNKNOWN
 from .tflite_mapping import optype_to_builtintype
+from .utils import calc_resize_factor
 
 
 def _optype_formatter(op_list):
@@ -545,11 +546,18 @@
 
     @staticmethod
     def constraint_conv_stride(op):
-        "Stride width must be greater than or equal to 1 and stride height must be between 1 and 3"
+        """Stride width must be greater than or equal to 1.
+        For stride widths greater than 3, the post-optimization stride needs to be less than or equal to 3.
+        Stride height must be between 1 and 3."""
         w, h = op.get_kernel_stride()
         stride_min = 1
         stride_max_h = 3
-        valid = (stride_min <= w) and (stride_min <= h <= stride_max_h)
+        ifm_width = op.ifm.shape[2]
+        _, optimized_stride = calc_resize_factor(ifm_width, w) if w > 1 else (1, w)
+        # Optimized stride indicates the final Conv2D stride width after all optimizations are performed
+        can_optimize_stride_width_gt_3 = optimized_stride <= 3
+        valid = (stride_min <= w) and (stride_min <= h <= stride_max_h) and can_optimize_stride_width_gt_3
+
         return valid, f"Op has stride WxH as: {w}x{h}"
 
     @staticmethod
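
As a hedged trace of the new check (numbers taken from one of the
updated test cases): for stride width 6 on an IFM of width 40,
calc_resize_factor picks a resize factor of 2, so the
post-optimization stride is 3 and the constraint still passes:

    _, optimized_stride = calc_resize_factor(40, 6)  # -> resize factor 2, stride 3
    assert optimized_stride <= 3  # can_optimize_stride_width_gt_3 holds
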
diff --git a/ethosu/vela/utils.py b/ethosu/vela/utils.py
index 6a36897..11c253c 100644
--- a/ethosu/vela/utils.py
+++ b/ethosu/vela/utils.py
@@ -84,3 +84,32 @@
         return
 
     print(f"{context_str}{message}")
+
+
+def calc_resize_factor(ifm_width: int, stride_x: int) -> tuple[int, int]:
+    """Compute resize factor for strided Conv2D optimization."""
+    # Define strides that are supported by HW
+    hw_supported_strides = (2, 3)
+    resize_factor = stride_x
+
+    if ifm_width % resize_factor != 0:
+        # In case it is not divisible, check if the resize factor is
+        # divisible by any of the hw_supported_strides. If it is, re-compute
+        # the resize factor to be the value that leads us to
+        # reach a hw supported stride. The IFM width needs to be divisible by the new stride.
+        # E.g.: IFM width = 133, stride = 14, filter width = 7 can be
+        #       optimised to IFM width = 19, stride = 2, filter width = 7 using
+        #       a resize factor of 7. The final stride is 2 which is
+        #       supported by the hardware.
+
+        # Filter strides that can be obtained from current stride
+        divisible_strides = (x for x in hw_supported_strides if resize_factor % x == 0)
+        # Remove strides that are not IFM width divisors
+        divisor_strides = (x for x in divisible_strides if ifm_width % (stride_x // x) == 0)
+        # Compute new resize factor based on chosen stride
+        new_resize_factor = resize_factor // next(divisor_strides, 1)
+        resize_factor = new_resize_factor if resize_factor != new_resize_factor else 1
+
+    optimised_stride = stride_x // resize_factor
+
+    return resize_factor, optimised_stride
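
A brief usage sketch of the relocated helper, reusing the example from
its own comment (IFM width 133, stride 14): the width divides by 7, so
the stride collapses to a hardware-supported 2.

    from ethosu.vela.utils import calc_resize_factor

    resize_factor, optimised_stride = calc_resize_factor(ifm_width=133, stride_x=14)
    assert (resize_factor, optimised_stride) == (7, 2)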