MLBEDSW-8201: [MLCE] Extended stride support for CONV_2D

- Added support for stride_h > 3 when ofm height is 1
- Added support for stride_w > 3 when ofm width is 1
- Updated constraints
- Updated tests
- Updated SUPPORTED_OPS.md

Change-Id: I8f89909b05a0f052df5f03702966cee50da61cfc
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
diff --git a/SUPPORTED_OPS.md b/SUPPORTED_OPS.md
index b22c539..81704e5 100644
--- a/SUPPORTED_OPS.md
+++ b/SUPPORTED_OPS.md
@@ -19,7 +19,7 @@
 # Supported Ops
 
 This file was automatically generated by Vela using the `--supported-ops-report` parameter.  
-Vela version: `3.9.1.dev14+g7e03323`
+Vela version: `3.9.1.dev16+gd230ce9.d20231030`
 
 This file complies with
 [**Gitiles Markdown syntax**](https://gerrit.googlesource.com/gitiles/+/HEAD/Documentation/markdown.md)
@@ -159,9 +159,11 @@
 - IFM depth must be a whole multiple of the filter kernel depth
 - Number of filter kernels must be equally divisible by the number of convolution groups
 - Dilation factor values for both width and height must be integer types
-- Stride width must be greater than or equal to 1.  
-        For stride widths greater than 3, the post-optimization stride needs to be less than or equal to 3.  
-        Stride height must be between 1 and 3.
+- Strides must fulfil the following criteria:  
+        - Stride h must be between 1 and 3 when ofm height is greater than 1  
+        - Stride w must be between 1 and 3 when ofm width is greater than 1 or  
+          stride w must be divisible by 2 or 3 and ifm width must be divisible  
+          by stride_w/2 or stride_w/3
 - Dilated kernel height must be in the range [1, 64]
 - Product of dilated kernel width and height must be in the range [1, 4096]
 - Weight tensor must be 8-bit
diff --git a/ethosu/vela/test/test_tflite_supported_operators.py b/ethosu/vela/test/test_tflite_supported_operators.py
index d6b9478..a433fb8 100644
--- a/ethosu/vela/test/test_tflite_supported_operators.py
+++ b/ethosu/vela/test/test_tflite_supported_operators.py
@@ -128,6 +128,11 @@
     op = testutil.create_op_with_quant_tensors(Op.Conv2DBias, ifm_shape, [1, 8, 8, 8], [1, 1, 1, 1])
     op.attrs = {"stride_w": stride_w, "stride_h": stride_h}
     assert support.is_operator_supported(op) == supported
+    if not supported and stride_w > 0 and stride_h > 0:
+        # The same unsupported strides must be accepted when ofm width and height are both 1
+        op = testutil.create_op_with_quant_tensors(Op.Conv2DBias, ifm_shape, [1, 1, 1, 8], [1, 1, 1, 1])
+        op.attrs = {"stride_w": stride_w, "stride_h": stride_h}
+        assert support.is_operator_supported(op)
 
 
 def test_constraint_dilated_height_range():
diff --git a/ethosu/vela/tflite_graph_optimiser.py b/ethosu/vela/tflite_graph_optimiser.py
index 794a6ec..ae11bec 100644
--- a/ethosu/vela/tflite_graph_optimiser.py
+++ b/ethosu/vela/tflite_graph_optimiser.py
@@ -1170,6 +1170,26 @@
         stride_x = final_stride
         op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
 
+    ofm_shape = op.ofm_shapes[0]
+    if ofm_shape.height == 1 or ofm_shape.width == 1:
+        # If the OFM height or width is 1, no striding is done in that direction (y or x), so the
+        # corresponding stride value can be set to 1. Before forcing the kernel stride to 1, make sure
+        # to calculate the correct padding, since it is based on the original kernel stride
+        padding, _ = calc_padding_and_skirt(
+            op.attrs["padding"],
+            op.kernel,
+            ifm_shape,
+            op.attrs.get("explicit_padding"),
+        )
+        # Use explicit padding so it is not recalculated later with the wrong kernel stride
+        op.attrs["padding"] = Padding.EXPLICIT
+        op.attrs["explicit_padding"] = padding
+
+        stride_y = 1 if ofm_shape.height == 1 else stride_y
+        stride_x = 1 if ofm_shape.width == 1 else stride_x
+
+        op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
+
     return op
 
 
diff --git a/ethosu/vela/tflite_supported_operators.py b/ethosu/vela/tflite_supported_operators.py
index 41862b6..14c2213 100644
--- a/ethosu/vela/tflite_supported_operators.py
+++ b/ethosu/vela/tflite_supported_operators.py
@@ -556,19 +556,31 @@
 
     @staticmethod
     def constraint_stride_width_no_upper_limit(op):
-        """Stride width must be greater than or equal to 1.
-        For stride widths greater than 3, the post-optimization stride needs to be less than or equal to 3.
-        Stride height must be between 1 and 3."""
-        w, h = op.get_kernel_stride()
+        """Strides must fulfil the following criteria:
+        - Stride h must be between 1 and 3 when ofm height is greater than 1
+        - Stride w must be between 1 and 3 when ofm width is greater than 1 or
+          stride w must be divisible by 2 or 3 and ifm width must be divisible
+          by stride_w/2 or stride_w/3"""
+
+        stride_w, stride_h = op.get_kernel_stride()
         stride_min = 1
         stride_max_h = 3
         ifm_width = op.ifm.shape[2]
-        _, optimized_stride = calc_resize_factor(ifm_width, w) if w > 1 else (1, w)
+        ofm_height = op.ofm.shape[1]
+        ofm_width = op.ofm.shape[2]
+
+        stride_h_valid = ofm_height == 1 or stride_min <= stride_h <= stride_max_h
+
+        _, optimized_stride = calc_resize_factor(ifm_width, stride_w) if stride_w > 1 else (1, stride_w)
         # Optimized stride indicates the final Conv2D stride width after all optimizations are performed
         can_optimize_stride_width_gt_3 = optimized_stride <= 3
-        valid = (stride_min <= w) and (stride_min <= h <= stride_max_h) and can_optimize_stride_width_gt_3
 
-        return valid, f"Op has stride WxH as: {w}x{h}"
+        stride_w_valid = ofm_width == 1 or ((stride_min <= stride_w) and can_optimize_stride_width_gt_3)
+
+        return (
+            stride_h_valid and stride_w_valid,
+            f"Op has stride WxH as: {stride_w}x{stride_h}, ifm shape as: {op.ifm.shape}, ofm shape as: {op.ofm.shape}",
+        )
 
     @staticmethod
     def constraint_stride_range_no_padding(op):
diff --git a/ethosu/vela/utils.py b/ethosu/vela/utils.py
index 11c253c..ee501f3 100644
--- a/ethosu/vela/utils.py
+++ b/ethosu/vela/utils.py
@@ -96,7 +96,7 @@
         # In case it is not divisible, check if the resize factor is
         # divisible by any of the hw_supported_strides. If it is, re-compute
         # the resize factor to be the value that leads us to
-        # reach a hw supported stride. The IFM width needs to be divisible by the new stride.
+        # reach a hw supported stride. The IFM width needs to be divisible by the new resize factor.
         # E.g.: IFM width = 133, stride = 14, filter width = 7 can be
         #       optimised to IFM width = 19, stride = 2, filter width = 7 using
         #       a resize factor of 7. The final stride is 2 which is