MLBEDSW-5534: Enet_640_640_int8 output diff

The output diff is caused by the kernel dilation not being included when
calculating the bottom padding to be used on the last h_stripe. This
only shows up when using dedicated_sram, since shared_sram does not
split into multiple h_stripes and thus uses the padding specified by the
skirt instead.

Signed-off-by: Rickard Bolin <rickard.bolin@arm.com>
Change-Id: I7f643748b153004d65be2124c0ac6c9d21cd803f
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index cf31aa5..7e60221 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -40,9 +40,9 @@
         ifm_shape: Shape4D,
         npu_block_type: NpuBlockType,
         concat_offsets: List[int],
+        k_dilated_height: int,
         split_offset: Shape4D = None,
         split_shape: Shape4D = None,
-        k_height: int = 1,
         upscaling_factor: int = 1,
     ):
         new_start_coord = list(self.start_coord)
@@ -105,7 +105,9 @@
                         pad_bottom = original_end_coord[-3] - (ifm_shape.height * upscaling_factor)
                     else:
                         k_start = new_start_coord[-3] - pad_top
-                        pad_bottom = max(0, k_start + total_stride + k_height - (ifm_shape.height * upscaling_factor))
+                        pad_bottom = max(
+                            0, k_start + total_stride + k_dilated_height - (ifm_shape.height * upscaling_factor)
+                        )
 
                 # Adjust for upscaling
                 new_start_coord[-3] = max(new_start_coord[-3] // upscaling_factor, 0)
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 3d0a1e5..f0d7409 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -82,7 +82,7 @@
     elif sched_op.op_type == Op.ResizeBilinear:
         upscaling = round_up_divide(ofm_shape.height, ifm.shape.height)
 
-    # Get Kernel height
+    # Get kernel height and height dilation
     k_height = 1
     if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
         if parent_op is not None:
@@ -91,6 +91,11 @@
         if uncomp_weight_tensor is not None:
             k_height = uncomp_weight_tensor.shape[0]
 
+    k_height_dilation = parent_op.attrs.get("dilation", (_, 1, _, _))[-3]
+
+    # Calculate dilated kernel height
+    k_dilated_height = k_height_dilation * (k_height - 1) + 1
+
     # Define Start and End coordinates for the OFM
     ofm_start = Shape4D(0, 0, 0, op_info.ofm_depth_slices[0])
     ofm_end = ofm_shape
@@ -150,9 +155,9 @@
                         ifm.shape,
                         npu_block_type,
                         write_offset.as_list(),
+                        k_dilated_height,
                         read_offsets[0],
                         read_shapes[0],
-                        k_height,
                         upscaling,
                     )
 
@@ -164,9 +169,9 @@
                         ifm2.shape,
                         npu_block_type,
                         write_offset.as_list(),
+                        k_dilated_height,
                         read_offsets[1],
                         read_shapes[1],
-                        k_height,
                         upscaling,
                     )