MLBEDSW-2420: Improved support for dilated convolution

- Dilation factors added to the SET_KERNEL_STRIDE instruction (see the
  sketch below)
- Kernel height/width registers adjusted to the dilated kernel size
- Padding and skirt calculation updated to use the dilated kernel size
- Weight compression updated to account for dilation
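
A minimal sketch of the dilation arithmetic this change relies on. The
helper names below (dilated_extent, pack_kernel_stride_dilation) are
hypothetical, for illustration only; they do not exist in the Vela code
base:

    # Hypothetical helpers illustrating the arithmetic in this patch.

    def dilated_extent(kernel_size: int, dilation: int) -> int:
        # Effective spatial extent of a dilated kernel. Used for the
        # padding/skirt calculation; the KERNEL_HEIGHT/WIDTH_M1 registers
        # are programmed with this value minus 1.
        return dilation * (kernel_size - 1) + 1

    def pack_kernel_stride_dilation(stride: int, dilation_x: int, dilation_y: int) -> int:
        # Dilation is encoded as (factor - 1) in bits 3 (x) and 4 (y)
        # of the SET_KERNEL_STRIDE payload.
        stride |= (dilation_x - 1) << 3
        stride |= (dilation_y - 1) << 4
        return stride

    # Example: a 3x3 kernel with dilation 2 behaves like a 5x5 kernel,
    # so SAME padding pads (5 - 1) // 2 = 2 on each side.
    assert dilated_extent(3, 2) == 5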

Change-Id: I0c8190223e223b039a305aba0f37896ae1de2b80
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 758b51a..b004f4c 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -292,7 +292,9 @@
         else:
             raise UnsupportedFeatureError("Unknown operation that uses padding: {}".format(op.type))
 
-        padding, skirt = calc_padding_and_skirt(op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape)
+        dilation_h, dilation_w = op.get_dilation_h_w()
+        dilated_kernel_size = [dilation_h * (kernel_size[0] - 1) + 1, dilation_w * (kernel_size[1] - 1) + 1]
+        padding, skirt = calc_padding_and_skirt(op.attrs["padding"], dilated_kernel_size, op.attrs["strides"], input_shape)
         op.attrs["explicit_padding"] = padding
         op.attrs["skirt"] = skirt
     return op
diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
index cd70446..bf7bc45 100644
--- a/ethosu/vela/mark_tensors.py
+++ b/ethosu/vela/mark_tensors.py
@@ -17,7 +17,6 @@
 # Mark purpose and select formats for Tensors. Also compresses the weights.
 from . import rewrite_graph
 from . import weight_compressor
-from .operation import NpuBlockType
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 
@@ -319,14 +318,6 @@
             assert 0, "unknown tensor purpose %s" % (tens.purpose,)
         return fmt
 
-    def find_npu_usage_of_tensor(tens):
-        for op in tens.consumers():
-            if op.type == "DMA":
-                return find_npu_usage_of_tensor(op.outputs[0])
-            if "npu_block_type" in op.attrs:
-                return op.attrs["npu_block_type"]
-            return NpuBlockType.Default
-
     def visit_tens(tens, ps):
         if tens not in formats_for_tensor:
             fmt = init_tens(tens)
@@ -349,8 +340,9 @@
         if fmt == TensorFormat.WeightsCompressed and tens.values is not None:
             src_tens = tens.get_dma_src_tensor()
             if src_tens is not None:
-                npu_block_type = find_npu_usage_of_tensor(tens)
-                weight_compressor.compress_weights(arch, nng, tens, npu_block_type, 32, 32)
+                op = tens.find_npu_op()
+                npu_block_type = op.attrs["npu_block_type"]
+                weight_compressor.compress_weights(arch, nng, tens, npu_block_type, 32, 32, op.get_dilation_h_w())
                 # Alias compressed weights back into source tensor
                 src_tens.copy_compressed_weight_info(tens)
 
diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py
index 338f962..e8a03b7 100644
--- a/ethosu/vela/operation.py
+++ b/ethosu/vela/operation.py
@@ -194,6 +194,10 @@
 
         return inputs, axis
 
+    def get_dilation_h_w(self):
+        _, dilation_h, dilation_w, _ = self.attrs.get("dilation", (1, 1, 1, 1))
+        return dilation_h, dilation_w
+
     split_ops = set(("Split", "SplitV", "StridedSlice", "Slice", "UnpackReshaped"))
 
     def is_split_op(self):
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index da7458e..3da8bbc 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -42,11 +42,11 @@
 from .high_level_command_stream import CommandType
 from .numeric_util import clamp_sigmoid
 from .numeric_util import clamp_tanh
+from .numeric_util import full_shape
 from .numeric_util import quantise_float32
 from .numeric_util import round_away_zero
 from .numeric_util import round_up
 from .numeric_util import round_up_to_int
-from .numeric_util import full_shape
 from .operation import NpuBlockType
 from .shared_buffer_allocation import SharedBufferAllocation
 from .tensor import MemArea
@@ -274,7 +274,7 @@
         if prev_cmd.ofm_tensor.equivalence_id == cmd.ifm_tensor.equivalence_id:
             return True
         elif cmd.ifm2_tensor is not None:
-            return (prev_cmd.ofm_tensor.equivalence_id == cmd.ifm2_tensor.equivalence_id)
+            return prev_cmd.ofm_tensor.equivalence_id == cmd.ifm2_tensor.equivalence_id
     return False
 
 
@@ -414,7 +414,7 @@
             use_global_scale = False
             # Specifies type of rounding to be used.
             rounding_mode = rounding.TFL
-            if primary_op.type == 'ResizeBilinear':
+            if primary_op.type == "ResizeBilinear":
                 rounding_mode = rounding.TRUNCATE
             fmf = primary_op.attrs.get("fused_memory_function", None)
             faf = primary_op.attrs.get("fused_activation_function", None)
@@ -428,6 +428,7 @@
             prev_ofm_rect = cur_ofm_rect
             prev_ofm_block = cur_ofm_block
             prev_kernel = cur_kernel
+            cur_kernel = get_op_kernel(ps)
 
             block_config = ps.block_config
             emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config[0] - 1)
@@ -552,7 +553,7 @@
 
             emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
 
-            if primary_op.type == 'ResizeBilinear':
+            if primary_op.type == "ResizeBilinear":
                 # perform nearest neighbor upscale
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, 1)
             else:
@@ -575,7 +576,6 @@
                     explicit_padding[1] = 0
                 if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
                     explicit_padding[3] = 0
-
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, explicit_padding[0])
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, explicit_padding[1])
                 emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, explicit_padding[2])
@@ -590,7 +590,6 @@
                 # set kernel y stride extension bits
                 stride |= (primary_op.attrs["strides"][1] - 1 >> 1) << 9
 
-
                 if npu_block_type == NpuBlockType.Pooling:
                     k_height, k_width = primary_op.attrs["ksize"][1:3]
                     emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, k_height - 1)
@@ -641,8 +640,14 @@
                     # Reduced precision quantization and natural rounding used for int16
                     if cmd.ifm_tensor.dtype == DataType.int16:
                         rounding_mode = rounding.NATURAL
-                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, cmd.weight_tensor.shape[0] - 1)
-                    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, cmd.weight_tensor.shape[1] - 1)
+                    stride |= (cur_kernel.dilation.y - 1) << 4
+                    stride |= (cur_kernel.dilation.x - 1) << 3
+                    emit.cmd0_with_param(
+                        cmd0.NPU_SET_KERNEL_HEIGHT_M1, cur_kernel.dilation.y * (cmd.weight_tensor.shape[0] - 1)
+                    )
+                    emit.cmd0_with_param(
+                        cmd0.NPU_SET_KERNEL_WIDTH_M1, cur_kernel.dilation.x * (cmd.weight_tensor.shape[1] - 1)
+                    )
                     if cmd.weight_tensor.block_traversal == TensorBlockTraversal.PartKernelFirst:
                         # Part-kernel-first weight ordering
                         assert npu_block_type == NpuBlockType.ConvolutionMxN
@@ -934,7 +939,6 @@
             cur_ofm_block = Block(ps.block_config[1], ps.block_config[0], ps.block_config[3])
             cur_ofm_rect = get_op_ofm_rect(cmd)
             cur_ifm_rect = get_op_ifm_rect(cmd)
-            cur_kernel = get_op_kernel(cmd.ps)
             cur_padLT = get_op_padding_lt(cmd)
             if (prev_kernel is not None) and (cur_kernel is not None) and has_prev_op_dependency(prev_cmd, cmd):
                 if cmd.ifm_tensor.shape == prev_cmd.ofm_tensor.shape:
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 2f91f61..426a710 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -521,7 +521,7 @@
             strides[4] = stride
             strides[3] = 16 * stride  # STRIDE_X
             strides[1] = strides[3] * augmented_shape[2]  # STRIDE_C
-            strides[2] = augmented_shape[2] * augmented_shape[3] * stride # STRIDE_Y
+            strides[2] = augmented_shape[2] * augmented_shape[3] * stride  # STRIDE_Y
             strides[0] = strides[2] * augmented_shape[1]  # STRIDE_N
 
         return strides, augmented_coord
@@ -539,6 +539,15 @@
         # Note: for DMA ops, Pass.weight_tensor is referring to the SRAM weight tensor
         return self.ops[0].inputs[0] if self.needs_dma() else None
 
+    def find_npu_op(self):
+        # Returns the NPU operator that uses this tensor, excluding DMA operators.
+        for op in self.consumers():
+            if op.type == "DMA":
+                return op.outputs[0].find_npu_op()
+            if "npu_block_type" in op.attrs:
+                return op
+            return None
+
     def compressed_stream_index_from_coord(self, coord):
         assert self.format == TensorFormat.WeightsCompressed
         assert len(self.compressed_values) > 0
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index 450e091..9edde60 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -19,7 +19,6 @@
 from collections import namedtuple
 
 import numpy as np
-from ethosu import mlw_codec
 
 from .data_type import DataType
 from .errors import UnsupportedFeatureError
@@ -32,20 +31,21 @@
 from .tensor import TensorFormat
 from .tensor import TensorPurpose
 from .tensor import TensorSubPurpose
+from ethosu import mlw_codec
 
 
 # Contains meta info for a weight compression. If two tensors have identical weight compression config,
 # then they also will have identical compressed weights.
 WeightCompressionConfig = namedtuple(
-    "WeightCompressionConfig", ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "equivalence_id"]
+    "WeightCompressionConfig", ["npu_block_type", "ofm_block_depth", "ofm_depth_step", "dilation", "equivalence_id"]
 )
 
 
-def create_weight_compression_config(tens, npu_block_type, ofm_block_depth, ofm_depth_step):
+def create_weight_compression_config(tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
     # Note: for an ofm block only its depth is used in weight compression.
     # And block depth > ofm depth gives same result as block depth == ofm depth
     block_depth = min(ofm_block_depth, tens.quant_values.shape[-1])
-    return WeightCompressionConfig(npu_block_type, block_depth, ofm_depth_step, tens.equivalence_id)
+    return WeightCompressionConfig(npu_block_type, block_depth, ofm_depth_step, dilation, tens.equivalence_id)
 
 
 def set_storage_shape(tens):
@@ -90,10 +90,11 @@
     return compressed
 
 
-def generate_brick(arch, brick_weights, ofm_block_depth, block_traversal, ifm_bitdepth):
+def generate_brick(arch, brick_weights, ofm_block_depth, block_traversal, ifm_bitdepth, dilation):
     is_depthwise = block_traversal == TensorBlockTraversal.DepthWise
     is_partkernel = block_traversal == TensorBlockTraversal.PartKernelFirst
-    subkernel_max = arch.subkernel_max
+    decomp_h = arch.subkernel_max.height // dilation[0]
+    decomp_w = arch.subkernel_max.width // dilation[1]
     ofm_ublock = arch.ofm_ublock
     ifm_ublock = arch.ifm_ublock
     # Expect weights formatted HWIO
@@ -125,11 +126,11 @@
                 )
             # Weight decomposition
             # Subkernel Splitting  (H)
-            for subkernel_y in range(0, kernel_height, subkernel_max.height):
-                sub_height = min(kernel_height - subkernel_y, subkernel_max.height)
+            for subkernel_y in range(0, kernel_height, decomp_h):
+                sub_height = min(kernel_height - subkernel_y, decomp_h)
                 # Subkernel splitting (W)
-                for subkernel_x in range(0, kernel_width, subkernel_max.width):
-                    sub_width = min(kernel_width - subkernel_x, subkernel_max.width)
+                for subkernel_x in range(0, kernel_width, decomp_w):
+                    sub_width = min(kernel_width - subkernel_x, decomp_w)
                     subkernel_elements = sub_width * sub_height
                     # Part kernel first works across the kernel H/W and needs padding
                     if is_partkernel:
@@ -178,14 +179,14 @@
 
 
 # Compress the weights
-def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth_step):
+def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
     assert tens.purpose == TensorPurpose.Weights
     assert tens.format == TensorFormat.WeightsCompressed
 
     # Check the weight cache
     if nng.weight_cache is None:
         nng.weight_cache = CompressedWeightCache()
-    wcc = create_weight_compression_config(tens, npu_block_type, ofm_block_depth, ofm_depth_step)
+    wcc = create_weight_compression_config(tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation)
     tens.weight_compression_config = wcc
     tens_cached = nng.weight_cache.get_tensor_with_same_compression(wcc)
     if tens_cached is not None:
@@ -241,7 +242,7 @@
         brick_weights = weights[:, :, :, idx : idx + count]
 
         # Encode all weights into one chunk
-        raw_stream = generate_brick(arch, brick_weights, ofm_block_depth, tens.block_traversal, ifm_bitdepth)
+        raw_stream = generate_brick(arch, brick_weights, ofm_block_depth, tens.block_traversal, ifm_bitdepth, dilation)
         encoded = encode(raw_stream)
         encoded_streams.append(encoded)
 
@@ -387,7 +388,8 @@
         for ps in sg.passes:
             tens = ps.weight_tensor
             if tens is not None:
-                npu_usage_of_tensor = find_npu_usage_of_tensor(tens)
+                op = tens.find_npu_op()
+                npu_usage_of_tensor = op.attrs["npu_block_type"]
                 if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise:
                     tens.quant_values = np.transpose(tens.quant_values, (0, 1, 3, 2))
                     tens.shape = tens.storage_shape = tens.bandwidth_shape = list(tens.quant_values.shape)
@@ -399,7 +401,7 @@
                 else:
                     ofm_depth_step = tens.shape[-1]
                 compress_weights(
-                    arch, nng, tens, npu_usage_of_tensor, ps.block_config[-1], ofm_depth_step,
+                    arch, nng, tens, npu_usage_of_tensor, ps.block_config[-1], ofm_depth_step, op.get_dilation_h_w()
                 )
                 # Update source tensor
                 if needs_dma:
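
For context on the generate_brick() change above: the subkernel
decomposition step is now divided by the dilation factor, so that the
dilated extent of each subkernel still fits within the hardware's
subkernel limit. A standalone sketch of that loop structure, assuming a
hypothetical 8x8 subkernel limit (split_subkernels is an illustrative
name, not a Vela function):

    # Stand-ins for arch.subkernel_max; 8x8 is an assumed limit here.
    SUBKERNEL_MAX_H, SUBKERNEL_MAX_W = 8, 8

    def split_subkernels(kernel_height, kernel_width, dilation_h, dilation_w):
        # Shrink the decomposition step by the dilation factor so each
        # subkernel's dilated extent stays within the hardware maximum.
        decomp_h = SUBKERNEL_MAX_H // dilation_h
        decomp_w = SUBKERNEL_MAX_W // dilation_w
        for y in range(0, kernel_height, decomp_h):
            sub_h = min(kernel_height - y, decomp_h)
            for x in range(0, kernel_width, decomp_w):
                sub_w = min(kernel_width - x, decomp_w)
                yield (y, x, sub_h, sub_w)

    # A 9x9 kernel with dilation 2 is split with step 4, producing
    # subkernels no larger than 4x4.
    print(list(split_subkernels(9, 9, 2, 2)))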