MLBEDSW-8117: Incorrect stride check for IFM/IFM2 and OFM

The constraint check for the IFM/IFM2/OFM strides was coded
according to an incorrect version of the specification.

Changed the check to verify that the NHCWB16 strides are a
multiple of 16 bytes. Also changed the wording in the exception
message to clarify whether it is a stride or a length that
violates the constraint.

The test case had two stride settings violating the constraint.
After this change one of them still fails the check, so the
tests are unchanged except for comments clarifying what is
being tested.

Change-Id: I93815d8bb08303b5f747c947c0bbd461b12895e3
Signed-off-by: Björn Davidsson <bjoern.davidsson@arm.com>
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 56aae73..42ae99d 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -81,7 +81,7 @@
 from .register_command_stream_util import check_addresses
 from .register_command_stream_util import check_alignment
 from .register_command_stream_util import check_dma_op
-from .register_command_stream_util import check_size
+from .register_command_stream_util import check_length
 from .register_command_stream_util import check_strides
 from .register_command_stream_util import get_dma_memory_accesses
 from .register_command_stream_util import get_op_memory_accesses
@@ -526,7 +526,7 @@
     ):
         if core < len(weights):
             check_alignment(weights[core].address, 16)
-            check_size(weights[core].length, 16)
+            check_length(weights[core].length, 16)
             emit.cmd1_with_address(addr, weights[core].address)
             emit.cmd1_with_offset(length, weights[core].length)
         elif core < arch.ncores:
@@ -546,7 +546,7 @@
     ):
         if core < len(biases):
             emit.cmd1_with_address(addr, biases[core].address)
-            check_size(biases[core].length, 16)
+            check_length(biases[core].length, 16)
             emit.cmd1_with_offset(length, biases[core].length)
         elif core < arch.ncores:
             emit.cmd1_with_address(addr, biases[0].address)
diff --git a/ethosu/vela/register_command_stream_util.py b/ethosu/vela/register_command_stream_util.py
index c7050a3..74c4f90 100644
--- a/ethosu/vela/register_command_stream_util.py
+++ b/ethosu/vela/register_command_stream_util.py
@@ -60,10 +60,18 @@
         raise ByteAlignmentError(f"Cmd1 payload of size: {payload} Bytes is not {required_alignment}-byte aligned")
 
 
-def check_size(payload, required_multiple):
+def check_size(payload, required_multiple, value_type):
     # assuming payload is defined in bytes
     if payload % required_multiple != 0:
-        raise ByteSizeError(f"Cmd1 payload of size: {payload} Bytes is not a multiple of {required_multiple}")
+        raise ByteSizeError(f"Cmd1 {value_type} of size: {payload} Bytes is not a multiple of {required_multiple}")
+
+
+def check_stride(stride, required_multiple):
+    check_size(stride, required_multiple, "stride")
+
+
+def check_length(length, required_multiple):
+    check_size(length, required_multiple, "length")
 
 
 def to_npu_kernel(kernel: Kernel) -> NpuKernel:
@@ -263,12 +271,12 @@
 
     if fm.layout == NpuLayout.NHCWB16:
         strides_to_check = [strides.depth, strides.height]
-        required_multiple = 16 * element_size_in_bytes
+        required_multiple = 16
     else:
         strides_to_check = [strides.height, strides.width]
         required_multiple = element_size_in_bytes
     for stride in strides_to_check:
-        check_size(stride, required_multiple)
+        check_stride(stride, required_multiple)
 
 
 def check_addresses(addresses: List[int], layout: NpuLayout, element_size, arch: ArchitectureFeatures):
@@ -384,11 +392,11 @@
             check_alignment(dma_op.src.address, 16)
         if dma_op.dest.region == BASE_PTR_INDEX_MEM2MEM:
             check_alignment(dma_op.dest.address, 16)
-            check_size(dma_op.src.length, 16)
+            check_length(dma_op.src.length, 16)
     else:
         check_alignment(dma_op.src.address, 16)
         check_alignment(dma_op.dest.address, 16)
-        check_size(dma_op.src.length, 16)
+        check_length(dma_op.src.length, 16)
 
 
 # -------------------------------------------------------------------
diff --git a/ethosu/vela/test/extapi/test_extapi_generate_commands.py b/ethosu/vela/test/extapi/test_extapi_generate_commands.py
index b21aae3..92f6c79 100644
--- a/ethosu/vela/test/extapi/test_extapi_generate_commands.py
+++ b/ethosu/vela/test/extapi/test_extapi_generate_commands.py
@@ -861,11 +861,11 @@
     op.block_traversal = NpuBlockTraversal.PART_KERNEL_FIRST
     op.block_config = NpuShape3D(height=16, width=4, depth=16)
 
-    # NHWC depth stride not a multiple of 32 passes
+    # NHWC height stride not a multiple of 16 passes
     op.ifm.strides = NpuShape3D(depth=16, height=2, width=16)
     npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U65_256)
 
-    # Same depth stride fails for NHCWB16
+    # Same height stride fails for NHCWB16
     op.ifm = create_feature_map(
         ifm_shape,
         1,