MLBEDSW-3502: Add address checks

Added checks during command stream generation to make sure
that address boundaries are respected.

Change-Id: I4dbc693b42d54e35c8fcc785e8be88059e409eec
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index b9c3409..168d0e6 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -246,6 +246,8 @@
         self.memory_bandwidths_per_cycle = self.axi_port_width * self.memory_clock_scales / 8
 
         self.memory_bandwidths_per_second = self.memory_bandwidths_per_cycle * self.core_clock
+        # Max value in address offsets
+        self.max_address_offset = 1 << 48 if self.is_ethos_u65_system else 1 << 32
 
         # Get output/activation performance numbers
         self._generate_output_perf_tables(self.accelerator_config)
@@ -456,6 +458,13 @@
             self._mem_port_mapping(self.cache_mem_area) == MemArea.Sram and self.cache_mem_area != self.arena_mem_area
         )
 
+    def mem_type_size(self, mem_type: MemType) -> int:
+        """Gives the number of bytes that can be addressed for the given memory type"""
+        # Only scratch fast memory with spilling enabled has a known size (the SRAM size);
+        # in every other case the size is unknown and the max possible address offset is used
+        uses_sram = mem_type == MemType.Scratch_fast and self.is_spilling_enabled()
+        return self.sram_size if uses_sram else self.max_address_offset
+
     def _mem_port_mapping(self, mem_port):
         mem_port_mapping = {MemPort.Axi0: self.axi0_port, MemPort.Axi1: self.axi1_port}
         return mem_port_mapping[mem_port]
diff --git a/ethosu/vela/errors.py b/ethosu/vela/errors.py
index 04468c9..918ca0a 100644
--- a/ethosu/vela/errors.py
+++ b/ethosu/vela/errors.py
@@ -22,6 +22,7 @@
 
     def __init__(self, data):
         self.data = f"Error! {data}"
+        self.error_msg = data
 
     def __str__(self):
         return repr(self.data)
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 56c5e74..c56eb04 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -17,6 +17,7 @@
 # Description:
 # Conversion from high level command to NpuOperation
 from enum import IntEnum
+from typing import Dict
 from typing import List
 from typing import Optional
 
@@ -157,7 +158,7 @@
     return NpuPadding(top=top, left=left, bottom=bottom, right=right)
 
 
-def get_region(tens: Tensor, arch: ArchitectureFeatures) -> int:
+def get_region(mem_type: MemType, arch: ArchitectureFeatures) -> int:
     base_ptr_idx_map = {
         MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
         MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
@@ -169,7 +170,16 @@
     else:
         base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchTensor
 
-    return base_ptr_idx_map[tens.mem_type].value
+    return base_ptr_idx_map[mem_type].value
+
+
+def get_mem_limits_for_regions(arch: ArchitectureFeatures) -> Dict[int, int]:
+    """Builds the map region -> max size of that region in bytes"""
+    # Memory types that share a region overwrite each other; the last type iterated wins
+    limits = {get_region(mem_type, arch): arch.mem_type_size(mem_type) for mem_type in MemType.all()}
+    # The mem2mem region (used as destination of LUT DMA) is limited by the SHRAM size
+    limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
+    return limits
 
 
 def get_upscale(op: Operation) -> NpuResamplingMode:
@@ -238,7 +248,7 @@
 def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_shape4D: Shape4D) -> NpuFeatureMap:
     """Creates feature map with common fields populated"""
     fm = NpuFeatureMap()
-    fm.region = get_region(tens, arch)
+    fm.region = get_region(tens.mem_type, arch)
     fm.data_type = dtype_map[tens.dtype]
     if tens.format == TensorFormat.NHWC:
         fm.layout = NpuLayout.NHWC
@@ -270,7 +280,7 @@
     # Extract weight substream offsets and calculate their lengths
     assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
     weight_addr = weight_tensor.address_for_coordinate(weight_box.start_coord)
-    region = get_region(weight_tensor, arch)
+    region = get_region(weight_tensor.mem_type, arch)
     for core in range(substreams):
         address = weight_addr + weight_substream_offsets[core]
         length = weight_substream_offsets[core + 1] - weight_substream_offsets[core]
@@ -292,7 +302,7 @@
     assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
     scale_addr = scale_tensor.address_for_coordinate(weight_box.start_coord[-1:])
 
-    region = get_region(scale_tensor, arch)
+    region = get_region(scale_tensor.mem_type, arch)
     for core in range(substreams):
         address = scale_addr + scale_substream_offsets[core]
         length = scale_substream_offsets[core + 1] - scale_substream_offsets[core]
@@ -447,11 +457,11 @@
 
 def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
     """Converts the command to NpuDmaOperation"""
-    src_region = get_region(cmd.in_tensor, arch)
+    src_region = get_region(cmd.in_tensor.mem_type, arch)
     if cmd.out_tensor.purpose == TensorPurpose.LUT:
         dest_region = BASE_PTR_INDEX_MEM2MEM
     else:
-        dest_region = get_region(cmd.out_tensor, arch)
+        dest_region = get_region(cmd.out_tensor.mem_type, arch)
 
     start_coord = cmd.box.start_coord
     src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
@@ -502,6 +512,7 @@
             npu_op = convert_command_to_npu_op(cmd, arch)
             npu_op_list.append(npu_op)
             npu_op_to_cmd[npu_op] = cmd
+    mem_limits = get_mem_limits_for_regions(arch)
     # Generate register commands
     if len(sg.high_level_command_stream) > 0:
         stream_id = DebugDatabase.add_stream(sg)
@@ -513,4 +524,6 @@
                 cmd = npu_op_to_cmd[npu_op]
                 DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)
 
-        sg.register_command_stream = generate_command_stream(npu_op_list, arch, verbose, add_to_debug_db, npu_op_to_cmd)
+        sg.register_command_stream = generate_command_stream(
+            npu_op_list, arch, verbose, mem_limits, add_to_debug_db, npu_op_to_cmd
+        )
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index f925369..a4466c9 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -72,6 +72,7 @@
 from .numeric_util import round_up_to_int
 from .operation import NpuBlockType
 from .range_set import MemoryAccessSet
+from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
 from .register_command_stream_util import calc_blockdep
 from .register_command_stream_util import get_dma_memory_accesses
 from .register_command_stream_util import get_op_memory_accesses
@@ -84,6 +85,7 @@
 from .shared_buffer_allocation import find_suitable_block_configs
 from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
 from .shared_buffer_allocation import SharedBufferAllocation
+from ethosu.vela.errors import VelaError
 
 
 class RegisterMachine:
@@ -265,6 +267,21 @@
 }
 
 
+def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
+    """Raises VelaError if an access uses an unknown region or an offset outside [0, mem_limits[region]]"""
+    for mem_access in memory_accesses.accesses:
+        for region, range_set in mem_access.regions.items():
+            if region not in mem_limits:
+                raise VelaError(f"Invalid region: {region}")
+            limit = mem_limits[region]  # named "limit" so the builtin max() is not shadowed
+            for start, end in range_set.ranges:
+                for offset in (start, end):
+                    if offset < 0:
+                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
+                    if offset > limit:
+                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {limit}")
+
+
 def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
     """Quantizes the given value"""
     scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
@@ -904,7 +921,12 @@
 
 
 def generate_command_stream(
-    npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, verbose: bool, add_to_debug_db=None, npu_op_to_cmd=None
+    npu_op_list: List[NpuOperation],
+    arch: ArchitectureFeatures,
+    verbose: bool,
+    mem_limits: Dict[int, int],
+    add_to_debug_db=None,
+    npu_op_to_cmd=None,
 ) -> List[int]:
     """
     Generates register commands for the given list of NPU operations.
@@ -922,14 +944,20 @@
             memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
         else:
             assert 0, "Invalid operation type"
+
     if arch.is_ethos_u65_system:
         emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
     dep_watermark = Watermark(0, 0)
     prev_op = None
     # Generate register commands for all operations
     for op_index, npu_op in enumerate(npu_op_list):
-        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
-        generate_registers_for_op(emit, npu_op, arch)
+        try:
+            check_mem_limits(memory_accesses[npu_op], mem_limits)
+            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
+            generate_registers_for_op(emit, npu_op, arch)
+        except VelaError as e:
+            # Add operation info and rethrow
+            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
         if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
             # Generate BLOCKDEP
             blockdep = calc_blockdep(arch, prev_op, npu_op)
@@ -987,4 +1015,8 @@
     """
     accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
     arch = create_default_arch(accelerator)
-    return generate_command_stream(npu_op_list, arch, verbose=False)
+    mem_limits = dict()
+    for region in range(0, 8):
+        mem_limits[region] = arch.max_address_offset
+    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
+    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)
diff --git a/ethosu/vela/test/extapi/test_extapi_generate_commands.py b/ethosu/vela/test/extapi/test_extapi_generate_commands.py
index b605dfc..db0485c 100644
--- a/ethosu/vela/test/extapi/test_extapi_generate_commands.py
+++ b/ethosu/vela/test/extapi/test_extapi_generate_commands.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -16,6 +16,8 @@
 #
 # Description:
 # Contains unit tests for npu_generate_register_command_stream API for an external consumer
+import pytest
+
 from ethosu.vela.api import npu_find_block_configs
 from ethosu.vela.api import npu_generate_register_command_stream
 from ethosu.vela.api import NpuAccelerator
@@ -38,9 +40,15 @@
 from ethosu.vela.api import NpuQuantization
 from ethosu.vela.api import NpuShape3D
 from ethosu.vela.api import NpuTileBox
+from ethosu.vela.architecture_features import Accelerator
+from ethosu.vela.architecture_features import create_default_arch
+from ethosu.vela.errors import VelaError
 from ethosu.vela.ethos_u55_regs.ethos_u55_regs import cmd0
 from ethosu.vela.ethos_u55_regs.ethos_u55_regs import cmd1
+from ethosu.vela.high_level_command_to_npu_op import BasePointerIndex
+from ethosu.vela.high_level_command_to_npu_op import get_mem_limits_for_regions
 from ethosu.vela.register_command_stream_generator import CmdMode
+from ethosu.vela.register_command_stream_generator import generate_command_stream
 from ethosu.vela.register_command_stream_util import get_address_ranges
 
 
@@ -355,3 +363,59 @@
     # A DMA WAIT should have been inserted
     check_cmd0(cmds, cmd0.NPU_OP_DMA_WAIT, 0)
     check_cmd0(cmds, cmd0.NPU_OP_POOL, 1)
+
+
+def test_check_mem_limits():
+    # Checks that command stream generation raises VelaError for out-of-bounds addresses and regions
+    conv_op = create_fully_connected_op()
+    # Bias range starts 16 bytes below 1 << 32 with length 1000, so its end address is out of range
+    conv_op.biases = [NpuAddressRange(region=0, address=(1 << 32) - 16, length=1000)]
+    with pytest.raises(VelaError):
+        npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U55_64)
+    # The same op is expected to pass with Ethos_U65_512
+    npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U65_512)
+    # Weights range ends 40 bytes past 1 << 48, so its end address is out of range
+    conv_op = create_fully_connected_op()
+    conv_op.weights = [NpuAddressRange(region=0, address=(1 << 48) - 960, length=1000)]
+    with pytest.raises(VelaError):
+        npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U65_256)
+    # Bias range ends 24 bytes below 1 << 48: a high end address, but still within range
+    conv_op = create_fully_connected_op()
+    conv_op.biases = [NpuAddressRange(region=0, address=(1 << 48) - 1024, length=1000)]
+    npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U65_512)
+    conv_op = create_fully_connected_op()
+    # Weights with a negative start address
+    conv_op.weights = [NpuAddressRange(region=0, address=-16, length=1000)]
+    with pytest.raises(VelaError):
+        npu_generate_register_command_stream([conv_op], NpuAccelerator.Ethos_U55_32)
+    op = create_avg_pool_op()
+    # Fourth tile address is 16 bytes below 1 << 32; that tile's end address is out of range
+    op.ifm.tiles = NpuTileBox(width_0=1, height_0=1, height_1=1, addresses=[0, 800, 4000, (1 << 32) - 16])
+    with pytest.raises(VelaError):
+        npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U55_256)
+    op = create_avg_pool_op()
+    # IFM region 8 is expected to be rejected as an invalid region
+    op.ifm.region = 8
+    with pytest.raises(VelaError):
+        npu_generate_register_command_stream([op], NpuAccelerator.Ethos_U55_64)
+
+
+def test_check_sram_limit_spilling():
+    # Checks that addresses beyond the available SRAM are rejected for scratch fast memory when spilling
+    arch = create_default_arch(Accelerator.Ethos_U65_512)
+    assert arch.is_spilling_enabled()
+    op = create_avg_pool_op()
+    op.ifm.region = 0
+    # OFM in scratch fast memory, starting at offset 32K
+    op.ofm.region = int(BasePointerIndex.ScratchFastTensor)
+    w, h = op.ofm.shape.width, op.ofm.shape.height
+    op.ofm.tiles = NpuTileBox(width_0=w, height_0=h, height_1=h, addresses=[32 * 1024, 0, 0, 0])
+    # With sram_size set to 384K the OFM should fit
+    arch.sram_size = 384 * 1024
+    mem_limits = get_mem_limits_for_regions(arch)
+    generate_command_stream([op], arch, verbose=False, mem_limits=mem_limits)
+    # With sram_size set to 32K the OFM does not fit, because it starts at offset 32K
+    arch.sram_size = 32 * 1024
+    mem_limits = get_mem_limits_for_regions(arch)
+    with pytest.raises(VelaError):
+        generate_command_stream([op], arch, verbose=False, mem_limits=mem_limits)