Address generation fix
- The architecture supports address extensions wider than 32 bits: the address bits above bit 31 (offset >> 32) are passed via the cmd1.param field
Change-Id: I7a01b4596f7a54f6be05b8e2c454494e6751757b
Signed-off-by: Mauricio Briceno <mauricio.briceno@arm.com>
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index d9f6b1f..ad29dae 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -183,6 +183,9 @@
self.cmd_stream.append((command, offset))
self.offset += CommandStreamEmitter.WORD_SIZE * 2
+ def cmd1_with_address(self, cmd: cmd1, offset):
+ self.cmd1_with_offset(cmd, offset, offset >> 32)
+
def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
param = (16 * channel) + outstanding_count
command = ((param & 0xFFFF) << 16) | cmd.value
@@ -309,10 +312,8 @@
if layout == NpuLayout.NHCWB16:
# Check that all BasePointer addresses are aligned to 16 bytes
assert all((int(addr) % 16) == 0 for addr in addresses)
- emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
- emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
- emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
- emit.cmd1_with_offset(ptr_cmds[3], addresses[3])
+ for i in range(4):
+ emit.cmd1_with_address(ptr_cmds[i], addresses[i])
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
@@ -327,9 +328,9 @@
):
"""Generates STRIDE_C/Y/X registers"""
strides = get_strides(fm)
- emit.cmd1_with_offset(stride_c_cmd, strides.depth) # stride between 16-byte channel blocks (C)
- emit.cmd1_with_offset(stride_y_cmd, strides.height) # stride between vertical values (H)
- emit.cmd1_with_offset(stride_x_cmd, strides.width) # stride between horisontal values (W)
+ emit.cmd1_with_address(stride_c_cmd, strides.depth) # stride between 16-byte channel blocks (C)
+ emit.cmd1_with_address(stride_y_cmd, strides.height) # stride between vertical values (H)
+ emit.cmd1_with_address(stride_x_cmd, strides.width) # stride between horizontal values (W)
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
@@ -476,10 +477,10 @@
]
):
if core < len(weights):
- emit.cmd1_with_offset(addr, weights[core].address)
+ emit.cmd1_with_address(addr, weights[core].address)
emit.cmd1_with_offset(length, weights[core].length)
elif core < arch.ncores:
- emit.cmd1_with_offset(addr, weights[0].address)
+ emit.cmd1_with_address(addr, weights[0].address)
emit.cmd1_with_offset(length, 0)
@@ -493,10 +494,10 @@
[(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
):
if core < len(biases):
- emit.cmd1_with_offset(addr, biases[core].address)
+ emit.cmd1_with_address(addr, biases[core].address)
emit.cmd1_with_offset(length, biases[core].length)
elif core < arch.ncores:
- emit.cmd1_with_offset(addr, biases[0].address)
+ emit.cmd1_with_address(addr, biases[0].address)
emit.cmd1_with_offset(length, 0)
@@ -875,11 +876,11 @@
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
"""Generates register commands for DMA operations"""
emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
- emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
+ emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)
- emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
- emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
+ emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
+ emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):