MLBEDSW-2556: Odd core/block depth weight interleaving update

 - If the block depth or core count resulted in empty or non-existent
   substreams, the command stream generator failed its substream-count
   assertion. This commit changes the command stream generator to program
   only cores that have substreams or are enabled for the configuration;
   an enabled core without a substream is given a zero-length stream.

Change-Id: I4e724b19de14d3a12e886ec6b17d0038593dfb59
Signed-off-by: Tim Hall <tim.hall@arm.com>
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index e753885..6cd8143 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -694,18 +694,20 @@
                 stream_index = cmd.weight_tensor.compressed_stream_index_from_coord(cmd.weight_box.start_coord)
                 weight_substream_offsets = cmd.weight_tensor.compressed_values_substream_offsets[stream_index]
                 substreams = len( weight_substream_offsets ) - 1 # Offset list must terminate with full stream length
-                assert substreams == arch.ncores
 
                 # Extract weight substream offsets and calculate their lengths
                 assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
                 weight_addr = cmd.weight_tensor.address_for_coordinate(cmd.weight_box.start_coord)
 
-                if substreams > 0:
-                    emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_BASE, weight_addr + weight_substream_offsets[0] )
-                    emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT_LENGTH, weight_substream_offsets[1] - weight_substream_offsets[0])
-                if substreams > 1:
-                    emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT1_BASE, weight_addr + weight_substream_offsets[1])
-                    emit.cmd1_with_offset(cmd1.NPU_SET_WEIGHT1_LENGTH, weight_substream_offsets[2] - weight_substream_offsets[1])
+                # Set weights sources for active and present cores
+                for core, param in enumerate( [(cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
+                                               (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH)] ):
+                    if core < substreams:
+                        emit.cmd1_with_offset(param[0], weight_addr + weight_substream_offsets[core] )
+                        emit.cmd1_with_offset(param[1], weight_substream_offsets[core+1] - weight_substream_offsets[core])
+                    elif core < arch.ncores:
+                        emit.cmd1_with_offset(param[0], weight_addr)
+                        emit.cmd1_with_offset(param[1], 0)
 
                 weight_region = base_ptr_idx_map[cmd.weight_tensor.mem_type]
                 emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weight_region)
@@ -715,18 +717,20 @@
                 if cmd.scale_tensor is not None:
                     scale_substream_offsets = cmd.scale_tensor.compressed_values_substream_offsets[stream_index]
                     substreams = len( scale_substream_offsets ) - 1 # Offset list must terminate with full stream length
-                    assert substreams == arch.ncores
 
                     # Extract scale substream offsets and calculate their lengths
                     assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
                     scale_addr = cmd.scale_tensor.address_for_coordinate( cmd.weight_box.start_coord[-1:] )
 
-                    if substreams > 0:
-                        emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_BASE, scale_addr + scale_substream_offsets[0])
-                        emit.cmd1_with_offset(cmd1.NPU_SET_SCALE_LENGTH, scale_substream_offsets[1] - scale_substream_offsets[0] )
-                    if substreams > 1:
-                        emit.cmd1_with_offset(cmd1.NPU_SET_SCALE1_BASE, scale_addr + scale_substream_offsets[1])
-                        emit.cmd1_with_offset(cmd1.NPU_SET_SCALE1_LENGTH, scale_substream_offsets[2] - scale_substream_offsets[1] )
+                    # Set scale sources for active and present cores
+                    for core, param in enumerate( [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
+                                                   (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)] ):
+                        if core < substreams:
+                            emit.cmd1_with_offset(param[0], scale_addr + scale_substream_offsets[core] )
+                            emit.cmd1_with_offset(param[1], scale_substream_offsets[core+1] - scale_substream_offsets[core])
+                        elif core < arch.ncores:
+                            emit.cmd1_with_offset(param[0], scale_addr)
+                            emit.cmd1_with_offset(param[1], 0)
 
                     # Emit base address for NPU to access scale & bias data
                     scale_region = base_ptr_idx_map[cmd.scale_tensor.mem_type]
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index fe8f04b..d356289 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -246,7 +246,6 @@
 
     # Slice weight stream up depth-ways into bricks and compress
     full_ofm_depth = quant_buf.shape[-1]
-    ofm_block_depth = ofm_block_depth // arch.ncores
     for idx in range(0, full_ofm_depth, ofm_depth_step):
         # Get the weights necessary for this brick
         count = min(full_ofm_depth - idx, ofm_depth_step)
@@ -260,7 +259,13 @@
         # and generate separate compressed streams.
         for core in range(0, min(arch.ncores, full_ofm_depth)):
             core_weights = core_deinterleave(brick_weights, core, arch.ncores)
-            raw_stream = generate_brick(arch, core_weights, ofm_block_depth, tens.block_traversal, ifm_bitdepth, dilation)
+
+            block_depth = (ofm_block_depth + arch.ncores - 1 - core) // arch.ncores
+            if block_depth != 0:
+                raw_stream = generate_brick(arch, core_weights, block_depth, tens.block_traversal, ifm_bitdepth, dilation)
+            else:
+                raw_stream = []
+
             raw_size += len( raw_stream )
             encoded_substream = encode( raw_stream )
             encoded_stream.extend( encoded_substream )