MLBEDSW-3222: Bias tensors in fast storage

For IFM streamed cascades, bias tensors are read several times.
Move these tensors to fast storage and add DMA commands.

Change-Id: I630f6275986c1b5e3f126c925b11e22500fb1128
Signed-off-by: Andreas Nevalainen <andreas.nevalainen@arm.com>
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 01fab0e..871a048 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -238,6 +238,7 @@
                     y_step = y_dim
 
         weight_box = None
+        scale_box = None
 
         for start in range(y_start, y_dim, y_step):
             end = min(start + y_step, y_dim)
@@ -299,6 +300,10 @@
                         if ifm_y_present >= ifm_y_needed:
                             break
 
+            if scale_tensor is not None and scale_tensor.purpose == TensorPurpose.FSBias and scale_box is None:
+                scale_box = Box([0] * len(scale_tensor.shape), list(scale_tensor.shape))
+                yield from dma_if_necessary(ps, scale_box, scale_tensor)
+
             if weight_tensor is not None and weight_box is None:
                 weight_box = Box.make_weight_box(
                     weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise