MLBEDSW-7206: Fixed weight buffering problem in cascading

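- Fixed a problem where the buffered weights were only used
by the first H stripe that was produced. The following stripes
read the weights from permanent storage instead.
- The buffered weight tensor is now selected for every stripe,
while the DMA transfer (dma_if_necessary) is still only requested
for the first H stripe.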

Change-Id: I176909fa0e2edbecf80e8ec8ac136f42d5d3bcd4
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index cd878ec..5f6a93a 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -207,12 +207,13 @@
                 if op_info.npu_weights_tensor:
                     weight_box = Box([0, 0, 0, start_channel], [1, 1, 1, end_channel])
 
-                    if op_info.buffered_weight_tensors and is_first_h_stripe:
+                    if op_info.buffered_weight_tensors:
                         idx = depth_idx % len(op_info.buffered_weight_tensors)
-                        yield from dma_if_necessary(
-                            sched_op.parent_ps, weight_box, op_info.buffered_weight_tensors[idx]
-                        )
                         weight_tensor = op_info.buffered_weight_tensors[idx]
+                        if is_first_h_stripe:
+                            yield from dma_if_necessary(
+                                sched_op.parent_ps, weight_box, op_info.buffered_weight_tensors[idx]
+                            )
                 else:
                     weight_box = None