MLBEDSW-4644 Removed unnecessary LUT DMA commands

Fixed a bug where a DMA command for the activation LUT was issued
for every depth-slice of an operator, resulting in multiple
unnecessary DMA commands. The LUT DMA is now issued before the
depth-slice loop instead of inside it.

Signed-off-by: Jacob Bohlin <jacob.bohlin@arm.com>
Change-Id: I9c291692d8002f05656bb88214836ab389a56cdb
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 5a838f8..6fcf80c 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -128,6 +128,11 @@
         for start_width in range(ofm_start.width, ofm_end.width, ofm_step.width):
             end_width = min(start_width + ofm_step.width, ofm_end.width)
 
+            if parent_op.activation_lut:
+                lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
+                lut_box = Box([0] * len(lut_tensor.shape), list(lut_tensor.shape))
+                yield from dma_if_necessary(sched_op.parent_ps, lut_box, lut_tensor)
+
             for depth_idx, start_channel in enumerate(ofm_depth_slices[:-1]):
                 start_channel = max(start_channel, ofm_start.depth)
                 end_channel = min(ofm_depth_slices[depth_idx + 1], ofm_end.depth)
@@ -196,11 +201,6 @@
                 else:
                     weight_box = None
 
-                if parent_op.activation_lut:
-                    lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
-                    lut_box = Box([0] * len(lut_tensor.shape), list(lut_tensor.shape))
-                    yield from dma_if_necessary(sched_op.parent_ps, lut_box, lut_tensor)
-
                 yield NpuStripe(
                     sched_op.parent_ps,
                     block_config.old_style_representation(),