MLBEDSW-4803: Output diff fix for MobileNetV3

This commit moves the LUT DMA code back into the depth-slice loop, guarded by a flag so that it is executed only once per pass through that loop rather than potentially on every iteration. This fixes an output diff that was caused by LUT DMAs being issued before weight DMAs.

Signed-off-by: Dwight Lidman <dwight.lidman@arm.com>
Change-Id: I3e597f0a955154af3d87febacea1b3920d53b7c2

commit: 8f78ac2ff735b7c0be7787d6423eb96a0d8b5983 [log] [tgz]
author: Dwight Lidman <dwight.lidman@arm.com> Fri Aug 13 14:04:30 2021 +0200
committer: patrik.gustavsson <patrik.gustavsson@arm.com> Mon Aug 16 14:30:16 2021 +0000
tree: 93ded59f15c08b8cdbb1d8d408c7591f4aaa8746
parent: cfb42620d7e061efdfc92bf944d0289ebfc02ea4 [diff]
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index b3ea9d4..3d0a1e5 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py

@@ -130,11 +130,7 @@
         for start_width in range(ofm_start.width, ofm_end.width, ofm_step.width):
             end_width = min(start_width + ofm_step.width, ofm_end.width)
 
-            if parent_op.activation_lut:
-                lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
-                lut_box = Box([0] * len(lut_tensor.shape), list(lut_tensor.shape))
-                yield from dma_if_necessary(sched_op.parent_ps, lut_box, lut_tensor)
-
+            lut_dma_done = False
             for depth_idx, start_channel in enumerate(ofm_depth_slices[:-1]):
                 start_channel = max(start_channel, ofm_start.depth)
                 end_channel = min(ofm_depth_slices[depth_idx + 1], ofm_end.depth)
@@ -203,6 +199,13 @@
                 else:
                     weight_box = None
 
+                # Should only be done once per loop but not before weights above
+                if parent_op.activation_lut and not lut_dma_done:
+                    lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
+                    lut_box = Box([0] * len(lut_tensor.shape), list(lut_tensor.shape))
+                    lut_dma_done = True
+                    yield from dma_if_necessary(sched_op.parent_ps, lut_box, lut_tensor)
+
                 yield NpuStripe(
                     sched_op.parent_ps,
                     block_config.old_style_representation(),
commit	8f78ac2ff735b7c0be7787d6423eb96a0d8b5983	[log] [tgz]
author	Dwight Lidman <dwight.lidman@arm.com>	Fri Aug 13 14:04:30 2021 +0200
committer	patrik.gustavsson <patrik.gustavsson@arm.com>	Mon Aug 16 14:30:16 2021 +0000
tree	93ded59f15c08b8cdbb1d8d408c7591f4aaa8746
parent	cfb42620d7e061efdfc92bf944d0289ebfc02ea4 [diff]