MLBEDSW-3953: Output diff in mobilenet_v3

Fixed two issues:
  - The command stream could be generated out of order in IFM streaming
  - On H32, the LUT could be corrupted if the block dependency (blockdep) is not 0

Change-Id: I2edd84429b93d83b2794f14937ce3fd279fd4a24
Signed-off-by: Diqing Zhong <diqing.zhong@arm.com>
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index e514e76..66613ba 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -260,6 +260,18 @@
                 upscaling,
             )
 
+            ifm_y_needed = 1
+            if len(ifm_box.end_coord) >= 3:
+                ifm_y_needed = ifm_box.end_coord[-3]
+            if ifm_y_present < ifm_y_needed:
+                for prev_cmd in prev_pass_gen:
+                    yield prev_cmd
+                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
+                    if rng is not None:
+                        ifm_y_present = max(ifm_y_present, rng[1])
+                        if ifm_y_present >= ifm_y_needed:
+                            break
+
             for intermediate in ps.intermediates:
                 if (
                     intermediate is not None
@@ -281,18 +293,6 @@
                         intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
                     yield from dma_if_necessary(ps, intermediate_box, intermediate)
 
-            ifm_y_needed = 1
-            if len(ifm_box.end_coord) >= 3:
-                ifm_y_needed = ifm_box.end_coord[-3]
-            if ifm_y_present < ifm_y_needed:
-                for prev_cmd in prev_pass_gen:
-                    yield prev_cmd
-                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
-                    if rng is not None:
-                        ifm_y_present = max(ifm_y_present, rng[1])
-                        if ifm_y_present >= ifm_y_needed:
-                            break
-
             if scale_tensor is not None and scale_tensor.purpose == TensorPurpose.FSBias and scale_box is None:
                 scale_box = Box([0] * len(scale_tensor.shape), list(scale_tensor.shape))
                 yield from dma_if_necessary(ps, scale_box, scale_tensor)
diff --git a/ethosu/vela/register_command_stream_util.py b/ethosu/vela/register_command_stream_util.py
index 55fa620..4cf826d 100644
--- a/ethosu/vela/register_command_stream_util.py
+++ b/ethosu/vela/register_command_stream_util.py
@@ -463,6 +463,12 @@
         return 0
     assert npu_op.ifm is not None
     assert prev_op.ofm is not None
+    # Check if the reserved shram will be used in current/prev op
+    prev_uses_lut = prev_op.activation is not None and prev_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
+    curr_uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
+    if prev_uses_lut and arch.shram_reserved_unused_banks == 0 and not curr_uses_lut:
+        return 0
+
     # Check if IFM or IFM2 overlaps with prev op's OFM
     prev_ofm_ranges = get_address_ranges(prev_op.ofm)
     ifm_ranges = get_address_ranges(npu_op.ifm)