MLBEDSW-1828: Fix ifm/ifm2 order being reversed in some split cases

Signed-off-by: Charles Xu <charles.xu@arm.com>
Change-Id: Ib8d66f8b3c0467966165c1b53aeb7da7c8764c89
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 3b968dc..0cd3ad2 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -33,12 +33,36 @@
         in_tensor = dma_op.inputs[0]
         yield DMA(in_tensor, tensor, box)
 
+def match_tensor(source, derived):
+    if source == derived:
+        return True
+    ops = derived.ops
+    return (ops != [] and
+        len(ops) ==1 and
+        ops[0].type == "SplitSliceRead" and
+        source == ops[0].inputs[0])
 
 def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
     is_first = idx == 0
     is_last = idx == len(passes) - 1
     ps = passes[idx]
     block_config = block_configs[idx]
+    npu_block_type = ps.npu_block_type
+    split_offsets = [None, None]  # offset for [ifm, ifm2]
+
+    ifm_idx = 0
+    for op in ps.ops:
+        if op.type == "SplitSliceRead":
+            split_offsets[ifm_idx] = op.attrs["split_start"]
+            ps.primary_op.attrs["fused_memory_function"] = op.type
+            ifm_idx += 1
+
+    if len(ps.inputs) == 2 and npu_block_type == NpuBlockType.ElementWise:
+        # Ensure correct ifm and ifm2 order
+        if (match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and
+            match_tensor(ps.inputs[1], ps.primary_op.inputs[0])):
+            ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
+            split_offsets[0], split_offsets[1] = split_offsets[1], split_offsets[0]
 
     ifm_tensor = ps.ifm_tensor
     ifm2_tensor = ps.ifm2_tensor
@@ -55,13 +79,9 @@
         strides = ps.primary_op.attrs.get("strides", None)
         skirt = ps.primary_op.attrs.get("skirt", None)
 
-    npu_block_type = ps.npu_block_type
-
     concat_axis = 0
     concat_offset = 0
 
-    split_offsets = [None, None]  # offset for [ifm, ifm2]
-
     # Fusable activation functions
     activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))
 
@@ -78,14 +98,6 @@
         elif op.type in activation_ops:
             ps.primary_op.attrs["fused_activation_function"] = op.type
 
-    # The ops list has to be reversed here since the Pass Packing is done in reverse
-    ifm_idx = 0
-    for op in reversed(ps.ops):
-        if op.type == "SplitSliceRead":
-            split_offsets[ifm_idx] = op.attrs["split_start"]
-            ps.primary_op.attrs["fused_memory_function"] = op.type
-            ifm_idx += 1
-
     if strat == SchedulingStrategy.WeightStream:
         ofm_step = block_config[-1]
         ofm_stop = ofm_end[-1]
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index 5841ca2..4cfac33 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -314,7 +314,7 @@
                         if operation_set is None:
                             print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU")
 
-                        for inp in curr_op.inputs:
+                        for inp in reversed(curr_op.inputs):
                             can_pack = True
                             if len(inp.ops) == 1:
                                 next_op = inp.ops[0]