MLBEDSW-6971 Fix output diff when cascading elementwise operators

Fixed an output diff that occurred when cascading binary elementwise
operators with reversed operand order. The reversed order is now
recorded on the scheduler operation and propagated through the high
level command stream, so the NPU operation is generated with the
operands in the correct order.
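
For illustration only (not part of the applied change), the new
cascadability decision for binary elementwise operations can be
summarised by the standalone sketch below. The function name
elementwise_cascadable and its boolean parameters are hypothetical
stand-ins for the values that cascade_builder.py derives from the real
IFM tensors.

    def elementwise_cascadable(ifm_const, ifm2_const, correct_order, same_shape):
        # A constant IFM2 in the expected operand order is cascadable, as
        # before. A constant IFM is now also accepted when the shapes match
        # or the operand order is reversed, because the scheduler swaps the
        # operands and records reversed_operands for the later passes.
        return (ifm_const and (same_shape or not correct_order)) or (
            ifm2_const and correct_order
        )

    # Example: a constant first operand in reversed order is no longer
    # rejected outright.
    assert elementwise_cascadable(
        ifm_const=True, ifm2_const=False, correct_order=False, same_shape=False
    )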

Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
Change-Id: Iac2e28cfb53037b929459af213f4fa7715b3e6de
diff --git a/ethosu/vela/cascade_builder.py b/ethosu/vela/cascade_builder.py
index b4a4f87..ebe2f13 100644
--- a/ethosu/vela/cascade_builder.py
+++ b/ethosu/vela/cascade_builder.py
@@ -175,11 +175,12 @@
         ifm = sched_op.parent_op.ifm
         ifm2 = sched_op.parent_op.ifm2
 
-        # Cascading elementwise operations with reverse operand order is not handled
         if sched_op.parent_op.type.is_binary_elementwise_op() and ifm and ifm2:
             # We cannot rule out cascadability if at least one IFM is constant
+            ifm_const = ifm.ops != [] and ifm.ops[0].type == Op.Const
             ifm2_const = ifm2.ops != [] and ifm2.ops[0].type == Op.Const
-            return ifm_ifm2_correct_order(ifm.shape, ifm2.shape) and ifm2_const
+            correct_order = ifm_ifm2_correct_order(ifm.shape, ifm2.shape)
+            return (ifm_const and (ifm.shape == ifm2.shape or not correct_order)) or (ifm2_const and correct_order)
         else:
             # Either one IFM is not variable or it is not a binary elementwise op - we cannot rule out cascadability
             return True
diff --git a/ethosu/vela/high_level_command_stream.py b/ethosu/vela/high_level_command_stream.py
index 4a41edd..bfe5bce 100644
--- a/ethosu/vela/high_level_command_stream.py
+++ b/ethosu/vela/high_level_command_stream.py
@@ -210,6 +210,7 @@
         ifm2_box=None,
         pad_top=0,
         pad_bottom=0,
+        reversed_operands=False,
     ):
         self.ps = ps
         self.block_config = block_config
@@ -226,6 +227,7 @@
         self.weight_box = weight_box
         self.pad_top = pad_top
         self.pad_bottom = pad_bottom
+        self.reversed_operands = reversed_operands
         for i in range(len(self.ofm_box.end_coord)):
             assert self.ofm_box.end_coord[i] <= ps.ofm_shapes[0][i]
 
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index 7e13b62..e71fb6e 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -74,6 +74,8 @@
         _,
         _,
     ) = parent_op.get_ifm_ifm2_weights_biases_ofm()
+    if sched_op.reversed_operands:
+        ifm2_tensor, ifm_tensor = ifm_tensor, ifm2_tensor
     ifm = sched_op.ifm
     ifm2 = sched_op.ifm2
     ofm_shape = sched_op.ofm.shape
@@ -236,4 +238,5 @@
                     ifm2_box=ifm2_box,
                     pad_top=pad_top,
                     pad_bottom=pad_bottom,
+                    reversed_operands=sched_op.reversed_operands,
                 )
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
index 974d980..202917b 100644
--- a/ethosu/vela/high_level_command_to_npu_op.py
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -555,7 +555,10 @@
     if elemwise_op not in UNARY_ELEMWISE_OPS:
         ifm_shape = [] if cmd.ifm_tensor.shape == [] else ps.ifm_shapes[0].as_list()
         ifm2_shape = [] if cmd.ifm2_tensor.shape == [] else ps.ifm_shapes[1].as_list()
-        if not ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
+        if cmd.reversed_operands:
+            assert ifm_ifm2_correct_order(ifm_shape, ifm2_shape)
+            npu_op.reversed_operands = True
+        elif not ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
             # The scalar/broadcasted feature map has to be the ifm2 tensor so switch the ifms
             cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
             cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 9dca63a..208b121 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -182,6 +182,7 @@
         self.activation = ps.primary_op.activation
         self.kernel = ps.primary_op.kernel
         self.resampling_mode = ps.primary_op.ifm_resampling_mode
+        self.reversed_operands = False
         self.uses_scalar = ps.primary_op.ifm2 is not None and (
             ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
         )
@@ -239,6 +240,7 @@
                 # The non-broadcasted IFM should be the primary input
                 or (ifm1.shape != ofm.shape and ifm2.shape == ofm.shape)
             ):
+                self.reversed_operands = True
                 self.ifm, self.ifm2 = self.ifm2, self.ifm
 
                 self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
diff --git a/ethosu/vela/softmax.py b/ethosu/vela/softmax.py
index 1655427..a0fd19c 100644
--- a/ethosu/vela/softmax.py
+++ b/ethosu/vela/softmax.py
@@ -353,8 +353,8 @@
         )
         add_op = create_add(
             f"{self.op.name}_add{pass_number}",
-            f0_one_const,
             shifted_sum_minus_one,
+            f0_one_const,
             one_scale_quant,
             activation,
         )