MLBEDSW-2570 Avoid NHCWB16 for unaligned axis-3 concatenation

Avoid usage of NHCWB16 when Stack/Pack/Concat is performed along axis 3
and the "concat start" of any slice to be combined is not a multiple of
16, since the OFM address offset of a slice written along the channel
axis is 16 byte aligned only when its concat start is a multiple of 16.
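
As a rough sketch of the rule (not part of the patch; the helper name
is hypothetical), the decision reduces to checking every slice's concat
start once the concatenation axis is the channel axis:

    def can_use_nhcwb16_for_concat(axis, concat_starts):
        # NHCWB16 stores channels in bricks of 16, so a slice written
        # along axis 3 starts at a 16 byte aligned address only when its
        # concat start is a multiple of 16. For other axes the offsets
        # are based on c = 0 and are therefore always aligned.
        if axis != 3:
            return True
        return all(start % 16 == 0 for start in concat_starts)

    assert can_use_nhcwb16_for_concat(3, [0, 16, 48])
    assert not can_use_nhcwb16_for_concat(3, [0, 8])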

Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Change-Id: If3f7b4a3424be3c86fc2dc48e8649ce4c4f49485
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 582924c..3fe703e 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -69,6 +69,16 @@
             tens.ops.append(new_op)
         assert tens.shape[axis] == offset
 
+        # If axis = 3, NHCWB16 can only be used for the output if every concat_start is a multiple of 16,
+        # as only then will the OFM address offset of every producing operation be 16 byte aligned.
+        # For other values of axis the address offsets are always 16 byte aligned, as they are all based
+        # on c = 0 and such addresses are always 16 byte aligned due to the NHCWB16 format.
+        if axis == 3:
+            for op in tens.ops:
+                if op.attrs["concat_start"] % 16 != 0:
+                    tens.avoid_NHCWB16 = True
+                    break
+
     return tens
 
 
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index cc9278f..f3b3a79 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -670,14 +670,16 @@
         for pred_candidate in ps.dag_predecessors:
             if len(pred_candidate.outputs) == 1 and pred_candidate.outputs[0] == ifm_tensor:
                 # we found a predecessor that produces this IFM tensor
-                if len(pred_candidate.successors) == 1 and pred_candidate.successors[0] == ps:
-                    # and it only has one successor, namely us
-                    if pred_candidate.placement == PassPlacement.Npu:
-                        if pred_candidate.npu_block_type in self.ifm_stream_npu_blocks:
-                            # and it is on the Npu
-                            if not self.avoid_for_spilling(pred_candidate):
-                                # and fusable - it's a candidate
-                                pred_pass_list.append(pred_candidate)
+                if not ifm_tensor.avoid_NHCWB16:
+                    # and NHCWB16 format is not to be avoided
+                    if len(pred_candidate.successors) == 1 and pred_candidate.successors[0] == ps:
+                        # and it only has one successor, namely us
+                        if pred_candidate.placement == PassPlacement.Npu:
+                            if pred_candidate.npu_block_type in self.ifm_stream_npu_blocks:
+                                # and it is on the Npu
+                                if not self.avoid_for_spilling(pred_candidate):
+                                    # and fusable - it's a candidate
+                                    pred_pass_list.append(pred_candidate)
 
         if not pred_pass_list:
             return ABORT_SEARCH
@@ -953,12 +955,15 @@
                         if output.purpose != TensorPurpose.FeatureMap:
                             continue
 
-                        use_NHCWB16 = True
-                        for op in output.consumer_list:
-                            if op is None or op.type == "Reshape":
-                                use_NHCWB16 = False
-                            else:
-                                use_NHCWB16 &= op.run_on_npu
+                        use_NHCWB16 = not output.avoid_NHCWB16
+
+                        if use_NHCWB16:
+                            # Check consumers to see if NHCWB16 can be used for the output
+                            for op in output.consumer_list:
+                                if op is None or op.type == "Reshape":
+                                    use_NHCWB16 = False
+                                else:
+                                    use_NHCWB16 &= op.run_on_npu
 
                         if use_NHCWB16:
                             output.set_format(TensorFormat.NHCWB16, arch)
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 3574970..ecca0e0 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -300,6 +300,7 @@
         "npu_tensor",
         "equivalence_id",
         "resampling_mode",
+        "avoid_NHCWB16",
     )
     AllocationQuantum = 16
 
@@ -346,6 +347,8 @@
         self.block_traversal = TensorBlockTraversal.Default
         self.resampling_mode = resampling_mode.NONE
 
+        self.avoid_NHCWB16 = False
+
     def element_size(self):
         if self.element_size_bytes == 0:
             return self.dtype.size_in_bits() / 8
@@ -380,6 +383,7 @@
         res.resampling_mode = self.resampling_mode
 
         res.copy_compressed_weight_info(self)
+        res.avoid_NHCWB16 = self.avoid_NHCWB16
         return res
 
     def clone_into_fast_storage(self, arch):