MLBEDSW-4034: New Scheduler Size or Performance Optimisation

 - Merged dev/scheduler at 83639f90e8c828f70de6e29142355a940224959b
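
 - For reference, the PassFlags bit values after dropping the Dma flag reduce to
   the set below. This is a minimal sketch only: the enum.Flag base class and the
   Empty = 0 member are assumed from the uses of PassFlags.Empty elsewhere in
   pass_packing.py; the remaining values are taken directly from this patch.

       import enum

       class PassFlags(enum.Flag):
           Empty = 0
           Main = 1
           Post = 2
           Mac = 4
           ElementWise = 8          # was 16 before Dma = 8 was removed
           Npu = 16                 # was 32
           Cpu = 32                 # was 64
           StartupInit = 64         # was 128
           MemoryOnly = 128         # was 256
           PostFusingLimited = 256  # was 512

       # Flag combination and testing is unchanged; only the bit positions
       # above Mac shift down by one.
       flags = PassFlags.Npu | PassFlags.ElementWise
       assert flags & PassFlags.Npu
       assert not (flags & PassFlags.Cpu)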

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I0050529d4b42da93768c7264296434dd877fb5b4
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index 2a1903d..518b243 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py
@@ -32,13 +32,12 @@
     Main = 1
     Post = 2
     Mac = 4
-    Dma = 8
-    ElementWise = 16
-    Npu = 32
-    Cpu = 64
-    StartupInit = 128
-    MemoryOnly = 256
-    PostFusingLimited = 512
+    ElementWise = 8
+    Npu = 16
+    Cpu = 32
+    StartupInit = 64
+    MemoryOnly = 128
+    PostFusingLimited = 256
 
 
 mac_main_ops = set(
@@ -87,7 +86,6 @@
 quantization_ops = set((Op.Dequantize, Op.Max, Op.Min))
 cpu_ops = set((Op.Softmax, Op.LRN, Op.Shape, Op.Pad, Op.AddN)) | quantization_ops
 
-npu_dma_ops = set((Op.DMA,))
 startup_init_ops = set((Op.Const, Op.Placeholder, Op.SubgraphInput))
 memory_only_ops = set((Op.Squeeze, Op.Reshape, Op.QuantizedReshape, Op.ExpandDims,))
 
@@ -135,16 +133,6 @@
     ),
     (
         # ops_set
-        npu_dma_ops,
-        # incompatible_pack_flags
-        PassFlags.Cpu | PassFlags.MemoryOnly,
-        # flags_to_set
-        PassFlags.Npu | PassFlags.Dma,
-        # flags_to_clear
-        PassFlags.Empty,
-    ),
-    (
-        # ops_set
         startup_init_ops,
         # incompatible_pack_flags
         PassFlags.Npu | PassFlags.Cpu | PassFlags.MemoryOnly,
@@ -261,12 +249,6 @@
                                 assert ifm_tensor is not None, "IFM missing in {}".format(curr_op)
                                 assert ifm_tensor.purpose == TensorPurpose.FeatureMap
 
-                        if flags_to_set & PassFlags.Dma:
-                            # DMAs are special - Output buffers need to be preserved as intermediates,
-                            # if the pass consumes the results
-                            if tens is not None:
-                                reverse_intermediates.append(tens)
-
                         if operation_set is None:
                             print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU")
 
@@ -292,7 +274,7 @@
 
         is_element_wise = True
         for op in reverse_ops_list:
-            if op.type not in elem_wise_ops and op.type not in npu_dma_ops:
+            if op.type not in elem_wise_ops:
                 is_element_wise = False
                 break
 
@@ -335,11 +317,6 @@
             for inp in primary_op.inputs:
                 if inp is None:
                     continue
-                if len(inp.ops) == 1 and inp.ops[0].type == Op.DMA and inp.purpose == TensorPurpose.FeatureMap:
-                    src_op = inp.ops[0]
-                    if src_op in input_ops_list:
-                        inp = src_op.inputs[0]
-                        input_ops_list.remove(src_op)
                 add_input_list(inp, input_set, input_refcounts, lut_list, ordered_input_list)
             input_ops_list.remove(primary_op)
 
@@ -349,9 +326,6 @@
                 add_input_list(inp, input_set, input_refcounts, lut_list, ordered_input_list)
 
         name = ops_list[0].name
-        non_dma_ops = [op for op in ops_list if op.type != Op.DMA]
-        if non_dma_ops:
-            name = non_dma_ops[0].name
         ps = Pass(name, placement, is_element_wise, npu_block_type)
         ps.ops = ops_list
         ps.primary_op = primary_op