MLBEDSW-4034: New Scheduler Size or Performance Optimisation

 - Merged dev/scheduler at 83639f90e8c828f70de6e29142355a940224959b

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I0050529d4b42da93768c7264296434dd877fb5b4
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index ad4d29c..39a7f21 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -42,10 +42,8 @@
 
 def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor):
     start_addr = src_tensor.address
-    for compressed_values in src_tensor.compressed_values:
-        end_addr = start_addr + len(compressed_values)
-        memory_tensor.values[start_addr:end_addr] = compressed_values
-        start_addr = end_addr
+    end_addr = src_tensor.address + src_tensor.storage_size()
+    memory_tensor.values[start_addr:end_addr] = src_tensor.buffer.copy()
 
 
 def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor):
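The hunk above collapses the per-chunk copy into a single slice assignment. A minimal sketch (not Vela code; FakeSrcTensor and chunks are illustrative stand-ins) of why the two are equivalent, assuming the encoded weight streams are concatenated into one flat buffer whose length equals storage_size():

    import numpy as np

    class FakeSrcTensor:
        # Stand-in for the Vela tensor: the old code walked compressed_values,
        # the new code reads the concatenated flat buffer in one go.
        def __init__(self, address, chunks):
            self.address = address
            self.compressed_values = chunks
            self.buffer = np.concatenate(chunks)

        def storage_size(self):
            return len(self.buffer)

    chunks = [np.array([1, 2, 3], dtype=np.uint8), np.array([4, 5], dtype=np.uint8)]
    src = FakeSrcTensor(address=8, chunks=chunks)
    memory = np.zeros(16, dtype=np.uint8)

    # Old behaviour: advance start_addr chunk by chunk.
    old = memory.copy()
    start_addr = src.address
    for compressed_values in src.compressed_values:
        end_addr = start_addr + len(compressed_values)
        old[start_addr:end_addr] = compressed_values
        start_addr = end_addr

    # New behaviour: one slice assignment over the whole encoded range.
    new = memory.copy()
    new[src.address:src.address + src.storage_size()] = src.buffer.copy()

    assert np.array_equal(old, new)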
@@ -94,31 +92,21 @@
         sg.scratch_fast_tensor = scratch_fast_tens
         sg.scratch_fast_tensor.shape[0] = 0
 
-    for cps in sg.cascaded_passes:
-        for ps in cps.passes:
-            if ps.placement == PassPlacement.Npu:
-                if ps.weight_tensor is not None:
-                    # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
-                    # is pointing at the destination address of where the weights should be placed in SRAM.
-                    # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
-                    if ps.weight_tensor.ops[0].type == Op.DMA:
-                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
-                    else:
-                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
+    for sched_op in sg.sched_ops:
+        ifm_tensor, ifm2_tensor, _, _, _ = sched_op.parent_op.get_ifm_ifm2_weights_biases_ofm()
 
-                    if ps.scale_tensor.ops[0].type == Op.DMA:
-                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor.ops[0].inputs[0])
-                    else:
-                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+        op_info = sg.schedule.cost_map[sched_op]
+        if op_info.npu_weights_tensor:
+            copy_compressed_values_to_memory_tensor(sg.flash_tensor, op_info.npu_weights_tensor)
 
-                if ps.lut_tensor is not None:
-                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.lut_tensor)
-                if ps.ifm_tensor is not None and ps.ifm_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
-                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm_tensor)
-                if ps.ifm2_tensor is not None and (
-                    ps.ifm2_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast)
-                ):
-                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm2_tensor)
+        if ifm_tensor and ifm_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
+            copy_ifm_values_to_memory_tensor(sg.flash_tensor, ifm_tensor)
+        if ifm2_tensor and (ifm2_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast)):
+            copy_ifm_values_to_memory_tensor(sg.flash_tensor, ifm2_tensor)
+
+        if sched_op.parent_op.activation_lut:
+            copy_ifm_values_to_memory_tensor(sg.flash_tensor, sched_op.parent_ps.lut_tensor)
+
     sg.command_stream_tensor = make_memory_tensor(
         sg.name + "_command_stream", flash_area, MemType.Permanent_CPU, command_stream_size_bytes, True, arch
     )