MLBEDSW-7393: MLCE: Optimize compile time for large networks

- There is a problem with large networks containing many NPU
subgraphs. Scheduling takes too long because the snapshot
memory calculation always does a complete update of the
full graph.
- A complete run is still needed at the end to calculate all the
time indexes correctly. However, when scheduling an NPU subgraph
it is enough to extract live ranges for the current schedule
and its operators.
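- Below is a minimal, self-contained sketch of the idea (not the
actual Vela implementation; the LiveRange type and helper names
are invented for illustration): per-subgraph extraction keeps the
cost of each scheduling step proportional to that subgraph, while
a single full pass at the end produces the globally consistent
snapshot.

    # Hypothetical types and helpers, for illustration only.
    from dataclasses import dataclass

    @dataclass
    class LiveRange:
        start_time: int
        end_time: int
        size: int

    def temporal_memory_usage(live_ranges):
        # Memory used per time index (cf. LiveRangeGraph.get_temporal_memory_usage)
        end = max((lr.end_time for lr in live_ranges), default=-1)
        usage = [0] * (end + 1)
        for lr in live_ranges:
            for t in range(lr.start_time, lr.end_time + 1):
                usage[t] += lr.size
        return usage

    def schedule_all(subgraph_live_ranges):
        # While scheduling, each subgraph only needs its own live ranges ...
        per_subgraph = [temporal_memory_usage(lrs) for lrs in subgraph_live_ranges]
        # ... and a single full pass at the end gives the globally consistent
        # snapshot that npu_performance relies on.
        full = temporal_memory_usage([lr for lrs in subgraph_live_ranges for lr in lrs])
        return per_subgraph, full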

Change-Id: Iccb7d6728119c1428ad0b45a2ac34e92158c15bd
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index eeed44f..a50f262 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -537,12 +537,11 @@
         # Collect live ranges from tensors
         lr_graph = live_range.LiveRangeGraph()
         for mem_area, mem_type_set in memories_list:
-            live_range.extract_live_ranges_from_cascaded_passes(
-                self.nng.get_root_subgraph(),
+            live_range.extract_live_ranges_from_schedule(
+                self.sg,
                 mem_area,
                 mem_type_set,
                 lr_graph,
-                Tensor.AllocationQuantum,
             )
 
         # Populate time-array with memory used by live ranges
@@ -1128,12 +1127,11 @@
         memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
         lr_graph = live_range.LiveRangeGraph()
         for mem_area, mem_type_set in memories_list:
-            live_range.extract_live_ranges_from_cascaded_passes(
-                self.nng.get_root_subgraph(),
+            live_range.extract_live_ranges_from_schedule(
+                self.sg,
                 mem_area,
                 mem_type_set,
                 lr_graph,
-                Tensor.AllocationQuantum,
             )
 
         # Find the relation between the sched_op and the buffering tensor
@@ -1248,12 +1246,11 @@
         memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
         lr_graph = live_range.LiveRangeGraph()
         for mem_area, mem_type_set in memories_list:
-            live_range.extract_live_ranges_from_cascaded_passes(
-                self.nng.get_root_subgraph(),
+            live_range.extract_live_ranges_from_schedule(
+                self.sg,
                 mem_area,
                 mem_type_set,
                 lr_graph,
-                Tensor.AllocationQuantum,
             )
         max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
 
@@ -1452,6 +1449,33 @@
             print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
 
 
+def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers):
+    mem_area = arch.fast_storage_mem_area
+    mem_type_set = set((MemType.Scratch, MemType.Scratch_fast))
+
+    # Collect live ranges for the full graph
+    # extract_live_ranges_from_cascaded_passes starts from the root sg,
+    # visits all subgraphs/cascaded passes and sets the correct time_index
+    # for all the tensors.
+    lr_graph = live_range.LiveRangeGraph()
+    live_range.extract_live_ranges_from_cascaded_passes(
+        nng.get_root_subgraph(),
+        mem_area,
+        mem_type_set,
+        lr_graph,
+        Tensor.AllocationQuantum,
+    )
+    # Populate time-array with memory used by live ranges
+    temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area)
+
+    # Update the snapshot for all the NPU subgraphs.
+    # The scheduler no longer needs this, but npu_performance
+    # uses this information so it must have the correct state.
+    for sg in schedulers:
+        sg.schedule.memory_snapshot = temporal_usage
+        sg.schedule.fast_storage_peak_usage = max(temporal_usage, default=0)
+
+
 def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
     """
     Creates live ranges and runs tensor allocator for the current schedule
@@ -1652,5 +1676,8 @@
             if scheduler_options.verbose_schedule:
                 scheduler.print_schedule(sg.schedule)
 
+    # Make a full live range calculation starting from the root sg
+    _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers)
+
     # Evaluate schedule
     _update_tensor_allocation(nng, arch, options)