MLBEDSW-6984: Optimize fast storage for feature maps

- Remove very long live ranges that are standing out compared to
its neighbors. This can be seen on large networks with complex
structure. If they are chosen instead of shorter live ranges,
it will be difficult for the HillClimb Allocator to find a perfect
fit in the final allocation.

Signed-off-by: Johan Alfven <johan.alfven@arm.com>
Change-Id: I6cf23adfdc06c1e93e12e9cf816453d940ff31f7
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 021bcc9..fbe2e16 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -1281,9 +1281,9 @@
                 for tens in lr.tensors:
                     competing_tens_access[tens] = 0
 
-        sz = len(competing_lrs)
-        # All lrs and their tensors have been handled if sz is zero, we may thus return
-        if sz == 0:
+        competing_lrs_sz = len(competing_lrs)
+        # All lrs and their tensors have been handled if competing_lrs_sz is zero, we may thus return
+        if competing_lrs_sz == 0:
             return
 
         # Estimate element access for all tensors that are competing for a place in fast-storage.
@@ -1307,16 +1307,52 @@
                 competing_tens_access[tens] += access.ofm_write
 
         competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
+
+        # Remove lrs that have a live range that is too long compared to others.
+        # They are causing problems for the HillClimb Allocator when it has to
+        # change the allocation indices, in order to fit all the allocations into SRAM.
+        # This problem only occur in larger networks with complex graphs.
+        #
+        # Limit the number of items for allocate_component to work with max MAX_EXHAUSTIVE_ITEMS
+        # at the time. Too many will give too long compilation time
+        #
+        # Too long is currently decided to be (based on experience, analyzing many networks):
+        # Compare lr at postion i with lr at position i + MAX_EXHAUSTIVE_ITEMS.
+        # If end time differs by at least MAX_EXHAUSTIVE_LIFE_RANGE then do not include lr at position i.
+        if competing_lrs_sz > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS:
+            # create a copy of the original list to iterate over because the original version is modified in-loop
+            competing_lrs_copy = competing_lrs.copy()
+            for i, lr in enumerate(competing_lrs_copy):
+                lr_time = lr.end_time - lr.start_time
+                if lr_time < FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE:
+                    # Skip small ranges
+                    continue
+
+                # Compare current lr with lr at position lr + MAX_EXHAUSTIVE_ITEMS
+                cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, competing_lrs_sz - 1)
+
+                # Compare end times + plus a margin by MAX_EXHAUSTIVE_LIFE_RANGE
+                if (
+                    lr.end_time
+                    > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
+                ):
+                    # Current lr live time stands out, remove it. No use adding it to the
+                    # evicted_fms list since the lr should not be included in the fast storage allocation
+                    FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
+                    competing_lrs.remove(lr)
+
         start = 0
-        start_time = competing_lrs[0].start_time
         end_time = competing_lrs[0].end_time
+        competing_lrs_sz = len(competing_lrs)
         component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
         # Build up components and then allocate each separately
         for i, lr in enumerate(competing_lrs):
-            if lr.start_time <= end_time and i - start < component_allocator.MAX_EXHAUSTIVE_LIFE_RANGE:
-                start_time = min(start_time, lr.start_time)
+            nbr_items = i - start
+            if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS):
                 end_time = max(end_time, lr.end_time)
             else:
+                # Number items reached max items or current lr's start time
+                # does not overlap with previous lr's end time
                 component_allocator.allocate_component(
                     component_allocator,
                     competing_lrs[start:i],
@@ -1328,11 +1364,10 @@
                     self.evicted_fms,
                 )
                 start = i
-                start_time = lr.start_time
                 end_time = lr.end_time
         component_allocator.allocate_component(
             component_allocator,
-            competing_lrs[start:sz],
+            competing_lrs[start:competing_lrs_sz],
             max_mem_usage,
             base_mem_usage,
             staging_limit,
@@ -1446,6 +1481,7 @@
 
 class FastStorageComponentAllocator:
     MAX_EXHAUSTIVE_LIFE_RANGE = 20
+    MAX_EXHAUSTIVE_ITEMS = 20
 
     def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
         self.base_mem_usage = base_mem_usage