Performance improvement in tensor allocation

- Tensor allocation verification was O(N^2); it is now closer to O(N) (see the sketch below)
- Removed a sort in the HillClimb allocator
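
The verification now buckets live ranges per timestamp and only compares
ranges that are live at the same time, instead of comparing every range
against every other range. A minimal sketch of the idea, using a simplified
stand-in Range type rather than Vela's LiveRange/LiveRangeGraph API:

    # Bucketed overlap check: pairwise comparisons are limited to ranges
    # that share at least one timestamp.
    from collections import namedtuple

    # start/end are inclusive timestamps; address/size describe the allocation
    Range = namedtuple("Range", ["start", "end", "address", "size"])

    def addresses_overlap(a, b):
        return a.address < b.address + b.size and b.address < a.address + a.size

    def verify(ranges):
        nr_time_slots = 1 + max(r.end for r in ranges)
        # live_at[t] holds the ranges that are live at timestamp t
        live_at = [[] for _ in range(nr_time_slots)]
        for r in ranges:
            for t in range(r.start, r.end + 1):
                live_at[t].append(r)
        # Only ranges in the same bucket can conflict
        for t in range(nr_time_slots):
            for ix, a in enumerate(live_at[t]):
                for b in live_at[t][ix + 1:]:
                    if addresses_overlap(a, b):
                        raise ValueError(f"overlap between {a} and {b} at time {t}")

    # Non-overlapping addresses pass; moving the second range to address 50
    # would make verify() raise.
    verify([Range(0, 2, 0, 100), Range(2, 4, 200, 100)])

Pairs that are live together for several timestamps are still compared once
per shared timestamp, so the worst case remains quadratic, but for typical
graphs the per-timestamp buckets are small and the check behaves close to
linear.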

Change-Id: I286a269881490c485cc2b0eeab3b1ecffa8f3df0
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/ethosu/vela/greedy_allocation.py b/ethosu/vela/greedy_allocation.py
index 51b0780..c68a507 100644
--- a/ethosu/vela/greedy_allocation.py
+++ b/ethosu/vela/greedy_allocation.py
@@ -60,7 +60,7 @@
 
     def allocate_live_ranges(self, verbose_allocation, alignment):
         lrs = set()
-        for lr in self.live_ranges.ranges.values():
+        for lr in self.live_ranges.lrs:
             lrs.add((lr.start_time, -lr.end_time, lr))
 
         lrs = sorted(lrs)
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index 0cc89e2..de001e5 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -16,6 +16,8 @@
 # Description:
 # Build a live range graph for tensors in one or more subgraphs. Used for tensor allocation as well as in the scheduler.
 # Can work with either a pass packed subgraph or a scheduled subgraph.
+from typing import List
+
 from .nn_graph import PassPlacement
 from .operation import Op
 from .tensor import MemType
@@ -99,6 +101,7 @@
 
 class LiveRangeGraph:
     def __init__(self):
+        self.lrs: List[LiveRange] = []  # List of all created ranges
         self.ranges = {}  # tens -> range
         self.ignore_tensors = set()
         self.processed_subgraphs = set()
@@ -114,6 +117,7 @@
         # No live range found for the tensor, create a new one
         rng = LiveRange(tens, alignment)
         self.ranges[tens] = rng
+        self.lrs.append(rng)
         return rng
 
     def fuse_ranges(self, in_tens, out_tens):
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 65801b8..733be59 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -86,7 +86,6 @@
 from .shared_buffer_allocation import find_suitable_block_configs
 from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
 from .shared_buffer_allocation import SharedBufferAllocation
-from ethosu.vela.errors import VelaError
 
 
 class RegisterMachine:
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index b2ea7de..0ad30e5 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -17,6 +17,7 @@
 # Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
 # worked out from the allowable overlaps that are calculated by the live range analysis.
 import math
+from typing import List
 
 import numpy as np
 
@@ -25,6 +26,7 @@
 from . import numeric_util
 from .errors import AllocationError
 from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
+from .live_range import LiveRange
 from .live_range import LiveRangeGraph
 from .nn_graph import TensorAllocator
 from .tensor import MemArea
@@ -65,14 +67,10 @@
 
 def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
     # Allocates using the hill climb allocator
-    lr_set = {(lr.start_time, lr.end_time, lr) for lr in live_ranges.ranges.values()}
-    lr_list = [lr for _, _, lr in lr_set]
-    lr_list.sort()
-
-    addresses = hillclimb_allocation.allocate_live_ranges(lr_list)
+    addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs)
     # The result is a list containing the allocated addresses
     total_sz = 0
-    for lr, address in zip(lr_list, addresses):
+    for lr, address in zip(live_ranges.lrs, addresses):
         total_sz = max(total_sz, address + lr.size)
         lr.set_address(address)
     verify_allocation(live_ranges, alloc_granularity)
@@ -80,7 +78,7 @@
 
 
 def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
-    for lr in live_ranges.ranges.values():
+    for lr in live_ranges.lrs:
         for tens in lr.tensors:
             if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
                 # This is a CPU tensor, verify alignment
@@ -89,12 +87,16 @@
 
 
 def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
-    lrs = list(live_ranges.ranges.values())
-    for n in lrs:
-        verify_alignment(live_ranges, alignment)
-
-        for m in lrs:
-            if n != m and n.overlaps_ranges(m):
+    verify_alignment(live_ranges, alignment)
+    nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs)
+    # Contains active live ranges at each timestamp
+    lrs_at_time = [[] for i in range(nr_time_slots)]
+    for lr in live_ranges.lrs:
+        for t in range(lr.start_time, lr.end_time + 1):
+            lrs_at_time[t].append(lr)
+    for t in range(nr_time_slots):
+        for ix, n in enumerate(lrs_at_time[t]):
+            for m in lrs_at_time[t][ix + 1 :]:
                 overlap, tens_n, tens_m = n.overlaps_address(m)
                 if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
                     raise AllocationError(
@@ -142,11 +144,9 @@
         print()
 
 
-def calculate_allocation_efficiency(lrs):
-    lr_set = set(lrs.ranges.values())
-
-    size_at_time = [0] * (1 + max(lr.end_time for lr in lr_set))
-    for lr in lr_set:
+def calculate_allocation_efficiency(lrs: List[LiveRange]):
+    size_at_time = [0] * (1 + max(lr.end_time for lr in lrs))
+    for lr in lrs:
         for t in range(lr.start_time, lr.end_time + 1):
             size_at_time[t] += lr.size
 
@@ -210,7 +210,7 @@
         print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)
 
         if mem_area == MemArea.Sram:
-            sg.min_mem_usage = calculate_allocation_efficiency(lrs)
+            sg.min_mem_usage = calculate_allocation_efficiency(lrs.lrs)
             # Mark Sram usage for all subgraphs
             for sg_ in nng.subgraphs:
                 mark_sram_used_for_cascaded_passes(sg_, lrs)
diff --git a/ethosu/vela/test/test_tensor_allocation.py b/ethosu/vela/test/test_tensor_allocation.py
new file mode 100644
index 0000000..f437bbb
--- /dev/null
+++ b/ethosu/vela/test/test_tensor_allocation.py
@@ -0,0 +1,46 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Description:
+# Contains unit tests for tensor allocation
+import pytest
+
+from ethosu.vela.data_type import DataType
+from ethosu.vela.errors import AllocationError
+from ethosu.vela.live_range import LiveRangeGraph
+from ethosu.vela.tensor import Tensor
+from ethosu.vela.tensor_allocation import verify_allocation
+
+
+def test_verify_allocation():
+    # Create live range graph with 2 live ranges with overlapping start/end time
+    lr_graph = LiveRangeGraph()
+    t1 = Tensor([1, 100, 10, 10], DataType.int8, "t1")
+    lr1 = lr_graph.get_or_create_range(t1)
+    lr1.mark_usage(4)
+    lr1.mark_usage(8)
+    t2 = Tensor([1, 10, 20, 10], DataType.int8, "t2")
+    lr2 = lr_graph.get_or_create_range(t2)
+    # Set overlapping addresses, should lead to verification failure
+    lr1.set_address(16)
+    lr2.set_address(32)
+    lr2.mark_usage(7)
+    lr2.mark_usage(12)
+    with pytest.raises(AllocationError):
+        verify_allocation(lr_graph, 16)
+    # Set non-overlapping addresses, verification should now succeed
+    lr2.set_address(None)
+    lr2.set_address(160000)
+    verify_allocation(lr_graph, 16)