MLBEDSW-3808: Ported search allocator to python

- Straight port of the C++ implementation to python.
- Renamed the allocator from "Search" to "HillClimb"

Change-Id: I50797d541f326d0264daf79bf7866aef32350a60
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/ethosu/vela/hillclimb_allocation.py b/ethosu/vela/hillclimb_allocation.py
new file mode 100644
index 0000000..de53ab8
--- /dev/null
+++ b/ethosu/vela/hillclimb_allocation.py
@@ -0,0 +1,310 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Tensor allocator based on a hill-climb search
+import random
+from typing import List
+from typing import Set
+
+from .live_range import LiveRange
+
+
+class LiveRangeInfo:
+    def __init__(self, id: int, start_time: int, end_time: int, size: int):
+        # Index of this live range
+        self.id = id
+        # Start time (input to the allocator algorithm)
+        self.start_time = start_time
+        # End time, inclusive (input to the allocator algorithm)
+        self.end_time = end_time
+        # Size in bytes (input to the allocator algorithm)
+        self.size = size
+        # Allocated address (the main output from the allocator algorithm)
+        self.address: int = 0
+        # End address, exclusive
+        self.end_address: int = 0
+        # id of predecessor live range (predecessor's end address == this lr's address)
+        self.predecessor: int = 0
+        # Turn at which the live range was allocated
+        self.turn: int = 0
+        # Max value of size_at_time (only used in the heuristic allocation)
+        self.urgency = 0
+        self.neighbours: List["LiveRangeInfo"] = []
+
+    def overlaps(self, addr2: int, size2: int) -> int:
+        return self.address < addr2 + size2 and addr2 < self.end_address
+
+    def is_neighbour(self, lr: "LiveRangeInfo") -> bool:
+        return self.start_time <= lr.end_time and lr.start_time <= self.end_time
+
+    def __str__(self):
+        return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(
+            self.id, self.start_time, self.end_time, self.size, self.address
+        )
+
+    def __lt__(self, other) -> bool:
+        if self.urgency != other.urgency:
+            return self.urgency > other.urgency
+        duration1 = self.end_time - self.start_time
+        duration2 = other.end_time - other.start_time
+        if duration1 != duration2:
+            return duration1 > duration2
+
+        if self.start_time != other.start_time:
+            return self.start_time < other.start_time
+
+        if self.size != other.size:
+            return self.size > other.size
+
+        return self.id < other.id
+
+
+class HillClimbAllocator:
+    """
+    Implements tensor allocator using a hill climb search.
+
+    The basic algorithm is:
+
+    Use a heuristic allocator to find an initial allocation
+    while allocation is not optimal and iterations < MAX_ITERATIONS:
+        find the "bottleneck": the live range with highest end address
+        find all live ranges that affected the allocation of the bottleneck
+        swap the order of any two affecting live ranges
+        reallocate tensors using the reordered live ranges
+        if the new allocation is better: keep it, else set allocation to previous allocation
+    """
+
+    MAX_ITERATIONS = 500
+    NOT_ALLOCATED = -1
+    # Used for live ranges allocated at address 0
+    NO_PREDECESSOR = -1
+
+    def __init__(self, live_ranges: List[LiveRange]):
+        # Contains the live ranges
+        self.lrs: List[LiveRangeInfo] = [
+            LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size) for id, lr in enumerate(live_ranges)
+        ]
+        self.lrs_at_time = []
+        # The available size (input to algorithm).
+        self.available_size: int = 0
+        # The algorithm stops once the target size has been achieved
+        self.target_size: int = 0
+        # The highest end address of the best found allocation
+        self.best_size: int = 1 << 63
+        # For each live range: max value of size_at_time (only used in the heuristic allocation)
+        self.lr_urgency = len(self.lrs) * [0]
+        nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)
+        # Contains active live ranges at each timestamp
+        self.lrs_at_time = [[] for i in range(nr_time_slots)]
+        for lr in self.lrs:
+            for t in range(lr.start_time, lr.end_time + 1):
+                self.lrs_at_time[t].append(lr)
+        # At each timestamp: accumulated size of active live ranges
+        size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
+        # The minimum possible size, assuming all live ranges can be perfectly allocated
+        self.min_required_size: int = max(size_at_time)
+        # Calculate all neighbours + the urgency of each live range
+        for lr in self.lrs:
+            lr.urgency = 0
+            lr.neighbours = []
+            neighbours = set()
+            for t in range(lr.start_time, lr.end_time + 1):
+                lr.urgency = max(size_at_time[t], lr.urgency)
+                for lr2 in self.lrs_at_time[t]:
+                    if lr2 not in neighbours and lr != lr2:
+                        neighbours.add(lr2)
+                        lr.neighbours.append(lr2)
+
+    def allocate_lr(self, lr: LiveRangeInfo):
+        """
+        Allocates the given live range at the smallest possible address
+        """
+        address = 0
+        predecessor = HillClimbAllocator.NO_PREDECESSOR
+        fits = False
+        while not fits:
+            fits = True
+            # Find neighbours that overlap with address
+            for lr2 in lr.neighbours:
+                if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:
+                    continue
+                if lr2.overlaps(address, lr.size):
+                    # Overlap found increase address
+                    fits = False
+                    address = lr2.end_address
+                    predecessor = lr2.id
+        lr.address = address
+        lr.end_address = address + lr.size
+        lr.predecessor = predecessor
+
+    def allocate_indices(self, indices: List[int]):
+        """
+        Allocates the live ranges in the order indicated by the indices;
+        allocates each live range at the lowest possible address.
+        """
+        for lr in self.lrs:
+            lr.address = HillClimbAllocator.NOT_ALLOCATED
+        size = 0
+        for turn, index in enumerate(indices):
+            lr = self.lrs[index]
+            self.allocate_lr(lr)
+            lr.turn = turn
+            size = max(size, lr.end_address)
+            if size > self.best_size:
+                # This allocation is worse than the best known allocation;
+                # no need to continue
+                break
+        return size
+
+    def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):
+        """
+        Adds the given live range + predecessors to the turns vector.
+        Note: the turn_set is for quick detection of duplicates,
+        the turn_list is to get reproduceable results
+        """
+        if lr.turn not in turn_set:
+            turn_set.add(lr.turn)
+            turn_list.append(lr.turn)
+        id = lr.id
+        while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:
+            id = self.lrs[id].predecessor
+            turn = self.lrs[id].turn
+            if turn not in turn_set:
+                turn_set.add(turn)
+                turn_list.append(turn)
+
+    def attempt_bottleneck_fix(self, indices: List[int]):
+        """
+        Finds the "bottleneck", the live range with highest end address, and reorders the indices
+        such that a next allocation might lower the memory usage.
+
+                                  ---------
+                                  |       |
+                                  |   D   |
+                                  |       |
+         ----------------------------------
+         |           B                 |
+         -------------------------------
+         | |
+         |A|                      ---
+         | |                      |C|
+         | |                      | |
+         ---------------------------------------
+
+        In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
+        The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
+        and all direct and indirect predecessors of D and its neighbours
+        (i.e. A, which is the predecessor of B, and indirect predecessor of D).
+
+        By permuting the order in which the affecting live ranges are allocated, the bottleneck might
+        be lowered. In the above example, almost any permutation would lower the bottleneck.
+        """
+        # Find the bottleneck
+        max_lr = self.lrs[0]
+        for lr in self.lrs[1:]:
+            if lr.end_address > max_lr.end_address:
+                max_lr = lr
+
+        # Find all live ranges that affected the placement of the bottleneck live range.
+        # This consists of two types of live ranges:
+        # - direct neighbours of the bottleneck live range
+        # - direct and indirect predecessors of these neighbours + bottleneck
+        # The turns at which these live ranges were allocated are put in the turns set.
+        turn_set = set()
+        turn_list = list()
+        self.add_predecessor_turns(turn_set, turn_list, max_lr)
+        for lr2 in max_lr.neighbours:
+            self.add_predecessor_turns(turn_set, turn_list, lr2)
+
+        # Non-direct neighbours that interfere with the allocation of the bottleneck are the
+        # immediate cause for gaps in the allocation, and are selected with higher probability.
+        non_nb_turn_list = []
+        for turn in turn_list:
+            lr = self.lrs[indices[turn]]
+            if not max_lr.is_neighbour(lr):
+                non_nb_turn_list.append(turn)
+        assert turn_list
+        # Pick from non-neighbour list with 30% probability
+        # (magic number based on tuning)
+        if random.randint(0, 100) < 30 and non_nb_turn_list:
+            # Pick a live range from the "non-neighbour list"
+            ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]
+        else:
+            # Pick any affecting live range.
+            ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
+
+        ix2 = turn_list[random.randint(0, len(turn_list) - 2)]
+        if ix1 == ix2:
+            ix2 = turn_list[-1]
+        # Swap indices
+        indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
+
+    def search(self, indices: List[int], iterations: int):
+        """
+        Search for a solution, using the given indices as initial solution.
+        """
+        best_indices = indices[:]
+        for _ in range(iterations):
+            # Reorder the indices
+            self.attempt_bottleneck_fix(indices)
+            # Allocate the reordered indices and check if it gave an improvement
+            new_size = self.allocate_indices(indices)
+            if new_size <= self.best_size:
+                # The new allocation produced a new best result remember it
+                self.best_size = new_size
+                best_indices = indices[:]
+                self.allocated_addresses = [lr.address for lr in self.lrs]
+                if self.best_size <= self.min_required_size:
+                    # Target reached stop
+                    return
+            else:
+                # The new allocation produced worse result undo the change
+                indices = best_indices[:]
+
+    def allocate(self) -> List[int]:
+        """
+        Runs the allocation algorithm. Finishes when an optimal solution has been
+        found or when maximum iterations have been run.
+        The allocated addresses are placed in the output vector, in the same
+        order as the input vector.
+
+        Implementation note: the algorithm produces reproduceable results by using
+        a well-defined random number generator with well-defined default seed,
+        and using a fixed number of iterations.
+        """
+        random.seed(1)
+        # Sort indices on priority. Note: self.lrs must be left unchanged
+        indices = [lr.id for lr in sorted(self.lrs)]
+        # Allocate the initial solution
+        self.best_size = self.allocate_indices(indices)
+        self.allocated_addresses = [lr.address for lr in self.lrs]
+        if self.best_size > self.min_required_size:
+            # Try to improve the heuristic allocation
+            self.search(indices, HillClimbAllocator.MAX_ITERATIONS)
+        # else the heuristic allocation returned an optimal solution; no search needed
+        return self.allocated_addresses
+
+
+def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]:
+    """
+    Allocates live ranges using a search based allocator.
+    Returns the list of allocated addresses (one for each live range)
+    """
+    if not lrs:
+        return []
+    allocator = HillClimbAllocator(lrs)
+    return allocator.allocate()