Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 1 | # Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. |
| 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | # Description: |
| 18 | # Tensor allocator based on a hill-climb search |
| 19 | import random |
| 20 | from typing import List |
| 21 | from typing import Set |
| 22 | |
Henrik G Olsson | 807278a | 2021-03-08 18:20:00 +0100 | [diff] [blame] | 23 | from . import numeric_util |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 24 | from .live_range import LiveRange |
| 25 | |
| 26 | |
| 27 | class LiveRangeInfo: |
Henrik G Olsson | 807278a | 2021-03-08 18:20:00 +0100 | [diff] [blame] | 28 | def __init__(self, id: int, start_time: int, end_time: int, size: int, min_alignment: int): |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 29 | # Index of this live range |
| 30 | self.id = id |
| 31 | # Start time (input to the allocator algorithm) |
| 32 | self.start_time = start_time |
| 33 | # End time, inclusive (input to the allocator algorithm) |
| 34 | self.end_time = end_time |
| 35 | # Size in bytes (input to the allocator algorithm) |
| 36 | self.size = size |
| 37 | # Allocated address (the main output from the allocator algorithm) |
| 38 | self.address: int = 0 |
| 39 | # End address, exclusive |
| 40 | self.end_address: int = 0 |
| 41 | # id of predecessor live range (predecessor's end address == this lr's address) |
| 42 | self.predecessor: int = 0 |
| 43 | # Turn at which the live range was allocated |
| 44 | self.turn: int = 0 |
| 45 | # Max value of size_at_time (only used in the heuristic allocation) |
| 46 | self.urgency = 0 |
| 47 | self.neighbours: List["LiveRangeInfo"] = [] |
Henrik G Olsson | 807278a | 2021-03-08 18:20:00 +0100 | [diff] [blame] | 48 | self.min_alignment = min_alignment |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 49 | |
| 50 | def overlaps(self, addr2: int, size2: int) -> int: |
| 51 | return self.address < addr2 + size2 and addr2 < self.end_address |
| 52 | |
| 53 | def is_neighbour(self, lr: "LiveRangeInfo") -> bool: |
| 54 | return self.start_time <= lr.end_time and lr.start_time <= self.end_time |
| 55 | |
| 56 | def __str__(self): |
| 57 | return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format( |
| 58 | self.id, self.start_time, self.end_time, self.size, self.address |
| 59 | ) |
| 60 | |
| 61 | def __lt__(self, other) -> bool: |
| 62 | if self.urgency != other.urgency: |
| 63 | return self.urgency > other.urgency |
| 64 | duration1 = self.end_time - self.start_time |
| 65 | duration2 = other.end_time - other.start_time |
| 66 | if duration1 != duration2: |
| 67 | return duration1 > duration2 |
| 68 | |
| 69 | if self.start_time != other.start_time: |
| 70 | return self.start_time < other.start_time |
| 71 | |
| 72 | if self.size != other.size: |
| 73 | return self.size > other.size |
| 74 | |
| 75 | return self.id < other.id |
| 76 | |
| 77 | |
| 78 | class HillClimbAllocator: |
| 79 | """ |
| 80 | Implements tensor allocator using a hill climb search. |
| 81 | |
| 82 | The basic algorithm is: |
| 83 | |
| 84 | Use a heuristic allocator to find an initial allocation |
| 85 | while allocation is not optimal and iterations < MAX_ITERATIONS: |
| 86 | find the "bottleneck": the live range with highest end address |
| 87 | find all live ranges that affected the allocation of the bottleneck |
| 88 | swap the order of any two affecting live ranges |
| 89 | reallocate tensors using the reordered live ranges |
| 90 | if the new allocation is better: keep it, else set allocation to previous allocation |
| 91 | """ |
| 92 | |
| 93 | MAX_ITERATIONS = 500 |
| 94 | NOT_ALLOCATED = -1 |
| 95 | # Used for live ranges allocated at address 0 |
| 96 | NO_PREDECESSOR = -1 |
| 97 | |
| 98 | def __init__(self, live_ranges: List[LiveRange]): |
| 99 | # Contains the live ranges |
| 100 | self.lrs: List[LiveRangeInfo] = [ |
Henrik G Olsson | 807278a | 2021-03-08 18:20:00 +0100 | [diff] [blame] | 101 | LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment()) |
| 102 | for id, lr in enumerate(live_ranges) |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 103 | ] |
| 104 | self.lrs_at_time = [] |
| 105 | # The available size (input to algorithm). |
| 106 | self.available_size: int = 0 |
| 107 | # The algorithm stops once the target size has been achieved |
| 108 | self.target_size: int = 0 |
| 109 | # The highest end address of the best found allocation |
| 110 | self.best_size: int = 1 << 63 |
| 111 | # For each live range: max value of size_at_time (only used in the heuristic allocation) |
| 112 | self.lr_urgency = len(self.lrs) * [0] |
| 113 | nr_time_slots = 1 + max(lr.end_time for lr in self.lrs) |
| 114 | # Contains active live ranges at each timestamp |
| 115 | self.lrs_at_time = [[] for i in range(nr_time_slots)] |
| 116 | for lr in self.lrs: |
| 117 | for t in range(lr.start_time, lr.end_time + 1): |
| 118 | self.lrs_at_time[t].append(lr) |
| 119 | # At each timestamp: accumulated size of active live ranges |
| 120 | size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)] |
| 121 | # The minimum possible size, assuming all live ranges can be perfectly allocated |
| 122 | self.min_required_size: int = max(size_at_time) |
| 123 | # Calculate all neighbours + the urgency of each live range |
| 124 | for lr in self.lrs: |
| 125 | lr.urgency = 0 |
| 126 | lr.neighbours = [] |
| 127 | neighbours = set() |
| 128 | for t in range(lr.start_time, lr.end_time + 1): |
| 129 | lr.urgency = max(size_at_time[t], lr.urgency) |
| 130 | for lr2 in self.lrs_at_time[t]: |
| 131 | if lr2 not in neighbours and lr != lr2: |
| 132 | neighbours.add(lr2) |
| 133 | lr.neighbours.append(lr2) |
| 134 | |
| 135 | def allocate_lr(self, lr: LiveRangeInfo): |
| 136 | """ |
| 137 | Allocates the given live range at the smallest possible address |
| 138 | """ |
| 139 | address = 0 |
| 140 | predecessor = HillClimbAllocator.NO_PREDECESSOR |
| 141 | fits = False |
| 142 | while not fits: |
| 143 | fits = True |
| 144 | # Find neighbours that overlap with address |
| 145 | for lr2 in lr.neighbours: |
| 146 | if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address: |
| 147 | continue |
| 148 | if lr2.overlaps(address, lr.size): |
| 149 | # Overlap found increase address |
| 150 | fits = False |
Henrik G Olsson | 807278a | 2021-03-08 18:20:00 +0100 | [diff] [blame] | 151 | address = numeric_util.round_up(lr2.end_address, lr.min_alignment) |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 152 | predecessor = lr2.id |
| 153 | lr.address = address |
| 154 | lr.end_address = address + lr.size |
| 155 | lr.predecessor = predecessor |
| 156 | |
| 157 | def allocate_indices(self, indices: List[int]): |
| 158 | """ |
| 159 | Allocates the live ranges in the order indicated by the indices; |
| 160 | allocates each live range at the lowest possible address. |
| 161 | """ |
| 162 | for lr in self.lrs: |
| 163 | lr.address = HillClimbAllocator.NOT_ALLOCATED |
| 164 | size = 0 |
| 165 | for turn, index in enumerate(indices): |
| 166 | lr = self.lrs[index] |
| 167 | self.allocate_lr(lr) |
| 168 | lr.turn = turn |
| 169 | size = max(size, lr.end_address) |
| 170 | if size > self.best_size: |
| 171 | # This allocation is worse than the best known allocation; |
| 172 | # no need to continue |
| 173 | break |
| 174 | return size |
| 175 | |
| 176 | def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo): |
| 177 | """ |
| 178 | Adds the given live range + predecessors to the turns vector. |
| 179 | Note: the turn_set is for quick detection of duplicates, |
| 180 | the turn_list is to get reproduceable results |
| 181 | """ |
| 182 | if lr.turn not in turn_set: |
| 183 | turn_set.add(lr.turn) |
| 184 | turn_list.append(lr.turn) |
| 185 | id = lr.id |
| 186 | while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR: |
| 187 | id = self.lrs[id].predecessor |
| 188 | turn = self.lrs[id].turn |
| 189 | if turn not in turn_set: |
| 190 | turn_set.add(turn) |
| 191 | turn_list.append(turn) |
| 192 | |
| 193 | def attempt_bottleneck_fix(self, indices: List[int]): |
| 194 | """ |
| 195 | Finds the "bottleneck", the live range with highest end address, and reorders the indices |
| 196 | such that a next allocation might lower the memory usage. |
| 197 | |
| 198 | --------- |
| 199 | | | |
| 200 | | D | |
| 201 | | | |
| 202 | ---------------------------------- |
| 203 | | B | |
| 204 | ------------------------------- |
| 205 | | | |
| 206 | |A| --- |
| 207 | | | |C| |
| 208 | | | | | |
| 209 | --------------------------------------- |
| 210 | |
| 211 | In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck. |
| 212 | The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C), |
| 213 | and all direct and indirect predecessors of D and its neighbours |
| 214 | (i.e. A, which is the predecessor of B, and indirect predecessor of D). |
| 215 | |
| 216 | By permuting the order in which the affecting live ranges are allocated, the bottleneck might |
| 217 | be lowered. In the above example, almost any permutation would lower the bottleneck. |
| 218 | """ |
| 219 | # Find the bottleneck |
| 220 | max_lr = self.lrs[0] |
| 221 | for lr in self.lrs[1:]: |
| 222 | if lr.end_address > max_lr.end_address: |
| 223 | max_lr = lr |
| 224 | |
| 225 | # Find all live ranges that affected the placement of the bottleneck live range. |
| 226 | # This consists of two types of live ranges: |
| 227 | # - direct neighbours of the bottleneck live range |
| 228 | # - direct and indirect predecessors of these neighbours + bottleneck |
| 229 | # The turns at which these live ranges were allocated are put in the turns set. |
| 230 | turn_set = set() |
| 231 | turn_list = list() |
| 232 | self.add_predecessor_turns(turn_set, turn_list, max_lr) |
| 233 | for lr2 in max_lr.neighbours: |
| 234 | self.add_predecessor_turns(turn_set, turn_list, lr2) |
| 235 | |
| 236 | # Non-direct neighbours that interfere with the allocation of the bottleneck are the |
| 237 | # immediate cause for gaps in the allocation, and are selected with higher probability. |
| 238 | non_nb_turn_list = [] |
| 239 | for turn in turn_list: |
| 240 | lr = self.lrs[indices[turn]] |
| 241 | if not max_lr.is_neighbour(lr): |
| 242 | non_nb_turn_list.append(turn) |
| 243 | assert turn_list |
| 244 | # Pick from non-neighbour list with 30% probability |
| 245 | # (magic number based on tuning) |
| 246 | if random.randint(0, 100) < 30 and non_nb_turn_list: |
| 247 | # Pick a live range from the "non-neighbour list" |
| 248 | ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)] |
| 249 | else: |
| 250 | # Pick any affecting live range. |
| 251 | ix1 = turn_list[random.randint(0, len(turn_list) - 1)] |
| 252 | |
| 253 | ix2 = turn_list[random.randint(0, len(turn_list) - 2)] |
| 254 | if ix1 == ix2: |
| 255 | ix2 = turn_list[-1] |
| 256 | # Swap indices |
| 257 | indices[ix1], indices[ix2] = indices[ix2], indices[ix1] |
| 258 | |
| 259 | def search(self, indices: List[int], iterations: int): |
| 260 | """ |
| 261 | Search for a solution, using the given indices as initial solution. |
| 262 | """ |
| 263 | best_indices = indices[:] |
| 264 | for _ in range(iterations): |
| 265 | # Reorder the indices |
| 266 | self.attempt_bottleneck_fix(indices) |
| 267 | # Allocate the reordered indices and check if it gave an improvement |
| 268 | new_size = self.allocate_indices(indices) |
| 269 | if new_size <= self.best_size: |
| 270 | # The new allocation produced a new best result remember it |
| 271 | self.best_size = new_size |
| 272 | best_indices = indices[:] |
| 273 | self.allocated_addresses = [lr.address for lr in self.lrs] |
| 274 | if self.best_size <= self.min_required_size: |
| 275 | # Target reached stop |
| 276 | return |
| 277 | else: |
| 278 | # The new allocation produced worse result undo the change |
| 279 | indices = best_indices[:] |
| 280 | |
| 281 | def allocate(self) -> List[int]: |
| 282 | """ |
| 283 | Runs the allocation algorithm. Finishes when an optimal solution has been |
| 284 | found or when maximum iterations have been run. |
| 285 | The allocated addresses are placed in the output vector, in the same |
| 286 | order as the input vector. |
| 287 | |
| 288 | Implementation note: the algorithm produces reproduceable results by using |
| 289 | a well-defined random number generator with well-defined default seed, |
| 290 | and using a fixed number of iterations. |
| 291 | """ |
| 292 | random.seed(1) |
| 293 | # Sort indices on priority. Note: self.lrs must be left unchanged |
| 294 | indices = [lr.id for lr in sorted(self.lrs)] |
| 295 | # Allocate the initial solution |
| 296 | self.best_size = self.allocate_indices(indices) |
| 297 | self.allocated_addresses = [lr.address for lr in self.lrs] |
| 298 | if self.best_size > self.min_required_size: |
| 299 | # Try to improve the heuristic allocation |
| 300 | self.search(indices, HillClimbAllocator.MAX_ITERATIONS) |
| 301 | # else the heuristic allocation returned an optimal solution; no search needed |
| 302 | return self.allocated_addresses |
| 303 | |
| 304 | |
| 305 | def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]: |
| 306 | """ |
| 307 | Allocates live ranges using a search based allocator. |
| 308 | Returns the list of allocated addresses (one for each live range) |
| 309 | """ |
| 310 | if not lrs: |
| 311 | return [] |
| 312 | allocator = HillClimbAllocator(lrs) |
| 313 | return allocator.allocate() |