Rickard Bolin | bc6ee58 | 2022-11-04 08:24:29 +0000 | [diff] [blame] | 1 | # SPDX-FileCopyrightText: Copyright 2021-2022 Arm Limited and/or its affiliates <open-source-office@arm.com> |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | # Description: |
| 18 | # Tensor allocator based on a hill-climb search |
| 19 | import random |
| 20 | from typing import List |
| 21 | from typing import Set |
| 22 | |
Henrik G Olsson | 807278a | 2021-03-08 18:20:00 +0100 | [diff] [blame] | 23 | from . import numeric_util |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 24 | from .live_range import LiveRange |
| 25 | |
| 26 | |
| 27 | class LiveRangeInfo: |
Henrik G Olsson | 807278a | 2021-03-08 18:20:00 +0100 | [diff] [blame] | 28 | def __init__(self, id: int, start_time: int, end_time: int, size: int, min_alignment: int): |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 29 | # Index of this live range |
| 30 | self.id = id |
| 31 | # Start time (input to the allocator algorithm) |
| 32 | self.start_time = start_time |
| 33 | # End time, inclusive (input to the allocator algorithm) |
| 34 | self.end_time = end_time |
| 35 | # Size in bytes (input to the allocator algorithm) |
| 36 | self.size = size |
| 37 | # Allocated address (the main output from the allocator algorithm) |
| 38 | self.address: int = 0 |
| 39 | # End address, exclusive |
| 40 | self.end_address: int = 0 |
| 41 | # id of predecessor live range (predecessor's end address == this lr's address) |
| 42 | self.predecessor: int = 0 |
| 43 | # Turn at which the live range was allocated |
| 44 | self.turn: int = 0 |
| 45 | # Max value of size_at_time (only used in the heuristic allocation) |
| 46 | self.urgency = 0 |
| 47 | self.neighbours: List["LiveRangeInfo"] = [] |
Henrik G Olsson | 807278a | 2021-03-08 18:20:00 +0100 | [diff] [blame] | 48 | self.min_alignment = min_alignment |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 49 | |
| 50 | def overlaps(self, addr2: int, size2: int) -> int: |
| 51 | return self.address < addr2 + size2 and addr2 < self.end_address |
| 52 | |
| 53 | def is_neighbour(self, lr: "LiveRangeInfo") -> bool: |
| 54 | return self.start_time <= lr.end_time and lr.start_time <= self.end_time |
| 55 | |
| 56 | def __str__(self): |
| 57 | return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format( |
| 58 | self.id, self.start_time, self.end_time, self.size, self.address |
| 59 | ) |
| 60 | |
| 61 | def __lt__(self, other) -> bool: |
| 62 | if self.urgency != other.urgency: |
| 63 | return self.urgency > other.urgency |
| 64 | duration1 = self.end_time - self.start_time |
| 65 | duration2 = other.end_time - other.start_time |
| 66 | if duration1 != duration2: |
| 67 | return duration1 > duration2 |
| 68 | |
| 69 | if self.start_time != other.start_time: |
| 70 | return self.start_time < other.start_time |
| 71 | |
| 72 | if self.size != other.size: |
| 73 | return self.size > other.size |
| 74 | |
| 75 | return self.id < other.id |
| 76 | |
| 77 | |
| 78 | class HillClimbAllocator: |
| 79 | """ |
| 80 | Implements tensor allocator using a hill climb search. |
| 81 | |
| 82 | The basic algorithm is: |
| 83 | |
| 84 | Use a heuristic allocator to find an initial allocation |
| 85 | while allocation is not optimal and iterations < MAX_ITERATIONS: |
| 86 | find the "bottleneck": the live range with highest end address |
| 87 | find all live ranges that affected the allocation of the bottleneck |
| 88 | swap the order of any two affecting live ranges |
| 89 | reallocate tensors using the reordered live ranges |
| 90 | if the new allocation is better: keep it, else set allocation to previous allocation |
| 91 | """ |
| 92 | |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 93 | # Maximum number of iterations of the algorithm (can be override from the command line) |
| 94 | MAX_ITERATIONS = 99999 |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 95 | NOT_ALLOCATED = -1 |
| 96 | # Used for live ranges allocated at address 0 |
| 97 | NO_PREDECESSOR = -1 |
Louis Verhaard | a19b467 | 2022-02-28 15:12:57 +0100 | [diff] [blame] | 98 | # Special handling if best solution has not improved during this many iterations |
| 99 | MAX_ITERATIONS_STUCK = 50 |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 100 | # Minimum number of iterations since the last improvement (unless an optimal solution is found) |
| 101 | MIN_ITERATIONS_IMPROVE = 500 |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 102 | |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 103 | def __init__(self, live_ranges: List[LiveRange], max_iterations: int, memory_limit: int): |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 104 | # Contains the live ranges |
| 105 | self.lrs: List[LiveRangeInfo] = [ |
Henrik G Olsson | 807278a | 2021-03-08 18:20:00 +0100 | [diff] [blame] | 106 | LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment()) |
| 107 | for id, lr in enumerate(live_ranges) |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 108 | ] |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 109 | self.lrs_at_time: List[List[LiveRangeInfo]] = [] |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 110 | # The available size (input to algorithm). |
| 111 | self.available_size: int = 0 |
| 112 | # The algorithm stops once the target size has been achieved |
| 113 | self.target_size: int = 0 |
| 114 | # The highest end address of the best found allocation |
| 115 | self.best_size: int = 1 << 63 |
| 116 | # For each live range: max value of size_at_time (only used in the heuristic allocation) |
| 117 | self.lr_urgency = len(self.lrs) * [0] |
| 118 | nr_time_slots = 1 + max(lr.end_time for lr in self.lrs) |
| 119 | # Contains active live ranges at each timestamp |
| 120 | self.lrs_at_time = [[] for i in range(nr_time_slots)] |
| 121 | for lr in self.lrs: |
| 122 | for t in range(lr.start_time, lr.end_time + 1): |
| 123 | self.lrs_at_time[t].append(lr) |
| 124 | # At each timestamp: accumulated size of active live ranges |
| 125 | size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)] |
| 126 | # The minimum possible size, assuming all live ranges can be perfectly allocated |
| 127 | self.min_required_size: int = max(size_at_time) |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 128 | # Set the maximum number of iterations the search will iterate to find a solution |
| 129 | if max_iterations is None: |
| 130 | self.max_iterations = self.MAX_ITERATIONS |
| 131 | else: |
| 132 | self.max_iterations = max_iterations |
| 133 | # Defines a memory limit that the allocation must meet |
| 134 | self.memory_limit = memory_limit |
| 135 | if self.memory_limit < self.min_required_size: |
| 136 | print( |
| 137 | f"Warning: Memory limit = {self.memory_limit} is less than the minimum possible allocation size =" |
| 138 | f" {self.min_required_size}" |
| 139 | ) |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 140 | # Calculate all neighbours + the urgency of each live range |
| 141 | for lr in self.lrs: |
| 142 | lr.urgency = 0 |
| 143 | lr.neighbours = [] |
| 144 | neighbours = set() |
| 145 | for t in range(lr.start_time, lr.end_time + 1): |
| 146 | lr.urgency = max(size_at_time[t], lr.urgency) |
| 147 | for lr2 in self.lrs_at_time[t]: |
| 148 | if lr2 not in neighbours and lr != lr2: |
| 149 | neighbours.add(lr2) |
| 150 | lr.neighbours.append(lr2) |
| 151 | |
| 152 | def allocate_lr(self, lr: LiveRangeInfo): |
| 153 | """ |
| 154 | Allocates the given live range at the smallest possible address |
| 155 | """ |
| 156 | address = 0 |
| 157 | predecessor = HillClimbAllocator.NO_PREDECESSOR |
| 158 | fits = False |
| 159 | while not fits: |
| 160 | fits = True |
| 161 | # Find neighbours that overlap with address |
| 162 | for lr2 in lr.neighbours: |
| 163 | if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address: |
| 164 | continue |
| 165 | if lr2.overlaps(address, lr.size): |
| 166 | # Overlap found increase address |
| 167 | fits = False |
Henrik G Olsson | 807278a | 2021-03-08 18:20:00 +0100 | [diff] [blame] | 168 | address = numeric_util.round_up(lr2.end_address, lr.min_alignment) |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 169 | predecessor = lr2.id |
| 170 | lr.address = address |
| 171 | lr.end_address = address + lr.size |
| 172 | lr.predecessor = predecessor |
| 173 | |
| 174 | def allocate_indices(self, indices: List[int]): |
| 175 | """ |
| 176 | Allocates the live ranges in the order indicated by the indices; |
| 177 | allocates each live range at the lowest possible address. |
| 178 | """ |
| 179 | for lr in self.lrs: |
| 180 | lr.address = HillClimbAllocator.NOT_ALLOCATED |
| 181 | size = 0 |
| 182 | for turn, index in enumerate(indices): |
| 183 | lr = self.lrs[index] |
| 184 | self.allocate_lr(lr) |
| 185 | lr.turn = turn |
| 186 | size = max(size, lr.end_address) |
| 187 | if size > self.best_size: |
| 188 | # This allocation is worse than the best known allocation; |
| 189 | # no need to continue |
| 190 | break |
| 191 | return size |
| 192 | |
| 193 | def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo): |
| 194 | """ |
| 195 | Adds the given live range + predecessors to the turns vector. |
| 196 | Note: the turn_set is for quick detection of duplicates, |
| 197 | the turn_list is to get reproduceable results |
| 198 | """ |
| 199 | if lr.turn not in turn_set: |
| 200 | turn_set.add(lr.turn) |
| 201 | turn_list.append(lr.turn) |
| 202 | id = lr.id |
| 203 | while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR: |
| 204 | id = self.lrs[id].predecessor |
| 205 | turn = self.lrs[id].turn |
| 206 | if turn not in turn_set: |
| 207 | turn_set.add(turn) |
| 208 | turn_list.append(turn) |
| 209 | |
Louis Verhaard | a19b467 | 2022-02-28 15:12:57 +0100 | [diff] [blame] | 210 | def attempt_bottleneck_fix(self, indices: List[int], iterations_stuck): |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 211 | """ |
| 212 | Finds the "bottleneck", the live range with highest end address, and reorders the indices |
| 213 | such that a next allocation might lower the memory usage. |
| 214 | |
| 215 | --------- |
| 216 | | | |
| 217 | | D | |
| 218 | | | |
| 219 | ---------------------------------- |
| 220 | | B | |
| 221 | ------------------------------- |
| 222 | | | |
| 223 | |A| --- |
| 224 | | | |C| |
| 225 | | | | | |
| 226 | --------------------------------------- |
| 227 | |
| 228 | In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck. |
| 229 | The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C), |
| 230 | and all direct and indirect predecessors of D and its neighbours |
| 231 | (i.e. A, which is the predecessor of B, and indirect predecessor of D). |
| 232 | |
| 233 | By permuting the order in which the affecting live ranges are allocated, the bottleneck might |
| 234 | be lowered. In the above example, almost any permutation would lower the bottleneck. |
| 235 | """ |
| 236 | # Find the bottleneck |
| 237 | max_lr = self.lrs[0] |
| 238 | for lr in self.lrs[1:]: |
| 239 | if lr.end_address > max_lr.end_address: |
| 240 | max_lr = lr |
| 241 | |
| 242 | # Find all live ranges that affected the placement of the bottleneck live range. |
| 243 | # This consists of two types of live ranges: |
| 244 | # - direct neighbours of the bottleneck live range |
| 245 | # - direct and indirect predecessors of these neighbours + bottleneck |
| 246 | # The turns at which these live ranges were allocated are put in the turns set. |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 247 | turn_set: Set[int] = set() |
| 248 | turn_list: List[int] = list() |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 249 | self.add_predecessor_turns(turn_set, turn_list, max_lr) |
| 250 | for lr2 in max_lr.neighbours: |
| 251 | self.add_predecessor_turns(turn_set, turn_list, lr2) |
| 252 | |
| 253 | # Non-direct neighbours that interfere with the allocation of the bottleneck are the |
| 254 | # immediate cause for gaps in the allocation, and are selected with higher probability. |
| 255 | non_nb_turn_list = [] |
| 256 | for turn in turn_list: |
| 257 | lr = self.lrs[indices[turn]] |
| 258 | if not max_lr.is_neighbour(lr): |
| 259 | non_nb_turn_list.append(turn) |
| 260 | assert turn_list |
| 261 | # Pick from non-neighbour list with 30% probability |
| 262 | # (magic number based on tuning) |
| 263 | if random.randint(0, 100) < 30 and non_nb_turn_list: |
| 264 | # Pick a live range from the "non-neighbour list" |
| 265 | ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)] |
| 266 | else: |
| 267 | # Pick any affecting live range. |
| 268 | ix1 = turn_list[random.randint(0, len(turn_list) - 1)] |
| 269 | |
| 270 | ix2 = turn_list[random.randint(0, len(turn_list) - 2)] |
| 271 | if ix1 == ix2: |
| 272 | ix2 = turn_list[-1] |
| 273 | # Swap indices |
| 274 | indices[ix1], indices[ix2] = indices[ix2], indices[ix1] |
Louis Verhaard | a19b467 | 2022-02-28 15:12:57 +0100 | [diff] [blame] | 275 | if iterations_stuck > HillClimbAllocator.MAX_ITERATIONS_STUCK: |
| 276 | # The best allocation has not improved for a while, maybe improvement is not possible |
| 277 | # by single-swapping indices; add more neighbour live ranges and swap up to two more indices. |
| 278 | # Adding more neighbours can sometimes resolve the situation where the current bottleneck |
| 279 | # is resolved, but always results in a higher bottleneck at a nearby live range. |
| 280 | # Magic number is based on tuning |
| 281 | for turn in non_nb_turn_list: |
| 282 | for lr in self.lrs[indices[turn]].neighbours: |
| 283 | if lr.turn not in turn_set: |
| 284 | turn_set.add(lr.turn) |
| 285 | turn_list.append(lr.turn) |
| 286 | ix1 = turn_list[random.randint(0, len(turn_list) - 1)] |
| 287 | ix2 = turn_list[random.randint(0, len(turn_list) - 1)] |
| 288 | indices[ix1], indices[ix2] = indices[ix2], indices[ix1] |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 289 | |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 290 | def search(self, indices: List[int]): |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 291 | """ |
| 292 | Search for a solution, using the given indices as initial solution. |
| 293 | """ |
| 294 | best_indices = indices[:] |
Louis Verhaard | a19b467 | 2022-02-28 15:12:57 +0100 | [diff] [blame] | 295 | last_improvement_iteration = 0 |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 296 | i = 0 |
| 297 | while (self.best_size > self.memory_limit and i < self.max_iterations) or ( |
| 298 | i - last_improvement_iteration < self.MIN_ITERATIONS_IMPROVE |
| 299 | ): |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 300 | # Reorder the indices |
Louis Verhaard | a19b467 | 2022-02-28 15:12:57 +0100 | [diff] [blame] | 301 | self.attempt_bottleneck_fix(indices, i - last_improvement_iteration) |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 302 | # Allocate the reordered indices and check if it gave an improvement |
| 303 | new_size = self.allocate_indices(indices) |
| 304 | if new_size <= self.best_size: |
| 305 | # The new allocation produced a new best result remember it |
Louis Verhaard | a19b467 | 2022-02-28 15:12:57 +0100 | [diff] [blame] | 306 | if new_size < self.best_size: |
| 307 | last_improvement_iteration = i |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 308 | self.best_size = new_size |
| 309 | best_indices = indices[:] |
| 310 | self.allocated_addresses = [lr.address for lr in self.lrs] |
| 311 | if self.best_size <= self.min_required_size: |
Louis Verhaard | a19b467 | 2022-02-28 15:12:57 +0100 | [diff] [blame] | 312 | # Target reached; stop |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 313 | return |
| 314 | else: |
| 315 | # The new allocation produced worse result undo the change |
| 316 | indices = best_indices[:] |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 317 | i += 1 |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 318 | |
| 319 | def allocate(self) -> List[int]: |
| 320 | """ |
| 321 | Runs the allocation algorithm. Finishes when an optimal solution has been |
| 322 | found or when maximum iterations have been run. |
| 323 | The allocated addresses are placed in the output vector, in the same |
| 324 | order as the input vector. |
| 325 | |
| 326 | Implementation note: the algorithm produces reproduceable results by using |
| 327 | a well-defined random number generator with well-defined default seed, |
| 328 | and using a fixed number of iterations. |
| 329 | """ |
| 330 | random.seed(1) |
| 331 | # Sort indices on priority. Note: self.lrs must be left unchanged |
| 332 | indices = [lr.id for lr in sorted(self.lrs)] |
| 333 | # Allocate the initial solution |
| 334 | self.best_size = self.allocate_indices(indices) |
| 335 | self.allocated_addresses = [lr.address for lr in self.lrs] |
| 336 | if self.best_size > self.min_required_size: |
| 337 | # Try to improve the heuristic allocation |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 338 | self.search(indices) |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 339 | # else the heuristic allocation returned an optimal solution; no search needed |
| 340 | return self.allocated_addresses |
| 341 | |
| 342 | |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 343 | def allocate_live_ranges(lrs: List[LiveRange], max_iterations: int, memory_limit: int) -> List[int]: |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 344 | """ |
| 345 | Allocates live ranges using a search based allocator. |
| 346 | Returns the list of allocated addresses (one for each live range) |
| 347 | """ |
| 348 | if not lrs: |
| 349 | return [] |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 350 | allocator = HillClimbAllocator(lrs, max_iterations, memory_limit) |
Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame] | 351 | return allocator.allocate() |