Louis Verhaard | d700252 | 2021-01-20 17:23:54 +0100 | [diff] [blame^] | 1 | # Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. |
| 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | # Description: |
| 18 | # Tensor allocator based on a hill-climb search |
| 19 | import random |
| 20 | from typing import List |
| 21 | from typing import Set |
| 22 | |
| 23 | from .live_range import LiveRange |
| 24 | |
| 25 | |
| 26 | class LiveRangeInfo: |
| 27 | def __init__(self, id: int, start_time: int, end_time: int, size: int): |
| 28 | # Index of this live range |
| 29 | self.id = id |
| 30 | # Start time (input to the allocator algorithm) |
| 31 | self.start_time = start_time |
| 32 | # End time, inclusive (input to the allocator algorithm) |
| 33 | self.end_time = end_time |
| 34 | # Size in bytes (input to the allocator algorithm) |
| 35 | self.size = size |
| 36 | # Allocated address (the main output from the allocator algorithm) |
| 37 | self.address: int = 0 |
| 38 | # End address, exclusive |
| 39 | self.end_address: int = 0 |
| 40 | # id of predecessor live range (predecessor's end address == this lr's address) |
| 41 | self.predecessor: int = 0 |
| 42 | # Turn at which the live range was allocated |
| 43 | self.turn: int = 0 |
| 44 | # Max value of size_at_time (only used in the heuristic allocation) |
| 45 | self.urgency = 0 |
| 46 | self.neighbours: List["LiveRangeInfo"] = [] |
| 47 | |
| 48 | def overlaps(self, addr2: int, size2: int) -> int: |
| 49 | return self.address < addr2 + size2 and addr2 < self.end_address |
| 50 | |
| 51 | def is_neighbour(self, lr: "LiveRangeInfo") -> bool: |
| 52 | return self.start_time <= lr.end_time and lr.start_time <= self.end_time |
| 53 | |
| 54 | def __str__(self): |
| 55 | return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format( |
| 56 | self.id, self.start_time, self.end_time, self.size, self.address |
| 57 | ) |
| 58 | |
| 59 | def __lt__(self, other) -> bool: |
| 60 | if self.urgency != other.urgency: |
| 61 | return self.urgency > other.urgency |
| 62 | duration1 = self.end_time - self.start_time |
| 63 | duration2 = other.end_time - other.start_time |
| 64 | if duration1 != duration2: |
| 65 | return duration1 > duration2 |
| 66 | |
| 67 | if self.start_time != other.start_time: |
| 68 | return self.start_time < other.start_time |
| 69 | |
| 70 | if self.size != other.size: |
| 71 | return self.size > other.size |
| 72 | |
| 73 | return self.id < other.id |
| 74 | |
| 75 | |
| 76 | class HillClimbAllocator: |
| 77 | """ |
| 78 | Implements tensor allocator using a hill climb search. |
| 79 | |
| 80 | The basic algorithm is: |
| 81 | |
| 82 | Use a heuristic allocator to find an initial allocation |
| 83 | while allocation is not optimal and iterations < MAX_ITERATIONS: |
| 84 | find the "bottleneck": the live range with highest end address |
| 85 | find all live ranges that affected the allocation of the bottleneck |
| 86 | swap the order of any two affecting live ranges |
| 87 | reallocate tensors using the reordered live ranges |
| 88 | if the new allocation is better: keep it, else set allocation to previous allocation |
| 89 | """ |
| 90 | |
| 91 | MAX_ITERATIONS = 500 |
| 92 | NOT_ALLOCATED = -1 |
| 93 | # Used for live ranges allocated at address 0 |
| 94 | NO_PREDECESSOR = -1 |
| 95 | |
| 96 | def __init__(self, live_ranges: List[LiveRange]): |
| 97 | # Contains the live ranges |
| 98 | self.lrs: List[LiveRangeInfo] = [ |
| 99 | LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size) for id, lr in enumerate(live_ranges) |
| 100 | ] |
| 101 | self.lrs_at_time = [] |
| 102 | # The available size (input to algorithm). |
| 103 | self.available_size: int = 0 |
| 104 | # The algorithm stops once the target size has been achieved |
| 105 | self.target_size: int = 0 |
| 106 | # The highest end address of the best found allocation |
| 107 | self.best_size: int = 1 << 63 |
| 108 | # For each live range: max value of size_at_time (only used in the heuristic allocation) |
| 109 | self.lr_urgency = len(self.lrs) * [0] |
| 110 | nr_time_slots = 1 + max(lr.end_time for lr in self.lrs) |
| 111 | # Contains active live ranges at each timestamp |
| 112 | self.lrs_at_time = [[] for i in range(nr_time_slots)] |
| 113 | for lr in self.lrs: |
| 114 | for t in range(lr.start_time, lr.end_time + 1): |
| 115 | self.lrs_at_time[t].append(lr) |
| 116 | # At each timestamp: accumulated size of active live ranges |
| 117 | size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)] |
| 118 | # The minimum possible size, assuming all live ranges can be perfectly allocated |
| 119 | self.min_required_size: int = max(size_at_time) |
| 120 | # Calculate all neighbours + the urgency of each live range |
| 121 | for lr in self.lrs: |
| 122 | lr.urgency = 0 |
| 123 | lr.neighbours = [] |
| 124 | neighbours = set() |
| 125 | for t in range(lr.start_time, lr.end_time + 1): |
| 126 | lr.urgency = max(size_at_time[t], lr.urgency) |
| 127 | for lr2 in self.lrs_at_time[t]: |
| 128 | if lr2 not in neighbours and lr != lr2: |
| 129 | neighbours.add(lr2) |
| 130 | lr.neighbours.append(lr2) |
| 131 | |
| 132 | def allocate_lr(self, lr: LiveRangeInfo): |
| 133 | """ |
| 134 | Allocates the given live range at the smallest possible address |
| 135 | """ |
| 136 | address = 0 |
| 137 | predecessor = HillClimbAllocator.NO_PREDECESSOR |
| 138 | fits = False |
| 139 | while not fits: |
| 140 | fits = True |
| 141 | # Find neighbours that overlap with address |
| 142 | for lr2 in lr.neighbours: |
| 143 | if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address: |
| 144 | continue |
| 145 | if lr2.overlaps(address, lr.size): |
| 146 | # Overlap found increase address |
| 147 | fits = False |
| 148 | address = lr2.end_address |
| 149 | predecessor = lr2.id |
| 150 | lr.address = address |
| 151 | lr.end_address = address + lr.size |
| 152 | lr.predecessor = predecessor |
| 153 | |
| 154 | def allocate_indices(self, indices: List[int]): |
| 155 | """ |
| 156 | Allocates the live ranges in the order indicated by the indices; |
| 157 | allocates each live range at the lowest possible address. |
| 158 | """ |
| 159 | for lr in self.lrs: |
| 160 | lr.address = HillClimbAllocator.NOT_ALLOCATED |
| 161 | size = 0 |
| 162 | for turn, index in enumerate(indices): |
| 163 | lr = self.lrs[index] |
| 164 | self.allocate_lr(lr) |
| 165 | lr.turn = turn |
| 166 | size = max(size, lr.end_address) |
| 167 | if size > self.best_size: |
| 168 | # This allocation is worse than the best known allocation; |
| 169 | # no need to continue |
| 170 | break |
| 171 | return size |
| 172 | |
| 173 | def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo): |
| 174 | """ |
| 175 | Adds the given live range + predecessors to the turns vector. |
| 176 | Note: the turn_set is for quick detection of duplicates, |
| 177 | the turn_list is to get reproduceable results |
| 178 | """ |
| 179 | if lr.turn not in turn_set: |
| 180 | turn_set.add(lr.turn) |
| 181 | turn_list.append(lr.turn) |
| 182 | id = lr.id |
| 183 | while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR: |
| 184 | id = self.lrs[id].predecessor |
| 185 | turn = self.lrs[id].turn |
| 186 | if turn not in turn_set: |
| 187 | turn_set.add(turn) |
| 188 | turn_list.append(turn) |
| 189 | |
| 190 | def attempt_bottleneck_fix(self, indices: List[int]): |
| 191 | """ |
| 192 | Finds the "bottleneck", the live range with highest end address, and reorders the indices |
| 193 | such that a next allocation might lower the memory usage. |
| 194 | |
| 195 | --------- |
| 196 | | | |
| 197 | | D | |
| 198 | | | |
| 199 | ---------------------------------- |
| 200 | | B | |
| 201 | ------------------------------- |
| 202 | | | |
| 203 | |A| --- |
| 204 | | | |C| |
| 205 | | | | | |
| 206 | --------------------------------------- |
| 207 | |
| 208 | In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck. |
| 209 | The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C), |
| 210 | and all direct and indirect predecessors of D and its neighbours |
| 211 | (i.e. A, which is the predecessor of B, and indirect predecessor of D). |
| 212 | |
| 213 | By permuting the order in which the affecting live ranges are allocated, the bottleneck might |
| 214 | be lowered. In the above example, almost any permutation would lower the bottleneck. |
| 215 | """ |
| 216 | # Find the bottleneck |
| 217 | max_lr = self.lrs[0] |
| 218 | for lr in self.lrs[1:]: |
| 219 | if lr.end_address > max_lr.end_address: |
| 220 | max_lr = lr |
| 221 | |
| 222 | # Find all live ranges that affected the placement of the bottleneck live range. |
| 223 | # This consists of two types of live ranges: |
| 224 | # - direct neighbours of the bottleneck live range |
| 225 | # - direct and indirect predecessors of these neighbours + bottleneck |
| 226 | # The turns at which these live ranges were allocated are put in the turns set. |
| 227 | turn_set = set() |
| 228 | turn_list = list() |
| 229 | self.add_predecessor_turns(turn_set, turn_list, max_lr) |
| 230 | for lr2 in max_lr.neighbours: |
| 231 | self.add_predecessor_turns(turn_set, turn_list, lr2) |
| 232 | |
| 233 | # Non-direct neighbours that interfere with the allocation of the bottleneck are the |
| 234 | # immediate cause for gaps in the allocation, and are selected with higher probability. |
| 235 | non_nb_turn_list = [] |
| 236 | for turn in turn_list: |
| 237 | lr = self.lrs[indices[turn]] |
| 238 | if not max_lr.is_neighbour(lr): |
| 239 | non_nb_turn_list.append(turn) |
| 240 | assert turn_list |
| 241 | # Pick from non-neighbour list with 30% probability |
| 242 | # (magic number based on tuning) |
| 243 | if random.randint(0, 100) < 30 and non_nb_turn_list: |
| 244 | # Pick a live range from the "non-neighbour list" |
| 245 | ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)] |
| 246 | else: |
| 247 | # Pick any affecting live range. |
| 248 | ix1 = turn_list[random.randint(0, len(turn_list) - 1)] |
| 249 | |
| 250 | ix2 = turn_list[random.randint(0, len(turn_list) - 2)] |
| 251 | if ix1 == ix2: |
| 252 | ix2 = turn_list[-1] |
| 253 | # Swap indices |
| 254 | indices[ix1], indices[ix2] = indices[ix2], indices[ix1] |
| 255 | |
| 256 | def search(self, indices: List[int], iterations: int): |
| 257 | """ |
| 258 | Search for a solution, using the given indices as initial solution. |
| 259 | """ |
| 260 | best_indices = indices[:] |
| 261 | for _ in range(iterations): |
| 262 | # Reorder the indices |
| 263 | self.attempt_bottleneck_fix(indices) |
| 264 | # Allocate the reordered indices and check if it gave an improvement |
| 265 | new_size = self.allocate_indices(indices) |
| 266 | if new_size <= self.best_size: |
| 267 | # The new allocation produced a new best result remember it |
| 268 | self.best_size = new_size |
| 269 | best_indices = indices[:] |
| 270 | self.allocated_addresses = [lr.address for lr in self.lrs] |
| 271 | if self.best_size <= self.min_required_size: |
| 272 | # Target reached stop |
| 273 | return |
| 274 | else: |
| 275 | # The new allocation produced worse result undo the change |
| 276 | indices = best_indices[:] |
| 277 | |
| 278 | def allocate(self) -> List[int]: |
| 279 | """ |
| 280 | Runs the allocation algorithm. Finishes when an optimal solution has been |
| 281 | found or when maximum iterations have been run. |
| 282 | The allocated addresses are placed in the output vector, in the same |
| 283 | order as the input vector. |
| 284 | |
| 285 | Implementation note: the algorithm produces reproduceable results by using |
| 286 | a well-defined random number generator with well-defined default seed, |
| 287 | and using a fixed number of iterations. |
| 288 | """ |
| 289 | random.seed(1) |
| 290 | # Sort indices on priority. Note: self.lrs must be left unchanged |
| 291 | indices = [lr.id for lr in sorted(self.lrs)] |
| 292 | # Allocate the initial solution |
| 293 | self.best_size = self.allocate_indices(indices) |
| 294 | self.allocated_addresses = [lr.address for lr in self.lrs] |
| 295 | if self.best_size > self.min_required_size: |
| 296 | # Try to improve the heuristic allocation |
| 297 | self.search(indices, HillClimbAllocator.MAX_ITERATIONS) |
| 298 | # else the heuristic allocation returned an optimal solution; no search needed |
| 299 | return self.allocated_addresses |
| 300 | |
| 301 | |
| 302 | def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]: |
| 303 | """ |
| 304 | Allocates live ranges using a search based allocator. |
| 305 | Returns the list of allocated addresses (one for each live range) |
| 306 | """ |
| 307 | if not lrs: |
| 308 | return [] |
| 309 | allocator = HillClimbAllocator(lrs) |
| 310 | return allocator.allocate() |