blob: 321a4d02d6bd349e633e732fa78b8b10d2181bdd [file] [log] [blame]
Rickard Bolinbc6ee582022-11-04 08:24:29 +00001# SPDX-FileCopyrightText: Copyright 2021-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
Louis Verhaardd7002522021-01-20 17:23:54 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description:
18# Tensor allocator based on a hill-climb search
19import random
20from typing import List
21from typing import Set
22
Henrik G Olsson807278a2021-03-08 18:20:00 +010023from . import numeric_util
Louis Verhaardd7002522021-01-20 17:23:54 +010024from .live_range import LiveRange
25
26
27class LiveRangeInfo:
Henrik G Olsson807278a2021-03-08 18:20:00 +010028 def __init__(self, id: int, start_time: int, end_time: int, size: int, min_alignment: int):
Louis Verhaardd7002522021-01-20 17:23:54 +010029 # Index of this live range
30 self.id = id
31 # Start time (input to the allocator algorithm)
32 self.start_time = start_time
33 # End time, inclusive (input to the allocator algorithm)
34 self.end_time = end_time
35 # Size in bytes (input to the allocator algorithm)
36 self.size = size
37 # Allocated address (the main output from the allocator algorithm)
38 self.address: int = 0
39 # End address, exclusive
40 self.end_address: int = 0
41 # id of predecessor live range (predecessor's end address == this lr's address)
42 self.predecessor: int = 0
43 # Turn at which the live range was allocated
44 self.turn: int = 0
45 # Max value of size_at_time (only used in the heuristic allocation)
46 self.urgency = 0
47 self.neighbours: List["LiveRangeInfo"] = []
Henrik G Olsson807278a2021-03-08 18:20:00 +010048 self.min_alignment = min_alignment
Louis Verhaardd7002522021-01-20 17:23:54 +010049
50 def overlaps(self, addr2: int, size2: int) -> int:
51 return self.address < addr2 + size2 and addr2 < self.end_address
52
53 def is_neighbour(self, lr: "LiveRangeInfo") -> bool:
54 return self.start_time <= lr.end_time and lr.start_time <= self.end_time
55
56 def __str__(self):
57 return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(
58 self.id, self.start_time, self.end_time, self.size, self.address
59 )
60
61 def __lt__(self, other) -> bool:
62 if self.urgency != other.urgency:
63 return self.urgency > other.urgency
64 duration1 = self.end_time - self.start_time
65 duration2 = other.end_time - other.start_time
66 if duration1 != duration2:
67 return duration1 > duration2
68
69 if self.start_time != other.start_time:
70 return self.start_time < other.start_time
71
72 if self.size != other.size:
73 return self.size > other.size
74
75 return self.id < other.id
76
77
78class HillClimbAllocator:
79 """
80 Implements tensor allocator using a hill climb search.
81
82 The basic algorithm is:
83
84 Use a heuristic allocator to find an initial allocation
85 while allocation is not optimal and iterations < MAX_ITERATIONS:
86 find the "bottleneck": the live range with highest end address
87 find all live ranges that affected the allocation of the bottleneck
88 swap the order of any two affecting live ranges
89 reallocate tensors using the reordered live ranges
90 if the new allocation is better: keep it, else set allocation to previous allocation
91 """
92
Tim Hallcda4fcb2022-05-19 12:36:58 +010093 # Maximum number of iterations of the algorithm (can be override from the command line)
94 MAX_ITERATIONS = 99999
Louis Verhaardd7002522021-01-20 17:23:54 +010095 NOT_ALLOCATED = -1
96 # Used for live ranges allocated at address 0
97 NO_PREDECESSOR = -1
Louis Verhaarda19b4672022-02-28 15:12:57 +010098 # Special handling if best solution has not improved during this many iterations
99 MAX_ITERATIONS_STUCK = 50
Tim Hallcda4fcb2022-05-19 12:36:58 +0100100 # Minimum number of iterations since the last improvement (unless an optimal solution is found)
101 MIN_ITERATIONS_IMPROVE = 500
Louis Verhaardd7002522021-01-20 17:23:54 +0100102
Tim Hallcda4fcb2022-05-19 12:36:58 +0100103 def __init__(self, live_ranges: List[LiveRange], max_iterations: int, memory_limit: int):
Louis Verhaardd7002522021-01-20 17:23:54 +0100104 # Contains the live ranges
105 self.lrs: List[LiveRangeInfo] = [
Henrik G Olsson807278a2021-03-08 18:20:00 +0100106 LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment())
107 for id, lr in enumerate(live_ranges)
Louis Verhaardd7002522021-01-20 17:23:54 +0100108 ]
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100109 self.lrs_at_time: List[List[LiveRangeInfo]] = []
Louis Verhaardd7002522021-01-20 17:23:54 +0100110 # The available size (input to algorithm).
111 self.available_size: int = 0
112 # The algorithm stops once the target size has been achieved
113 self.target_size: int = 0
114 # The highest end address of the best found allocation
115 self.best_size: int = 1 << 63
116 # For each live range: max value of size_at_time (only used in the heuristic allocation)
117 self.lr_urgency = len(self.lrs) * [0]
118 nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)
119 # Contains active live ranges at each timestamp
120 self.lrs_at_time = [[] for i in range(nr_time_slots)]
121 for lr in self.lrs:
122 for t in range(lr.start_time, lr.end_time + 1):
123 self.lrs_at_time[t].append(lr)
124 # At each timestamp: accumulated size of active live ranges
125 size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
126 # The minimum possible size, assuming all live ranges can be perfectly allocated
127 self.min_required_size: int = max(size_at_time)
Tim Hallcda4fcb2022-05-19 12:36:58 +0100128 # Set the maximum number of iterations the search will iterate to find a solution
129 if max_iterations is None:
130 self.max_iterations = self.MAX_ITERATIONS
131 else:
132 self.max_iterations = max_iterations
133 # Defines a memory limit that the allocation must meet
134 self.memory_limit = memory_limit
135 if self.memory_limit < self.min_required_size:
136 print(
137 f"Warning: Memory limit = {self.memory_limit} is less than the minimum possible allocation size ="
138 f" {self.min_required_size}"
139 )
Louis Verhaardd7002522021-01-20 17:23:54 +0100140 # Calculate all neighbours + the urgency of each live range
141 for lr in self.lrs:
142 lr.urgency = 0
143 lr.neighbours = []
144 neighbours = set()
145 for t in range(lr.start_time, lr.end_time + 1):
146 lr.urgency = max(size_at_time[t], lr.urgency)
147 for lr2 in self.lrs_at_time[t]:
148 if lr2 not in neighbours and lr != lr2:
149 neighbours.add(lr2)
150 lr.neighbours.append(lr2)
151
152 def allocate_lr(self, lr: LiveRangeInfo):
153 """
154 Allocates the given live range at the smallest possible address
155 """
156 address = 0
157 predecessor = HillClimbAllocator.NO_PREDECESSOR
158 fits = False
159 while not fits:
160 fits = True
161 # Find neighbours that overlap with address
162 for lr2 in lr.neighbours:
163 if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:
164 continue
165 if lr2.overlaps(address, lr.size):
166 # Overlap found increase address
167 fits = False
Henrik G Olsson807278a2021-03-08 18:20:00 +0100168 address = numeric_util.round_up(lr2.end_address, lr.min_alignment)
Louis Verhaardd7002522021-01-20 17:23:54 +0100169 predecessor = lr2.id
170 lr.address = address
171 lr.end_address = address + lr.size
172 lr.predecessor = predecessor
173
174 def allocate_indices(self, indices: List[int]):
175 """
176 Allocates the live ranges in the order indicated by the indices;
177 allocates each live range at the lowest possible address.
178 """
179 for lr in self.lrs:
180 lr.address = HillClimbAllocator.NOT_ALLOCATED
181 size = 0
182 for turn, index in enumerate(indices):
183 lr = self.lrs[index]
184 self.allocate_lr(lr)
185 lr.turn = turn
186 size = max(size, lr.end_address)
187 if size > self.best_size:
188 # This allocation is worse than the best known allocation;
189 # no need to continue
190 break
191 return size
192
193 def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):
194 """
195 Adds the given live range + predecessors to the turns vector.
196 Note: the turn_set is for quick detection of duplicates,
197 the turn_list is to get reproduceable results
198 """
199 if lr.turn not in turn_set:
200 turn_set.add(lr.turn)
201 turn_list.append(lr.turn)
202 id = lr.id
203 while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:
204 id = self.lrs[id].predecessor
205 turn = self.lrs[id].turn
206 if turn not in turn_set:
207 turn_set.add(turn)
208 turn_list.append(turn)
209
Louis Verhaarda19b4672022-02-28 15:12:57 +0100210 def attempt_bottleneck_fix(self, indices: List[int], iterations_stuck):
Louis Verhaardd7002522021-01-20 17:23:54 +0100211 """
212 Finds the "bottleneck", the live range with highest end address, and reorders the indices
213 such that a next allocation might lower the memory usage.
214
215 ---------
216 | |
217 | D |
218 | |
219 ----------------------------------
220 | B |
221 -------------------------------
222 | |
223 |A| ---
224 | | |C|
225 | | | |
226 ---------------------------------------
227
228 In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
229 The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
230 and all direct and indirect predecessors of D and its neighbours
231 (i.e. A, which is the predecessor of B, and indirect predecessor of D).
232
233 By permuting the order in which the affecting live ranges are allocated, the bottleneck might
234 be lowered. In the above example, almost any permutation would lower the bottleneck.
235 """
236 # Find the bottleneck
237 max_lr = self.lrs[0]
238 for lr in self.lrs[1:]:
239 if lr.end_address > max_lr.end_address:
240 max_lr = lr
241
242 # Find all live ranges that affected the placement of the bottleneck live range.
243 # This consists of two types of live ranges:
244 # - direct neighbours of the bottleneck live range
245 # - direct and indirect predecessors of these neighbours + bottleneck
246 # The turns at which these live ranges were allocated are put in the turns set.
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100247 turn_set: Set[int] = set()
248 turn_list: List[int] = list()
Louis Verhaardd7002522021-01-20 17:23:54 +0100249 self.add_predecessor_turns(turn_set, turn_list, max_lr)
250 for lr2 in max_lr.neighbours:
251 self.add_predecessor_turns(turn_set, turn_list, lr2)
252
253 # Non-direct neighbours that interfere with the allocation of the bottleneck are the
254 # immediate cause for gaps in the allocation, and are selected with higher probability.
255 non_nb_turn_list = []
256 for turn in turn_list:
257 lr = self.lrs[indices[turn]]
258 if not max_lr.is_neighbour(lr):
259 non_nb_turn_list.append(turn)
260 assert turn_list
261 # Pick from non-neighbour list with 30% probability
262 # (magic number based on tuning)
263 if random.randint(0, 100) < 30 and non_nb_turn_list:
264 # Pick a live range from the "non-neighbour list"
265 ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]
266 else:
267 # Pick any affecting live range.
268 ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
269
270 ix2 = turn_list[random.randint(0, len(turn_list) - 2)]
271 if ix1 == ix2:
272 ix2 = turn_list[-1]
273 # Swap indices
274 indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
Louis Verhaarda19b4672022-02-28 15:12:57 +0100275 if iterations_stuck > HillClimbAllocator.MAX_ITERATIONS_STUCK:
276 # The best allocation has not improved for a while, maybe improvement is not possible
277 # by single-swapping indices; add more neighbour live ranges and swap up to two more indices.
278 # Adding more neighbours can sometimes resolve the situation where the current bottleneck
279 # is resolved, but always results in a higher bottleneck at a nearby live range.
280 # Magic number is based on tuning
281 for turn in non_nb_turn_list:
282 for lr in self.lrs[indices[turn]].neighbours:
283 if lr.turn not in turn_set:
284 turn_set.add(lr.turn)
285 turn_list.append(lr.turn)
286 ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
287 ix2 = turn_list[random.randint(0, len(turn_list) - 1)]
288 indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
Louis Verhaardd7002522021-01-20 17:23:54 +0100289
Tim Hallcda4fcb2022-05-19 12:36:58 +0100290 def search(self, indices: List[int]):
Louis Verhaardd7002522021-01-20 17:23:54 +0100291 """
292 Search for a solution, using the given indices as initial solution.
293 """
294 best_indices = indices[:]
Louis Verhaarda19b4672022-02-28 15:12:57 +0100295 last_improvement_iteration = 0
Tim Hallcda4fcb2022-05-19 12:36:58 +0100296 i = 0
297 while (self.best_size > self.memory_limit and i < self.max_iterations) or (
298 i - last_improvement_iteration < self.MIN_ITERATIONS_IMPROVE
299 ):
Louis Verhaardd7002522021-01-20 17:23:54 +0100300 # Reorder the indices
Louis Verhaarda19b4672022-02-28 15:12:57 +0100301 self.attempt_bottleneck_fix(indices, i - last_improvement_iteration)
Louis Verhaardd7002522021-01-20 17:23:54 +0100302 # Allocate the reordered indices and check if it gave an improvement
303 new_size = self.allocate_indices(indices)
304 if new_size <= self.best_size:
305 # The new allocation produced a new best result remember it
Louis Verhaarda19b4672022-02-28 15:12:57 +0100306 if new_size < self.best_size:
307 last_improvement_iteration = i
Louis Verhaardd7002522021-01-20 17:23:54 +0100308 self.best_size = new_size
309 best_indices = indices[:]
310 self.allocated_addresses = [lr.address for lr in self.lrs]
311 if self.best_size <= self.min_required_size:
Louis Verhaarda19b4672022-02-28 15:12:57 +0100312 # Target reached; stop
Louis Verhaardd7002522021-01-20 17:23:54 +0100313 return
314 else:
315 # The new allocation produced worse result undo the change
316 indices = best_indices[:]
Tim Hallcda4fcb2022-05-19 12:36:58 +0100317 i += 1
Louis Verhaardd7002522021-01-20 17:23:54 +0100318
319 def allocate(self) -> List[int]:
320 """
321 Runs the allocation algorithm. Finishes when an optimal solution has been
322 found or when maximum iterations have been run.
323 The allocated addresses are placed in the output vector, in the same
324 order as the input vector.
325
326 Implementation note: the algorithm produces reproduceable results by using
327 a well-defined random number generator with well-defined default seed,
328 and using a fixed number of iterations.
329 """
330 random.seed(1)
331 # Sort indices on priority. Note: self.lrs must be left unchanged
332 indices = [lr.id for lr in sorted(self.lrs)]
333 # Allocate the initial solution
334 self.best_size = self.allocate_indices(indices)
335 self.allocated_addresses = [lr.address for lr in self.lrs]
336 if self.best_size > self.min_required_size:
337 # Try to improve the heuristic allocation
Tim Hallcda4fcb2022-05-19 12:36:58 +0100338 self.search(indices)
Louis Verhaardd7002522021-01-20 17:23:54 +0100339 # else the heuristic allocation returned an optimal solution; no search needed
340 return self.allocated_addresses
341
342
Tim Hallcda4fcb2022-05-19 12:36:58 +0100343def allocate_live_ranges(lrs: List[LiveRange], max_iterations: int, memory_limit: int) -> List[int]:
Louis Verhaardd7002522021-01-20 17:23:54 +0100344 """
345 Allocates live ranges using a search based allocator.
346 Returns the list of allocated addresses (one for each live range)
347 """
348 if not lrs:
349 return []
Tim Hallcda4fcb2022-05-19 12:36:58 +0100350 allocator = HillClimbAllocator(lrs, max_iterations, memory_limit)
Louis Verhaardd7002522021-01-20 17:23:54 +0100351 return allocator.allocate()