blob: 6d3003c5f2bf103d738a7ae3b94a0729559d56de [file] [log] [blame]
Louis Verhaardd7002522021-01-20 17:23:54 +01001# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description:
18# Tensor allocator based on a hill-climb search
19import random
20from typing import List
21from typing import Set
22
Henrik G Olsson807278a2021-03-08 18:20:00 +010023from . import numeric_util
Louis Verhaardd7002522021-01-20 17:23:54 +010024from .live_range import LiveRange
25
26
27class LiveRangeInfo:
Henrik G Olsson807278a2021-03-08 18:20:00 +010028 def __init__(self, id: int, start_time: int, end_time: int, size: int, min_alignment: int):
Louis Verhaardd7002522021-01-20 17:23:54 +010029 # Index of this live range
30 self.id = id
31 # Start time (input to the allocator algorithm)
32 self.start_time = start_time
33 # End time, inclusive (input to the allocator algorithm)
34 self.end_time = end_time
35 # Size in bytes (input to the allocator algorithm)
36 self.size = size
37 # Allocated address (the main output from the allocator algorithm)
38 self.address: int = 0
39 # End address, exclusive
40 self.end_address: int = 0
41 # id of predecessor live range (predecessor's end address == this lr's address)
42 self.predecessor: int = 0
43 # Turn at which the live range was allocated
44 self.turn: int = 0
45 # Max value of size_at_time (only used in the heuristic allocation)
46 self.urgency = 0
47 self.neighbours: List["LiveRangeInfo"] = []
Henrik G Olsson807278a2021-03-08 18:20:00 +010048 self.min_alignment = min_alignment
Louis Verhaardd7002522021-01-20 17:23:54 +010049
50 def overlaps(self, addr2: int, size2: int) -> int:
51 return self.address < addr2 + size2 and addr2 < self.end_address
52
53 def is_neighbour(self, lr: "LiveRangeInfo") -> bool:
54 return self.start_time <= lr.end_time and lr.start_time <= self.end_time
55
56 def __str__(self):
57 return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(
58 self.id, self.start_time, self.end_time, self.size, self.address
59 )
60
61 def __lt__(self, other) -> bool:
62 if self.urgency != other.urgency:
63 return self.urgency > other.urgency
64 duration1 = self.end_time - self.start_time
65 duration2 = other.end_time - other.start_time
66 if duration1 != duration2:
67 return duration1 > duration2
68
69 if self.start_time != other.start_time:
70 return self.start_time < other.start_time
71
72 if self.size != other.size:
73 return self.size > other.size
74
75 return self.id < other.id
76
77
78class HillClimbAllocator:
79 """
80 Implements tensor allocator using a hill climb search.
81
82 The basic algorithm is:
83
84 Use a heuristic allocator to find an initial allocation
85 while allocation is not optimal and iterations < MAX_ITERATIONS:
86 find the "bottleneck": the live range with highest end address
87 find all live ranges that affected the allocation of the bottleneck
88 swap the order of any two affecting live ranges
89 reallocate tensors using the reordered live ranges
90 if the new allocation is better: keep it, else set allocation to previous allocation
91 """
92
93 MAX_ITERATIONS = 500
94 NOT_ALLOCATED = -1
95 # Used for live ranges allocated at address 0
96 NO_PREDECESSOR = -1
Louis Verhaarda19b4672022-02-28 15:12:57 +010097 # Special handling if best solution has not improved during this many iterations
98 MAX_ITERATIONS_STUCK = 50
Louis Verhaardd7002522021-01-20 17:23:54 +010099
100 def __init__(self, live_ranges: List[LiveRange]):
101 # Contains the live ranges
102 self.lrs: List[LiveRangeInfo] = [
Henrik G Olsson807278a2021-03-08 18:20:00 +0100103 LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment())
104 for id, lr in enumerate(live_ranges)
Louis Verhaardd7002522021-01-20 17:23:54 +0100105 ]
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100106 self.lrs_at_time: List[List[LiveRangeInfo]] = []
Louis Verhaardd7002522021-01-20 17:23:54 +0100107 # The available size (input to algorithm).
108 self.available_size: int = 0
109 # The algorithm stops once the target size has been achieved
110 self.target_size: int = 0
111 # The highest end address of the best found allocation
112 self.best_size: int = 1 << 63
113 # For each live range: max value of size_at_time (only used in the heuristic allocation)
114 self.lr_urgency = len(self.lrs) * [0]
115 nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)
116 # Contains active live ranges at each timestamp
117 self.lrs_at_time = [[] for i in range(nr_time_slots)]
118 for lr in self.lrs:
119 for t in range(lr.start_time, lr.end_time + 1):
120 self.lrs_at_time[t].append(lr)
121 # At each timestamp: accumulated size of active live ranges
122 size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
123 # The minimum possible size, assuming all live ranges can be perfectly allocated
124 self.min_required_size: int = max(size_at_time)
125 # Calculate all neighbours + the urgency of each live range
126 for lr in self.lrs:
127 lr.urgency = 0
128 lr.neighbours = []
129 neighbours = set()
130 for t in range(lr.start_time, lr.end_time + 1):
131 lr.urgency = max(size_at_time[t], lr.urgency)
132 for lr2 in self.lrs_at_time[t]:
133 if lr2 not in neighbours and lr != lr2:
134 neighbours.add(lr2)
135 lr.neighbours.append(lr2)
136
137 def allocate_lr(self, lr: LiveRangeInfo):
138 """
139 Allocates the given live range at the smallest possible address
140 """
141 address = 0
142 predecessor = HillClimbAllocator.NO_PREDECESSOR
143 fits = False
144 while not fits:
145 fits = True
146 # Find neighbours that overlap with address
147 for lr2 in lr.neighbours:
148 if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:
149 continue
150 if lr2.overlaps(address, lr.size):
151 # Overlap found increase address
152 fits = False
Henrik G Olsson807278a2021-03-08 18:20:00 +0100153 address = numeric_util.round_up(lr2.end_address, lr.min_alignment)
Louis Verhaardd7002522021-01-20 17:23:54 +0100154 predecessor = lr2.id
155 lr.address = address
156 lr.end_address = address + lr.size
157 lr.predecessor = predecessor
158
159 def allocate_indices(self, indices: List[int]):
160 """
161 Allocates the live ranges in the order indicated by the indices;
162 allocates each live range at the lowest possible address.
163 """
164 for lr in self.lrs:
165 lr.address = HillClimbAllocator.NOT_ALLOCATED
166 size = 0
167 for turn, index in enumerate(indices):
168 lr = self.lrs[index]
169 self.allocate_lr(lr)
170 lr.turn = turn
171 size = max(size, lr.end_address)
172 if size > self.best_size:
173 # This allocation is worse than the best known allocation;
174 # no need to continue
175 break
176 return size
177
178 def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):
179 """
180 Adds the given live range + predecessors to the turns vector.
181 Note: the turn_set is for quick detection of duplicates,
182 the turn_list is to get reproduceable results
183 """
184 if lr.turn not in turn_set:
185 turn_set.add(lr.turn)
186 turn_list.append(lr.turn)
187 id = lr.id
188 while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:
189 id = self.lrs[id].predecessor
190 turn = self.lrs[id].turn
191 if turn not in turn_set:
192 turn_set.add(turn)
193 turn_list.append(turn)
194
Louis Verhaarda19b4672022-02-28 15:12:57 +0100195 def attempt_bottleneck_fix(self, indices: List[int], iterations_stuck):
Louis Verhaardd7002522021-01-20 17:23:54 +0100196 """
197 Finds the "bottleneck", the live range with highest end address, and reorders the indices
198 such that a next allocation might lower the memory usage.
199
200 ---------
201 | |
202 | D |
203 | |
204 ----------------------------------
205 | B |
206 -------------------------------
207 | |
208 |A| ---
209 | | |C|
210 | | | |
211 ---------------------------------------
212
213 In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
214 The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
215 and all direct and indirect predecessors of D and its neighbours
216 (i.e. A, which is the predecessor of B, and indirect predecessor of D).
217
218 By permuting the order in which the affecting live ranges are allocated, the bottleneck might
219 be lowered. In the above example, almost any permutation would lower the bottleneck.
220 """
221 # Find the bottleneck
222 max_lr = self.lrs[0]
223 for lr in self.lrs[1:]:
224 if lr.end_address > max_lr.end_address:
225 max_lr = lr
226
227 # Find all live ranges that affected the placement of the bottleneck live range.
228 # This consists of two types of live ranges:
229 # - direct neighbours of the bottleneck live range
230 # - direct and indirect predecessors of these neighbours + bottleneck
231 # The turns at which these live ranges were allocated are put in the turns set.
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100232 turn_set: Set[int] = set()
233 turn_list: List[int] = list()
Louis Verhaardd7002522021-01-20 17:23:54 +0100234 self.add_predecessor_turns(turn_set, turn_list, max_lr)
235 for lr2 in max_lr.neighbours:
236 self.add_predecessor_turns(turn_set, turn_list, lr2)
237
238 # Non-direct neighbours that interfere with the allocation of the bottleneck are the
239 # immediate cause for gaps in the allocation, and are selected with higher probability.
240 non_nb_turn_list = []
241 for turn in turn_list:
242 lr = self.lrs[indices[turn]]
243 if not max_lr.is_neighbour(lr):
244 non_nb_turn_list.append(turn)
245 assert turn_list
246 # Pick from non-neighbour list with 30% probability
247 # (magic number based on tuning)
248 if random.randint(0, 100) < 30 and non_nb_turn_list:
249 # Pick a live range from the "non-neighbour list"
250 ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]
251 else:
252 # Pick any affecting live range.
253 ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
254
255 ix2 = turn_list[random.randint(0, len(turn_list) - 2)]
256 if ix1 == ix2:
257 ix2 = turn_list[-1]
258 # Swap indices
259 indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
Louis Verhaarda19b4672022-02-28 15:12:57 +0100260 if iterations_stuck > HillClimbAllocator.MAX_ITERATIONS_STUCK:
261 # The best allocation has not improved for a while, maybe improvement is not possible
262 # by single-swapping indices; add more neighbour live ranges and swap up to two more indices.
263 # Adding more neighbours can sometimes resolve the situation where the current bottleneck
264 # is resolved, but always results in a higher bottleneck at a nearby live range.
265 # Magic number is based on tuning
266 for turn in non_nb_turn_list:
267 for lr in self.lrs[indices[turn]].neighbours:
268 if lr.turn not in turn_set:
269 turn_set.add(lr.turn)
270 turn_list.append(lr.turn)
271 ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
272 ix2 = turn_list[random.randint(0, len(turn_list) - 1)]
273 indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
Louis Verhaardd7002522021-01-20 17:23:54 +0100274
275 def search(self, indices: List[int], iterations: int):
276 """
277 Search for a solution, using the given indices as initial solution.
278 """
279 best_indices = indices[:]
Louis Verhaarda19b4672022-02-28 15:12:57 +0100280 last_improvement_iteration = 0
281 for i in range(iterations):
Louis Verhaardd7002522021-01-20 17:23:54 +0100282 # Reorder the indices
Louis Verhaarda19b4672022-02-28 15:12:57 +0100283 self.attempt_bottleneck_fix(indices, i - last_improvement_iteration)
Louis Verhaardd7002522021-01-20 17:23:54 +0100284 # Allocate the reordered indices and check if it gave an improvement
285 new_size = self.allocate_indices(indices)
286 if new_size <= self.best_size:
287 # The new allocation produced a new best result remember it
Louis Verhaarda19b4672022-02-28 15:12:57 +0100288 if new_size < self.best_size:
289 last_improvement_iteration = i
Louis Verhaardd7002522021-01-20 17:23:54 +0100290 self.best_size = new_size
291 best_indices = indices[:]
292 self.allocated_addresses = [lr.address for lr in self.lrs]
293 if self.best_size <= self.min_required_size:
Louis Verhaarda19b4672022-02-28 15:12:57 +0100294 # Target reached; stop
Louis Verhaardd7002522021-01-20 17:23:54 +0100295 return
296 else:
297 # The new allocation produced worse result undo the change
298 indices = best_indices[:]
299
300 def allocate(self) -> List[int]:
301 """
302 Runs the allocation algorithm. Finishes when an optimal solution has been
303 found or when maximum iterations have been run.
304 The allocated addresses are placed in the output vector, in the same
305 order as the input vector.
306
307 Implementation note: the algorithm produces reproduceable results by using
308 a well-defined random number generator with well-defined default seed,
309 and using a fixed number of iterations.
310 """
311 random.seed(1)
312 # Sort indices on priority. Note: self.lrs must be left unchanged
313 indices = [lr.id for lr in sorted(self.lrs)]
314 # Allocate the initial solution
315 self.best_size = self.allocate_indices(indices)
316 self.allocated_addresses = [lr.address for lr in self.lrs]
317 if self.best_size > self.min_required_size:
318 # Try to improve the heuristic allocation
319 self.search(indices, HillClimbAllocator.MAX_ITERATIONS)
320 # else the heuristic allocation returned an optimal solution; no search needed
321 return self.allocated_addresses
322
323
324def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]:
325 """
326 Allocates live ranges using a search based allocator.
327 Returns the list of allocated addresses (one for each live range)
328 """
329 if not lrs:
330 return []
331 allocator = HillClimbAllocator(lrs)
332 return allocator.allocate()