blob: de53ab83913b2e3f2cbc274f9bf3fdb87ee66a4e [file] [log] [blame]
Louis Verhaardd7002522021-01-20 17:23:54 +01001# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description:
18# Tensor allocator based on a hill-climb search
19import random
20from typing import List
21from typing import Set
22
23from .live_range import LiveRange
24
25
26class LiveRangeInfo:
27 def __init__(self, id: int, start_time: int, end_time: int, size: int):
28 # Index of this live range
29 self.id = id
30 # Start time (input to the allocator algorithm)
31 self.start_time = start_time
32 # End time, inclusive (input to the allocator algorithm)
33 self.end_time = end_time
34 # Size in bytes (input to the allocator algorithm)
35 self.size = size
36 # Allocated address (the main output from the allocator algorithm)
37 self.address: int = 0
38 # End address, exclusive
39 self.end_address: int = 0
40 # id of predecessor live range (predecessor's end address == this lr's address)
41 self.predecessor: int = 0
42 # Turn at which the live range was allocated
43 self.turn: int = 0
44 # Max value of size_at_time (only used in the heuristic allocation)
45 self.urgency = 0
46 self.neighbours: List["LiveRangeInfo"] = []
47
48 def overlaps(self, addr2: int, size2: int) -> int:
49 return self.address < addr2 + size2 and addr2 < self.end_address
50
51 def is_neighbour(self, lr: "LiveRangeInfo") -> bool:
52 return self.start_time <= lr.end_time and lr.start_time <= self.end_time
53
54 def __str__(self):
55 return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(
56 self.id, self.start_time, self.end_time, self.size, self.address
57 )
58
59 def __lt__(self, other) -> bool:
60 if self.urgency != other.urgency:
61 return self.urgency > other.urgency
62 duration1 = self.end_time - self.start_time
63 duration2 = other.end_time - other.start_time
64 if duration1 != duration2:
65 return duration1 > duration2
66
67 if self.start_time != other.start_time:
68 return self.start_time < other.start_time
69
70 if self.size != other.size:
71 return self.size > other.size
72
73 return self.id < other.id
74
75
76class HillClimbAllocator:
77 """
78 Implements tensor allocator using a hill climb search.
79
80 The basic algorithm is:
81
82 Use a heuristic allocator to find an initial allocation
83 while allocation is not optimal and iterations < MAX_ITERATIONS:
84 find the "bottleneck": the live range with highest end address
85 find all live ranges that affected the allocation of the bottleneck
86 swap the order of any two affecting live ranges
87 reallocate tensors using the reordered live ranges
88 if the new allocation is better: keep it, else set allocation to previous allocation
89 """
90
91 MAX_ITERATIONS = 500
92 NOT_ALLOCATED = -1
93 # Used for live ranges allocated at address 0
94 NO_PREDECESSOR = -1
95
96 def __init__(self, live_ranges: List[LiveRange]):
97 # Contains the live ranges
98 self.lrs: List[LiveRangeInfo] = [
99 LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size) for id, lr in enumerate(live_ranges)
100 ]
101 self.lrs_at_time = []
102 # The available size (input to algorithm).
103 self.available_size: int = 0
104 # The algorithm stops once the target size has been achieved
105 self.target_size: int = 0
106 # The highest end address of the best found allocation
107 self.best_size: int = 1 << 63
108 # For each live range: max value of size_at_time (only used in the heuristic allocation)
109 self.lr_urgency = len(self.lrs) * [0]
110 nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)
111 # Contains active live ranges at each timestamp
112 self.lrs_at_time = [[] for i in range(nr_time_slots)]
113 for lr in self.lrs:
114 for t in range(lr.start_time, lr.end_time + 1):
115 self.lrs_at_time[t].append(lr)
116 # At each timestamp: accumulated size of active live ranges
117 size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
118 # The minimum possible size, assuming all live ranges can be perfectly allocated
119 self.min_required_size: int = max(size_at_time)
120 # Calculate all neighbours + the urgency of each live range
121 for lr in self.lrs:
122 lr.urgency = 0
123 lr.neighbours = []
124 neighbours = set()
125 for t in range(lr.start_time, lr.end_time + 1):
126 lr.urgency = max(size_at_time[t], lr.urgency)
127 for lr2 in self.lrs_at_time[t]:
128 if lr2 not in neighbours and lr != lr2:
129 neighbours.add(lr2)
130 lr.neighbours.append(lr2)
131
132 def allocate_lr(self, lr: LiveRangeInfo):
133 """
134 Allocates the given live range at the smallest possible address
135 """
136 address = 0
137 predecessor = HillClimbAllocator.NO_PREDECESSOR
138 fits = False
139 while not fits:
140 fits = True
141 # Find neighbours that overlap with address
142 for lr2 in lr.neighbours:
143 if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:
144 continue
145 if lr2.overlaps(address, lr.size):
146 # Overlap found increase address
147 fits = False
148 address = lr2.end_address
149 predecessor = lr2.id
150 lr.address = address
151 lr.end_address = address + lr.size
152 lr.predecessor = predecessor
153
154 def allocate_indices(self, indices: List[int]):
155 """
156 Allocates the live ranges in the order indicated by the indices;
157 allocates each live range at the lowest possible address.
158 """
159 for lr in self.lrs:
160 lr.address = HillClimbAllocator.NOT_ALLOCATED
161 size = 0
162 for turn, index in enumerate(indices):
163 lr = self.lrs[index]
164 self.allocate_lr(lr)
165 lr.turn = turn
166 size = max(size, lr.end_address)
167 if size > self.best_size:
168 # This allocation is worse than the best known allocation;
169 # no need to continue
170 break
171 return size
172
173 def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):
174 """
175 Adds the given live range + predecessors to the turns vector.
176 Note: the turn_set is for quick detection of duplicates,
177 the turn_list is to get reproduceable results
178 """
179 if lr.turn not in turn_set:
180 turn_set.add(lr.turn)
181 turn_list.append(lr.turn)
182 id = lr.id
183 while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:
184 id = self.lrs[id].predecessor
185 turn = self.lrs[id].turn
186 if turn not in turn_set:
187 turn_set.add(turn)
188 turn_list.append(turn)
189
190 def attempt_bottleneck_fix(self, indices: List[int]):
191 """
192 Finds the "bottleneck", the live range with highest end address, and reorders the indices
193 such that a next allocation might lower the memory usage.
194
195 ---------
196 | |
197 | D |
198 | |
199 ----------------------------------
200 | B |
201 -------------------------------
202 | |
203 |A| ---
204 | | |C|
205 | | | |
206 ---------------------------------------
207
208 In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
209 The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
210 and all direct and indirect predecessors of D and its neighbours
211 (i.e. A, which is the predecessor of B, and indirect predecessor of D).
212
213 By permuting the order in which the affecting live ranges are allocated, the bottleneck might
214 be lowered. In the above example, almost any permutation would lower the bottleneck.
215 """
216 # Find the bottleneck
217 max_lr = self.lrs[0]
218 for lr in self.lrs[1:]:
219 if lr.end_address > max_lr.end_address:
220 max_lr = lr
221
222 # Find all live ranges that affected the placement of the bottleneck live range.
223 # This consists of two types of live ranges:
224 # - direct neighbours of the bottleneck live range
225 # - direct and indirect predecessors of these neighbours + bottleneck
226 # The turns at which these live ranges were allocated are put in the turns set.
227 turn_set = set()
228 turn_list = list()
229 self.add_predecessor_turns(turn_set, turn_list, max_lr)
230 for lr2 in max_lr.neighbours:
231 self.add_predecessor_turns(turn_set, turn_list, lr2)
232
233 # Non-direct neighbours that interfere with the allocation of the bottleneck are the
234 # immediate cause for gaps in the allocation, and are selected with higher probability.
235 non_nb_turn_list = []
236 for turn in turn_list:
237 lr = self.lrs[indices[turn]]
238 if not max_lr.is_neighbour(lr):
239 non_nb_turn_list.append(turn)
240 assert turn_list
241 # Pick from non-neighbour list with 30% probability
242 # (magic number based on tuning)
243 if random.randint(0, 100) < 30 and non_nb_turn_list:
244 # Pick a live range from the "non-neighbour list"
245 ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]
246 else:
247 # Pick any affecting live range.
248 ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
249
250 ix2 = turn_list[random.randint(0, len(turn_list) - 2)]
251 if ix1 == ix2:
252 ix2 = turn_list[-1]
253 # Swap indices
254 indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
255
256 def search(self, indices: List[int], iterations: int):
257 """
258 Search for a solution, using the given indices as initial solution.
259 """
260 best_indices = indices[:]
261 for _ in range(iterations):
262 # Reorder the indices
263 self.attempt_bottleneck_fix(indices)
264 # Allocate the reordered indices and check if it gave an improvement
265 new_size = self.allocate_indices(indices)
266 if new_size <= self.best_size:
267 # The new allocation produced a new best result remember it
268 self.best_size = new_size
269 best_indices = indices[:]
270 self.allocated_addresses = [lr.address for lr in self.lrs]
271 if self.best_size <= self.min_required_size:
272 # Target reached stop
273 return
274 else:
275 # The new allocation produced worse result undo the change
276 indices = best_indices[:]
277
278 def allocate(self) -> List[int]:
279 """
280 Runs the allocation algorithm. Finishes when an optimal solution has been
281 found or when maximum iterations have been run.
282 The allocated addresses are placed in the output vector, in the same
283 order as the input vector.
284
285 Implementation note: the algorithm produces reproduceable results by using
286 a well-defined random number generator with well-defined default seed,
287 and using a fixed number of iterations.
288 """
289 random.seed(1)
290 # Sort indices on priority. Note: self.lrs must be left unchanged
291 indices = [lr.id for lr in sorted(self.lrs)]
292 # Allocate the initial solution
293 self.best_size = self.allocate_indices(indices)
294 self.allocated_addresses = [lr.address for lr in self.lrs]
295 if self.best_size > self.min_required_size:
296 # Try to improve the heuristic allocation
297 self.search(indices, HillClimbAllocator.MAX_ITERATIONS)
298 # else the heuristic allocation returned an optimal solution; no search needed
299 return self.allocated_addresses
300
301
302def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]:
303 """
304 Allocates live ranges using a search based allocator.
305 Returns the list of allocated addresses (one for each live range)
306 """
307 if not lrs:
308 return []
309 allocator = HillClimbAllocator(lrs)
310 return allocator.allocate()