blob: 5e02dac040a4bd49bfb0abf5430ec72b8c94d177 [file] [log] [blame]
Louis Verhaardd7002522021-01-20 17:23:54 +01001# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description:
18# Tensor allocator based on a hill-climb search
19import random
20from typing import List
21from typing import Set
22
Henrik G Olsson807278a2021-03-08 18:20:00 +010023from . import numeric_util
Louis Verhaardd7002522021-01-20 17:23:54 +010024from .live_range import LiveRange
25
26
27class LiveRangeInfo:
Henrik G Olsson807278a2021-03-08 18:20:00 +010028 def __init__(self, id: int, start_time: int, end_time: int, size: int, min_alignment: int):
Louis Verhaardd7002522021-01-20 17:23:54 +010029 # Index of this live range
30 self.id = id
31 # Start time (input to the allocator algorithm)
32 self.start_time = start_time
33 # End time, inclusive (input to the allocator algorithm)
34 self.end_time = end_time
35 # Size in bytes (input to the allocator algorithm)
36 self.size = size
37 # Allocated address (the main output from the allocator algorithm)
38 self.address: int = 0
39 # End address, exclusive
40 self.end_address: int = 0
41 # id of predecessor live range (predecessor's end address == this lr's address)
42 self.predecessor: int = 0
43 # Turn at which the live range was allocated
44 self.turn: int = 0
45 # Max value of size_at_time (only used in the heuristic allocation)
46 self.urgency = 0
47 self.neighbours: List["LiveRangeInfo"] = []
Henrik G Olsson807278a2021-03-08 18:20:00 +010048 self.min_alignment = min_alignment
Louis Verhaardd7002522021-01-20 17:23:54 +010049
50 def overlaps(self, addr2: int, size2: int) -> int:
51 return self.address < addr2 + size2 and addr2 < self.end_address
52
53 def is_neighbour(self, lr: "LiveRangeInfo") -> bool:
54 return self.start_time <= lr.end_time and lr.start_time <= self.end_time
55
56 def __str__(self):
57 return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(
58 self.id, self.start_time, self.end_time, self.size, self.address
59 )
60
61 def __lt__(self, other) -> bool:
62 if self.urgency != other.urgency:
63 return self.urgency > other.urgency
64 duration1 = self.end_time - self.start_time
65 duration2 = other.end_time - other.start_time
66 if duration1 != duration2:
67 return duration1 > duration2
68
69 if self.start_time != other.start_time:
70 return self.start_time < other.start_time
71
72 if self.size != other.size:
73 return self.size > other.size
74
75 return self.id < other.id
76
77
78class HillClimbAllocator:
79 """
80 Implements tensor allocator using a hill climb search.
81
82 The basic algorithm is:
83
84 Use a heuristic allocator to find an initial allocation
85 while allocation is not optimal and iterations < MAX_ITERATIONS:
86 find the "bottleneck": the live range with highest end address
87 find all live ranges that affected the allocation of the bottleneck
88 swap the order of any two affecting live ranges
89 reallocate tensors using the reordered live ranges
90 if the new allocation is better: keep it, else set allocation to previous allocation
91 """
92
93 MAX_ITERATIONS = 500
94 NOT_ALLOCATED = -1
95 # Used for live ranges allocated at address 0
96 NO_PREDECESSOR = -1
97
98 def __init__(self, live_ranges: List[LiveRange]):
99 # Contains the live ranges
100 self.lrs: List[LiveRangeInfo] = [
Henrik G Olsson807278a2021-03-08 18:20:00 +0100101 LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment())
102 for id, lr in enumerate(live_ranges)
Louis Verhaardd7002522021-01-20 17:23:54 +0100103 ]
104 self.lrs_at_time = []
105 # The available size (input to algorithm).
106 self.available_size: int = 0
107 # The algorithm stops once the target size has been achieved
108 self.target_size: int = 0
109 # The highest end address of the best found allocation
110 self.best_size: int = 1 << 63
111 # For each live range: max value of size_at_time (only used in the heuristic allocation)
112 self.lr_urgency = len(self.lrs) * [0]
113 nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)
114 # Contains active live ranges at each timestamp
115 self.lrs_at_time = [[] for i in range(nr_time_slots)]
116 for lr in self.lrs:
117 for t in range(lr.start_time, lr.end_time + 1):
118 self.lrs_at_time[t].append(lr)
119 # At each timestamp: accumulated size of active live ranges
120 size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
121 # The minimum possible size, assuming all live ranges can be perfectly allocated
122 self.min_required_size: int = max(size_at_time)
123 # Calculate all neighbours + the urgency of each live range
124 for lr in self.lrs:
125 lr.urgency = 0
126 lr.neighbours = []
127 neighbours = set()
128 for t in range(lr.start_time, lr.end_time + 1):
129 lr.urgency = max(size_at_time[t], lr.urgency)
130 for lr2 in self.lrs_at_time[t]:
131 if lr2 not in neighbours and lr != lr2:
132 neighbours.add(lr2)
133 lr.neighbours.append(lr2)
134
135 def allocate_lr(self, lr: LiveRangeInfo):
136 """
137 Allocates the given live range at the smallest possible address
138 """
139 address = 0
140 predecessor = HillClimbAllocator.NO_PREDECESSOR
141 fits = False
142 while not fits:
143 fits = True
144 # Find neighbours that overlap with address
145 for lr2 in lr.neighbours:
146 if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:
147 continue
148 if lr2.overlaps(address, lr.size):
149 # Overlap found increase address
150 fits = False
Henrik G Olsson807278a2021-03-08 18:20:00 +0100151 address = numeric_util.round_up(lr2.end_address, lr.min_alignment)
Louis Verhaardd7002522021-01-20 17:23:54 +0100152 predecessor = lr2.id
153 lr.address = address
154 lr.end_address = address + lr.size
155 lr.predecessor = predecessor
156
157 def allocate_indices(self, indices: List[int]):
158 """
159 Allocates the live ranges in the order indicated by the indices;
160 allocates each live range at the lowest possible address.
161 """
162 for lr in self.lrs:
163 lr.address = HillClimbAllocator.NOT_ALLOCATED
164 size = 0
165 for turn, index in enumerate(indices):
166 lr = self.lrs[index]
167 self.allocate_lr(lr)
168 lr.turn = turn
169 size = max(size, lr.end_address)
170 if size > self.best_size:
171 # This allocation is worse than the best known allocation;
172 # no need to continue
173 break
174 return size
175
176 def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):
177 """
178 Adds the given live range + predecessors to the turns vector.
179 Note: the turn_set is for quick detection of duplicates,
180 the turn_list is to get reproduceable results
181 """
182 if lr.turn not in turn_set:
183 turn_set.add(lr.turn)
184 turn_list.append(lr.turn)
185 id = lr.id
186 while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:
187 id = self.lrs[id].predecessor
188 turn = self.lrs[id].turn
189 if turn not in turn_set:
190 turn_set.add(turn)
191 turn_list.append(turn)
192
193 def attempt_bottleneck_fix(self, indices: List[int]):
194 """
195 Finds the "bottleneck", the live range with highest end address, and reorders the indices
196 such that a next allocation might lower the memory usage.
197
198 ---------
199 | |
200 | D |
201 | |
202 ----------------------------------
203 | B |
204 -------------------------------
205 | |
206 |A| ---
207 | | |C|
208 | | | |
209 ---------------------------------------
210
211 In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
212 The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
213 and all direct and indirect predecessors of D and its neighbours
214 (i.e. A, which is the predecessor of B, and indirect predecessor of D).
215
216 By permuting the order in which the affecting live ranges are allocated, the bottleneck might
217 be lowered. In the above example, almost any permutation would lower the bottleneck.
218 """
219 # Find the bottleneck
220 max_lr = self.lrs[0]
221 for lr in self.lrs[1:]:
222 if lr.end_address > max_lr.end_address:
223 max_lr = lr
224
225 # Find all live ranges that affected the placement of the bottleneck live range.
226 # This consists of two types of live ranges:
227 # - direct neighbours of the bottleneck live range
228 # - direct and indirect predecessors of these neighbours + bottleneck
229 # The turns at which these live ranges were allocated are put in the turns set.
230 turn_set = set()
231 turn_list = list()
232 self.add_predecessor_turns(turn_set, turn_list, max_lr)
233 for lr2 in max_lr.neighbours:
234 self.add_predecessor_turns(turn_set, turn_list, lr2)
235
236 # Non-direct neighbours that interfere with the allocation of the bottleneck are the
237 # immediate cause for gaps in the allocation, and are selected with higher probability.
238 non_nb_turn_list = []
239 for turn in turn_list:
240 lr = self.lrs[indices[turn]]
241 if not max_lr.is_neighbour(lr):
242 non_nb_turn_list.append(turn)
243 assert turn_list
244 # Pick from non-neighbour list with 30% probability
245 # (magic number based on tuning)
246 if random.randint(0, 100) < 30 and non_nb_turn_list:
247 # Pick a live range from the "non-neighbour list"
248 ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]
249 else:
250 # Pick any affecting live range.
251 ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
252
253 ix2 = turn_list[random.randint(0, len(turn_list) - 2)]
254 if ix1 == ix2:
255 ix2 = turn_list[-1]
256 # Swap indices
257 indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
258
259 def search(self, indices: List[int], iterations: int):
260 """
261 Search for a solution, using the given indices as initial solution.
262 """
263 best_indices = indices[:]
264 for _ in range(iterations):
265 # Reorder the indices
266 self.attempt_bottleneck_fix(indices)
267 # Allocate the reordered indices and check if it gave an improvement
268 new_size = self.allocate_indices(indices)
269 if new_size <= self.best_size:
270 # The new allocation produced a new best result remember it
271 self.best_size = new_size
272 best_indices = indices[:]
273 self.allocated_addresses = [lr.address for lr in self.lrs]
274 if self.best_size <= self.min_required_size:
275 # Target reached stop
276 return
277 else:
278 # The new allocation produced worse result undo the change
279 indices = best_indices[:]
280
281 def allocate(self) -> List[int]:
282 """
283 Runs the allocation algorithm. Finishes when an optimal solution has been
284 found or when maximum iterations have been run.
285 The allocated addresses are placed in the output vector, in the same
286 order as the input vector.
287
288 Implementation note: the algorithm produces reproduceable results by using
289 a well-defined random number generator with well-defined default seed,
290 and using a fixed number of iterations.
291 """
292 random.seed(1)
293 # Sort indices on priority. Note: self.lrs must be left unchanged
294 indices = [lr.id for lr in sorted(self.lrs)]
295 # Allocate the initial solution
296 self.best_size = self.allocate_indices(indices)
297 self.allocated_addresses = [lr.address for lr in self.lrs]
298 if self.best_size > self.min_required_size:
299 # Try to improve the heuristic allocation
300 self.search(indices, HillClimbAllocator.MAX_ITERATIONS)
301 # else the heuristic allocation returned an optimal solution; no search needed
302 return self.allocated_addresses
303
304
305def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]:
306 """
307 Allocates live ranges using a search based allocator.
308 Returns the list of allocated addresses (one for each live range)
309 """
310 if not lrs:
311 return []
312 allocator = HillClimbAllocator(lrs)
313 return allocator.allocate()