Blame - ethosu/vela/hillclimb_allocation.py - ml/ethos-u/ethos-u-vela

blob: de53ab83913b2e3f2cbc274f9bf3fdb87ee66a4e [file] [log] [blame]

Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame^]	1	# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16	#
				17	# Description:
				18	# Tensor allocator based on a hill-climb search
				19	import random
				20	from typing import List
				21	from typing import Set
				22
				23	from .live_range import LiveRange
				24
				25
				26	class LiveRangeInfo:
				27	def __init__(self, id: int, start_time: int, end_time: int, size: int):
				28	# Index of this live range
				29	self.id = id
				30	# Start time (input to the allocator algorithm)
				31	self.start_time = start_time
				32	# End time, inclusive (input to the allocator algorithm)
				33	self.end_time = end_time
				34	# Size in bytes (input to the allocator algorithm)
				35	self.size = size
				36	# Allocated address (the main output from the allocator algorithm)
				37	self.address: int = 0
				38	# End address, exclusive
				39	self.end_address: int = 0
				40	# id of predecessor live range (predecessor's end address == this lr's address)
				41	self.predecessor: int = 0
				42	# Turn at which the live range was allocated
				43	self.turn: int = 0
				44	# Max value of size_at_time (only used in the heuristic allocation)
				45	self.urgency = 0
				46	self.neighbours: List["LiveRangeInfo"] = []
				47
				48	def overlaps(self, addr2: int, size2: int) -> int:
				49	return self.address < addr2 + size2 and addr2 < self.end_address
				50
				51	def is_neighbour(self, lr: "LiveRangeInfo") -> bool:
				52	return self.start_time <= lr.end_time and lr.start_time <= self.end_time
				53
				54	def __str__(self):
				55	return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(
				56	self.id, self.start_time, self.end_time, self.size, self.address
				57	)
				58
				59	def __lt__(self, other) -> bool:
				60	if self.urgency != other.urgency:
				61	return self.urgency > other.urgency
				62	duration1 = self.end_time - self.start_time
				63	duration2 = other.end_time - other.start_time
				64	if duration1 != duration2:
				65	return duration1 > duration2
				66
				67	if self.start_time != other.start_time:
				68	return self.start_time < other.start_time
				69
				70	if self.size != other.size:
				71	return self.size > other.size
				72
				73	return self.id < other.id
				74
				75
				76	class HillClimbAllocator:
				77	"""
				78	Implements tensor allocator using a hill climb search.
				79
				80	The basic algorithm is:
				81
				82	Use a heuristic allocator to find an initial allocation
				83	while allocation is not optimal and iterations < MAX_ITERATIONS:
				84	find the "bottleneck": the live range with highest end address
				85	find all live ranges that affected the allocation of the bottleneck
				86	swap the order of any two affecting live ranges
				87	reallocate tensors using the reordered live ranges
				88	if the new allocation is better: keep it, else set allocation to previous allocation
				89	"""
				90
				91	MAX_ITERATIONS = 500
				92	NOT_ALLOCATED = -1
				93	# Used for live ranges allocated at address 0
				94	NO_PREDECESSOR = -1
				95
				96	def __init__(self, live_ranges: List[LiveRange]):
				97	# Contains the live ranges
				98	self.lrs: List[LiveRangeInfo] = [
				99	LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size) for id, lr in enumerate(live_ranges)
				100	]
				101	self.lrs_at_time = []
				102	# The available size (input to algorithm).
				103	self.available_size: int = 0
				104	# The algorithm stops once the target size has been achieved
				105	self.target_size: int = 0
				106	# The highest end address of the best found allocation
				107	self.best_size: int = 1 << 63
				108	# For each live range: max value of size_at_time (only used in the heuristic allocation)
				109	self.lr_urgency = len(self.lrs) * [0]
				110	nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)
				111	# Contains active live ranges at each timestamp
				112	self.lrs_at_time = [[] for i in range(nr_time_slots)]
				113	for lr in self.lrs:
				114	for t in range(lr.start_time, lr.end_time + 1):
				115	self.lrs_at_time[t].append(lr)
				116	# At each timestamp: accumulated size of active live ranges
				117	size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
				118	# The minimum possible size, assuming all live ranges can be perfectly allocated
				119	self.min_required_size: int = max(size_at_time)
				120	# Calculate all neighbours + the urgency of each live range
				121	for lr in self.lrs:
				122	lr.urgency = 0
				123	lr.neighbours = []
				124	neighbours = set()
				125	for t in range(lr.start_time, lr.end_time + 1):
				126	lr.urgency = max(size_at_time[t], lr.urgency)
				127	for lr2 in self.lrs_at_time[t]:
				128	if lr2 not in neighbours and lr != lr2:
				129	neighbours.add(lr2)
				130	lr.neighbours.append(lr2)
				131
				132	def allocate_lr(self, lr: LiveRangeInfo):
				133	"""
				134	Allocates the given live range at the smallest possible address
				135	"""
				136	address = 0
				137	predecessor = HillClimbAllocator.NO_PREDECESSOR
				138	fits = False
				139	while not fits:
				140	fits = True
				141	# Find neighbours that overlap with address
				142	for lr2 in lr.neighbours:
				143	if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:
				144	continue
				145	if lr2.overlaps(address, lr.size):
				146	# Overlap found increase address
				147	fits = False
				148	address = lr2.end_address
				149	predecessor = lr2.id
				150	lr.address = address
				151	lr.end_address = address + lr.size
				152	lr.predecessor = predecessor
				153
				154	def allocate_indices(self, indices: List[int]):
				155	"""
				156	Allocates the live ranges in the order indicated by the indices;
				157	allocates each live range at the lowest possible address.
				158	"""
				159	for lr in self.lrs:
				160	lr.address = HillClimbAllocator.NOT_ALLOCATED
				161	size = 0
				162	for turn, index in enumerate(indices):
				163	lr = self.lrs[index]
				164	self.allocate_lr(lr)
				165	lr.turn = turn
				166	size = max(size, lr.end_address)
				167	if size > self.best_size:
				168	# This allocation is worse than the best known allocation;
				169	# no need to continue
				170	break
				171	return size
				172
				173	def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):
				174	"""
				175	Adds the given live range + predecessors to the turns vector.
				176	Note: the turn_set is for quick detection of duplicates,
				177	the turn_list is to get reproduceable results
				178	"""
				179	if lr.turn not in turn_set:
				180	turn_set.add(lr.turn)
				181	turn_list.append(lr.turn)
				182	id = lr.id
				183	while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:
				184	id = self.lrs[id].predecessor
				185	turn = self.lrs[id].turn
				186	if turn not in turn_set:
				187	turn_set.add(turn)
				188	turn_list.append(turn)
				189
				190	def attempt_bottleneck_fix(self, indices: List[int]):
				191	"""
				192	Finds the "bottleneck", the live range with highest end address, and reorders the indices
				193	such that a next allocation might lower the memory usage.
				194
				195	---------
				196	\| \|
				197	\| D \|
				198	\| \|
				199	----------------------------------
				200	\| B \|
				201	-------------------------------
				202	\| \|
				203	\|A\| ---
				204	\| \| \|C\|
				205	\| \| \| \|
				206	---------------------------------------
				207
				208	In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
				209	The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
				210	and all direct and indirect predecessors of D and its neighbours
				211	(i.e. A, which is the predecessor of B, and indirect predecessor of D).
				212
				213	By permuting the order in which the affecting live ranges are allocated, the bottleneck might
				214	be lowered. In the above example, almost any permutation would lower the bottleneck.
				215	"""
				216	# Find the bottleneck
				217	max_lr = self.lrs[0]
				218	for lr in self.lrs[1:]:
				219	if lr.end_address > max_lr.end_address:
				220	max_lr = lr
				221
				222	# Find all live ranges that affected the placement of the bottleneck live range.
				223	# This consists of two types of live ranges:
				224	# - direct neighbours of the bottleneck live range
				225	# - direct and indirect predecessors of these neighbours + bottleneck
				226	# The turns at which these live ranges were allocated are put in the turns set.
				227	turn_set = set()
				228	turn_list = list()
				229	self.add_predecessor_turns(turn_set, turn_list, max_lr)
				230	for lr2 in max_lr.neighbours:
				231	self.add_predecessor_turns(turn_set, turn_list, lr2)
				232
				233	# Non-direct neighbours that interfere with the allocation of the bottleneck are the
				234	# immediate cause for gaps in the allocation, and are selected with higher probability.
				235	non_nb_turn_list = []
				236	for turn in turn_list:
				237	lr = self.lrs[indices[turn]]
				238	if not max_lr.is_neighbour(lr):
				239	non_nb_turn_list.append(turn)
				240	assert turn_list
				241	# Pick from non-neighbour list with 30% probability
				242	# (magic number based on tuning)
				243	if random.randint(0, 100) < 30 and non_nb_turn_list:
				244	# Pick a live range from the "non-neighbour list"
				245	ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]
				246	else:
				247	# Pick any affecting live range.
				248	ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
				249
				250	ix2 = turn_list[random.randint(0, len(turn_list) - 2)]
				251	if ix1 == ix2:
				252	ix2 = turn_list[-1]
				253	# Swap indices
				254	indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
				255
				256	def search(self, indices: List[int], iterations: int):
				257	"""
				258	Search for a solution, using the given indices as initial solution.
				259	"""
				260	best_indices = indices[:]
				261	for _ in range(iterations):
				262	# Reorder the indices
				263	self.attempt_bottleneck_fix(indices)
				264	# Allocate the reordered indices and check if it gave an improvement
				265	new_size = self.allocate_indices(indices)
				266	if new_size <= self.best_size:
				267	# The new allocation produced a new best result remember it
				268	self.best_size = new_size
				269	best_indices = indices[:]
				270	self.allocated_addresses = [lr.address for lr in self.lrs]
				271	if self.best_size <= self.min_required_size:
				272	# Target reached stop
				273	return
				274	else:
				275	# The new allocation produced worse result undo the change
				276	indices = best_indices[:]
				277
				278	def allocate(self) -> List[int]:
				279	"""
				280	Runs the allocation algorithm. Finishes when an optimal solution has been
				281	found or when maximum iterations have been run.
				282	The allocated addresses are placed in the output vector, in the same
				283	order as the input vector.
				284
				285	Implementation note: the algorithm produces reproduceable results by using
				286	a well-defined random number generator with well-defined default seed,
				287	and using a fixed number of iterations.
				288	"""
				289	random.seed(1)
				290	# Sort indices on priority. Note: self.lrs must be left unchanged
				291	indices = [lr.id for lr in sorted(self.lrs)]
				292	# Allocate the initial solution
				293	self.best_size = self.allocate_indices(indices)
				294	self.allocated_addresses = [lr.address for lr in self.lrs]
				295	if self.best_size > self.min_required_size:
				296	# Try to improve the heuristic allocation
				297	self.search(indices, HillClimbAllocator.MAX_ITERATIONS)
				298	# else the heuristic allocation returned an optimal solution; no search needed
				299	return self.allocated_addresses
				300
				301
				302	def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]:
				303	"""
				304	Allocates live ranges using a search based allocator.
				305	Returns the list of allocated addresses (one for each live range)
				306	"""
				307	if not lrs:
				308	return []
				309	allocator = HillClimbAllocator(lrs)
				310	return allocator.allocate()