Blame - ethosu/vela/hillclimb_allocation.py - ml/ethos-u/ethos-u-vela

blob: 5e02dac040a4bd49bfb0abf5430ec72b8c94d177 [file] [log] [blame]

Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	1	# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16	#
				17	# Description:
				18	# Tensor allocator based on a hill-climb search
				19	import random
				20	from typing import List
				21	from typing import Set
				22
Henrik G Olsson	807278a	2021-03-08 18:20:00 +0100	[diff] [blame]	23	from . import numeric_util
Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	24	from .live_range import LiveRange
				25
				26
				27	class LiveRangeInfo:
Henrik G Olsson	807278a	2021-03-08 18:20:00 +0100	[diff] [blame]	28	def __init__(self, id: int, start_time: int, end_time: int, size: int, min_alignment: int):
Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	29	# Index of this live range
				30	self.id = id
				31	# Start time (input to the allocator algorithm)
				32	self.start_time = start_time
				33	# End time, inclusive (input to the allocator algorithm)
				34	self.end_time = end_time
				35	# Size in bytes (input to the allocator algorithm)
				36	self.size = size
				37	# Allocated address (the main output from the allocator algorithm)
				38	self.address: int = 0
				39	# End address, exclusive
				40	self.end_address: int = 0
				41	# id of predecessor live range (predecessor's end address == this lr's address)
				42	self.predecessor: int = 0
				43	# Turn at which the live range was allocated
				44	self.turn: int = 0
				45	# Max value of size_at_time (only used in the heuristic allocation)
				46	self.urgency = 0
				47	self.neighbours: List["LiveRangeInfo"] = []
Henrik G Olsson	807278a	2021-03-08 18:20:00 +0100	[diff] [blame]	48	self.min_alignment = min_alignment
Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	49
				50	def overlaps(self, addr2: int, size2: int) -> int:
				51	return self.address < addr2 + size2 and addr2 < self.end_address
				52
				53	def is_neighbour(self, lr: "LiveRangeInfo") -> bool:
				54	return self.start_time <= lr.end_time and lr.start_time <= self.end_time
				55
				56	def __str__(self):
				57	return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(
				58	self.id, self.start_time, self.end_time, self.size, self.address
				59	)
				60
				61	def __lt__(self, other) -> bool:
				62	if self.urgency != other.urgency:
				63	return self.urgency > other.urgency
				64	duration1 = self.end_time - self.start_time
				65	duration2 = other.end_time - other.start_time
				66	if duration1 != duration2:
				67	return duration1 > duration2
				68
				69	if self.start_time != other.start_time:
				70	return self.start_time < other.start_time
				71
				72	if self.size != other.size:
				73	return self.size > other.size
				74
				75	return self.id < other.id
				76
				77
				78	class HillClimbAllocator:
				79	"""
				80	Implements tensor allocator using a hill climb search.
				81
				82	The basic algorithm is:
				83
				84	Use a heuristic allocator to find an initial allocation
				85	while allocation is not optimal and iterations < MAX_ITERATIONS:
				86	find the "bottleneck": the live range with highest end address
				87	find all live ranges that affected the allocation of the bottleneck
				88	swap the order of any two affecting live ranges
				89	reallocate tensors using the reordered live ranges
				90	if the new allocation is better: keep it, else set allocation to previous allocation
				91	"""
				92
				93	MAX_ITERATIONS = 500
				94	NOT_ALLOCATED = -1
				95	# Used for live ranges allocated at address 0
				96	NO_PREDECESSOR = -1
				97
				98	def __init__(self, live_ranges: List[LiveRange]):
				99	# Contains the live ranges
				100	self.lrs: List[LiveRangeInfo] = [
Henrik G Olsson	807278a	2021-03-08 18:20:00 +0100	[diff] [blame]	101	LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment())
				102	for id, lr in enumerate(live_ranges)
Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	103	]
				104	self.lrs_at_time = []
				105	# The available size (input to algorithm).
				106	self.available_size: int = 0
				107	# The algorithm stops once the target size has been achieved
				108	self.target_size: int = 0
				109	# The highest end address of the best found allocation
				110	self.best_size: int = 1 << 63
				111	# For each live range: max value of size_at_time (only used in the heuristic allocation)
				112	self.lr_urgency = len(self.lrs) * [0]
				113	nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)
				114	# Contains active live ranges at each timestamp
				115	self.lrs_at_time = [[] for i in range(nr_time_slots)]
				116	for lr in self.lrs:
				117	for t in range(lr.start_time, lr.end_time + 1):
				118	self.lrs_at_time[t].append(lr)
				119	# At each timestamp: accumulated size of active live ranges
				120	size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
				121	# The minimum possible size, assuming all live ranges can be perfectly allocated
				122	self.min_required_size: int = max(size_at_time)
				123	# Calculate all neighbours + the urgency of each live range
				124	for lr in self.lrs:
				125	lr.urgency = 0
				126	lr.neighbours = []
				127	neighbours = set()
				128	for t in range(lr.start_time, lr.end_time + 1):
				129	lr.urgency = max(size_at_time[t], lr.urgency)
				130	for lr2 in self.lrs_at_time[t]:
				131	if lr2 not in neighbours and lr != lr2:
				132	neighbours.add(lr2)
				133	lr.neighbours.append(lr2)
				134
				135	def allocate_lr(self, lr: LiveRangeInfo):
				136	"""
				137	Allocates the given live range at the smallest possible address
				138	"""
				139	address = 0
				140	predecessor = HillClimbAllocator.NO_PREDECESSOR
				141	fits = False
				142	while not fits:
				143	fits = True
				144	# Find neighbours that overlap with address
				145	for lr2 in lr.neighbours:
				146	if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:
				147	continue
				148	if lr2.overlaps(address, lr.size):
				149	# Overlap found increase address
				150	fits = False
Henrik G Olsson	807278a	2021-03-08 18:20:00 +0100	[diff] [blame]	151	address = numeric_util.round_up(lr2.end_address, lr.min_alignment)
Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	152	predecessor = lr2.id
				153	lr.address = address
				154	lr.end_address = address + lr.size
				155	lr.predecessor = predecessor
				156
				157	def allocate_indices(self, indices: List[int]):
				158	"""
				159	Allocates the live ranges in the order indicated by the indices;
				160	allocates each live range at the lowest possible address.
				161	"""
				162	for lr in self.lrs:
				163	lr.address = HillClimbAllocator.NOT_ALLOCATED
				164	size = 0
				165	for turn, index in enumerate(indices):
				166	lr = self.lrs[index]
				167	self.allocate_lr(lr)
				168	lr.turn = turn
				169	size = max(size, lr.end_address)
				170	if size > self.best_size:
				171	# This allocation is worse than the best known allocation;
				172	# no need to continue
				173	break
				174	return size
				175
				176	def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):
				177	"""
				178	Adds the given live range + predecessors to the turns vector.
				179	Note: the turn_set is for quick detection of duplicates,
				180	the turn_list is to get reproduceable results
				181	"""
				182	if lr.turn not in turn_set:
				183	turn_set.add(lr.turn)
				184	turn_list.append(lr.turn)
				185	id = lr.id
				186	while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:
				187	id = self.lrs[id].predecessor
				188	turn = self.lrs[id].turn
				189	if turn not in turn_set:
				190	turn_set.add(turn)
				191	turn_list.append(turn)
				192
				193	def attempt_bottleneck_fix(self, indices: List[int]):
				194	"""
				195	Finds the "bottleneck", the live range with highest end address, and reorders the indices
				196	such that a next allocation might lower the memory usage.
				197
				198	---------
				199	\| \|
				200	\| D \|
				201	\| \|
				202	----------------------------------
				203	\| B \|
				204	-------------------------------
				205	\| \|
				206	\|A\| ---
				207	\| \| \|C\|
				208	\| \| \| \|
				209	---------------------------------------
				210
				211	In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
				212	The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
				213	and all direct and indirect predecessors of D and its neighbours
				214	(i.e. A, which is the predecessor of B, and indirect predecessor of D).
				215
				216	By permuting the order in which the affecting live ranges are allocated, the bottleneck might
				217	be lowered. In the above example, almost any permutation would lower the bottleneck.
				218	"""
				219	# Find the bottleneck
				220	max_lr = self.lrs[0]
				221	for lr in self.lrs[1:]:
				222	if lr.end_address > max_lr.end_address:
				223	max_lr = lr
				224
				225	# Find all live ranges that affected the placement of the bottleneck live range.
				226	# This consists of two types of live ranges:
				227	# - direct neighbours of the bottleneck live range
				228	# - direct and indirect predecessors of these neighbours + bottleneck
				229	# The turns at which these live ranges were allocated are put in the turns set.
				230	turn_set = set()
				231	turn_list = list()
				232	self.add_predecessor_turns(turn_set, turn_list, max_lr)
				233	for lr2 in max_lr.neighbours:
				234	self.add_predecessor_turns(turn_set, turn_list, lr2)
				235
				236	# Non-direct neighbours that interfere with the allocation of the bottleneck are the
				237	# immediate cause for gaps in the allocation, and are selected with higher probability.
				238	non_nb_turn_list = []
				239	for turn in turn_list:
				240	lr = self.lrs[indices[turn]]
				241	if not max_lr.is_neighbour(lr):
				242	non_nb_turn_list.append(turn)
				243	assert turn_list
				244	# Pick from non-neighbour list with 30% probability
				245	# (magic number based on tuning)
				246	if random.randint(0, 100) < 30 and non_nb_turn_list:
				247	# Pick a live range from the "non-neighbour list"
				248	ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]
				249	else:
				250	# Pick any affecting live range.
				251	ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
				252
				253	ix2 = turn_list[random.randint(0, len(turn_list) - 2)]
				254	if ix1 == ix2:
				255	ix2 = turn_list[-1]
				256	# Swap indices
				257	indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
				258
				259	def search(self, indices: List[int], iterations: int):
				260	"""
				261	Search for a solution, using the given indices as initial solution.
				262	"""
				263	best_indices = indices[:]
				264	for _ in range(iterations):
				265	# Reorder the indices
				266	self.attempt_bottleneck_fix(indices)
				267	# Allocate the reordered indices and check if it gave an improvement
				268	new_size = self.allocate_indices(indices)
				269	if new_size <= self.best_size:
				270	# The new allocation produced a new best result remember it
				271	self.best_size = new_size
				272	best_indices = indices[:]
				273	self.allocated_addresses = [lr.address for lr in self.lrs]
				274	if self.best_size <= self.min_required_size:
				275	# Target reached stop
				276	return
				277	else:
				278	# The new allocation produced worse result undo the change
				279	indices = best_indices[:]
				280
				281	def allocate(self) -> List[int]:
				282	"""
				283	Runs the allocation algorithm. Finishes when an optimal solution has been
				284	found or when maximum iterations have been run.
				285	The allocated addresses are placed in the output vector, in the same
				286	order as the input vector.
				287
				288	Implementation note: the algorithm produces reproduceable results by using
				289	a well-defined random number generator with well-defined default seed,
				290	and using a fixed number of iterations.
				291	"""
				292	random.seed(1)
				293	# Sort indices on priority. Note: self.lrs must be left unchanged
				294	indices = [lr.id for lr in sorted(self.lrs)]
				295	# Allocate the initial solution
				296	self.best_size = self.allocate_indices(indices)
				297	self.allocated_addresses = [lr.address for lr in self.lrs]
				298	if self.best_size > self.min_required_size:
				299	# Try to improve the heuristic allocation
				300	self.search(indices, HillClimbAllocator.MAX_ITERATIONS)
				301	# else the heuristic allocation returned an optimal solution; no search needed
				302	return self.allocated_addresses
				303
				304
				305	def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]:
				306	"""
				307	Allocates live ranges using a search based allocator.
				308	Returns the list of allocated addresses (one for each live range)
				309	"""
				310	if not lrs:
				311	return []
				312	allocator = HillClimbAllocator(lrs)
				313	return allocator.allocate()