Blame - ethosu/vela/tensor_allocation.py - ml/ethos-u/ethos-u-vela

blob: b2ea7de6e9c287035221710efe7df51db8b9ecce [file] [log] [blame]

Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	1	# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	16	# Description:
				17	# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
				18	# worked out from the allowable overlaps that are calculated by the live range analysis.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	19	import math
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	20
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	21	import numpy as np
				22
Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	23	from . import hillclimb_allocation
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	24	from . import live_range
				25	from . import numeric_util
Jacob Bohlin	0628a8c	2020-08-28 13:25:14 +0200	[diff] [blame]	26	from .errors import AllocationError
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	27	from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
Louis Verhaard	9bfe0f8	2020-12-03 12:26:25 +0100	[diff] [blame]	28	from .live_range import LiveRangeGraph
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	29	from .nn_graph import TensorAllocator
				30	from .tensor import MemArea
Patrik Gustavsson	eca2e95	2020-05-27 09:15:11 +0200	[diff] [blame]	31	from .tensor import MemType
Jacob Bohlin	0628a8c	2020-08-28 13:25:14 +0200	[diff] [blame]	32	from .tensor import Tensor
Louis Verhaard	0b8268a	2020-08-05 16:11:29 +0200	[diff] [blame]	33	from .tensor import TensorPurpose
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	34
				35
Jacob Bohlin	0628a8c	2020-08-28 13:25:14 +0200	[diff] [blame]	36	def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
Louis Verhaard	3c07c97	2020-05-07 08:12:58 +0200	[diff] [blame]	37	# Allocates using increasing addresses. Duplicate constant tensors will be allocated to the same address
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	38	total_sz = 0
				39	allocated_tensors = []
				40
Louis Verhaard	3c07c97	2020-05-07 08:12:58 +0200	[diff] [blame]	41	# just assign increasing addresses, except for duplicates
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	42	for tens, lr in live_ranges.ranges.items():
				43	if tens in allocated_tensors:
				44	continue
				45
Louis Verhaard	3c07c97	2020-05-07 08:12:58 +0200	[diff] [blame]	46	address = total_sz
				47	if tens.weight_compression_config is not None:
				48	for allocated_tens in allocated_tensors:
				49	if allocated_tens.weight_compression_config == tens.weight_compression_config:
				50	address = allocated_tens.address
				51	break
Louis Verhaard	0b8268a	2020-08-05 16:11:29 +0200	[diff] [blame]	52	if tens.purpose == TensorPurpose.LUT:
				53	for allocated_tens in allocated_tensors:
				54	if allocated_tens.equivalent(tens):
				55	address = allocated_tens.address
				56	break
Louis Verhaard	3c07c97	2020-05-07 08:12:58 +0200	[diff] [blame]	57	lr.set_address(address)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	58	allocated_tensors += lr.tensors
Louis Verhaard	3c07c97	2020-05-07 08:12:58 +0200	[diff] [blame]	59	if address == total_sz:
				60	total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	61
Jacob Bohlin	0628a8c	2020-08-28 13:25:14 +0200	[diff] [blame]	62	verify_alignment(live_ranges, alloc_granularity)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	63	return total_sz
				64
				65
Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	66	def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
				67	# Allocates using the hill climb allocator
				68	lr_set = {(lr.start_time, lr.end_time, lr) for lr in live_ranges.ranges.values()}
				69	lr_list = [lr for _, _, lr in lr_set]
Fredrik Svedberg	ab11bd1	2021-02-15 15:36:05 +0100	[diff] [blame]	70	lr_list.sort()
				71
Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	72	addresses = hillclimb_allocation.allocate_live_ranges(lr_list)
Louis Verhaard	9bfe0f8	2020-12-03 12:26:25 +0100	[diff] [blame]	73	# The result is a list containing the allocated addresses
				74	total_sz = 0
Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	75	for lr, address in zip(lr_list, addresses):
Louis Verhaard	9bfe0f8	2020-12-03 12:26:25 +0100	[diff] [blame]	76	total_sz = max(total_sz, address + lr.size)
				77	lr.set_address(address)
				78	verify_allocation(live_ranges, alloc_granularity)
				79	return total_sz
				80
				81
				82	def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
Jacob Bohlin	0628a8c	2020-08-28 13:25:14 +0200	[diff] [blame]	83	for lr in live_ranges.ranges.values():
				84	for tens in lr.tensors:
				85	if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
				86	# This is a CPU tensor, verify alignment
				87	if tens.address % alignment != 0:
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	88	raise AllocationError(f"Tensor '{tens.name}' not aligned to {alignment} bytes")
Jacob Bohlin	0628a8c	2020-08-28 13:25:14 +0200	[diff] [blame]	89
				90
Louis Verhaard	9bfe0f8	2020-12-03 12:26:25 +0100	[diff] [blame]	91	def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
				92	lrs = list(live_ranges.ranges.values())
				93	for n in lrs:
				94	verify_alignment(live_ranges, alignment)
				95
				96	for m in lrs:
				97	if n != m and n.overlaps_ranges(m):
				98	overlap, tens_n, tens_m = n.overlaps_address(m)
				99	if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
				100	raise AllocationError(
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	101	f"Overlapping buffers: {n.name}: {tens_n.address} -> {tens_n.address + n.size}"
				102	f" and {m.name}: {tens_m.address} -> {tens_m.address + m.size}"
Louis Verhaard	9bfe0f8	2020-12-03 12:26:25 +0100	[diff] [blame]	103	)
				104
				105
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	106	def mark_sram_used_for_cascaded_passes(sg, lrs):
				107	end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
				108	mem_usage = np.zeros(end_pos, dtype=np.int64)
				109
				110	for tens, rng in lrs.ranges.items():
				111	storage_size = tens.storage_size()
				112	mem_usage[rng.start_time : rng.end_time] += storage_size
				113
				114	for cps in sg.cascaded_passes:
				115	sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
				116	cps.sram_used = sram_used
				117	for ps in cps.passes:
				118	ps.sram_used = sram_used
				119
				120
Tim Hall	b9b515c	2020-11-01 21:27:19 +0000	[diff] [blame]	121	def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation):
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	122	if verbose_allocation:
Patrik Gustavsson	eca2e95	2020-05-27 09:15:11 +0200	[diff] [blame]	123	if mem_type_set == set((MemType.Permanent_NPU,)) or mem_type_set == set((MemType.Permanent_CPU,)):
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	124	print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
Patrik Gustavsson	eca2e95	2020-05-27 09:15:11 +0200	[diff] [blame]	125	else:
				126	print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
Louis Verhaard	1356c2a	2020-09-16 10:25:28 +0200	[diff] [blame]	127	mem_usage = 0
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	128	for start_time, start, end, name, end_time in sorted(
				129	(
				130	lr.start_time,
				131	tens.address,
				132	tens.address + int(math.ceil(tens.storage_size())),
				133	tens.name + " " + str(tens.purpose),
				134	lr.end_time,
				135	)
				136	for tens, lr in lrs.ranges.items()
				137	):
				138	name = name.replace("\x00", "")
				139	print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
Louis Verhaard	1356c2a	2020-09-16 10:25:28 +0200	[diff] [blame]	140	mem_usage = max(mem_usage, end)
				141	print("Memory usage: {} ({:#x}) bytes / {:.1f} KB".format(mem_usage, mem_usage, mem_usage / 1024))
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	142	print()
				143
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	144
erik.andersson@arm.com	3438c92	2021-03-24 10:32:09 +0100	[diff] [blame^]	145	def calculate_allocation_efficiency(lrs):
				146	lr_set = set(lrs.ranges.values())
				147
				148	size_at_time = [0] * (1 + max(lr.end_time for lr in lr_set))
				149	for lr in lr_set:
				150	for t in range(lr.start_time, lr.end_time + 1):
				151	size_at_time[t] += lr.size
				152
				153	return max(size_at_time)
				154
				155
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	156	def allocate_tensors(
				157	nng,
				158	sg,
				159	arch,
				160	mem_area,
Patrik Gustavsson	eca2e95	2020-05-27 09:15:11 +0200	[diff] [blame]	161	mem_type_set,
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	162	tensor_allocator=TensorAllocator.Greedy,
				163	verbose_allocation=False,
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	164	lr_graph=None,
Tim Hall	b9b515c	2020-11-01 21:27:19 +0000	[diff] [blame]	165	cpu_tensor_alignment=Tensor.AllocationQuantum,
Louis Verhaard	0b9c9a3	2020-09-15 14:05:38 +0200	[diff] [blame]	166	max_size=None,
				167	dry_test=False,
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	168	):
Louis Verhaard	0b9c9a3	2020-09-15 14:05:38 +0200	[diff] [blame]	169	# Allocates addresses to tensors, returns False if tensors could not be fit within max_size
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	170	ignore_subgraph_input_output_tensors = False
				171	lrs = live_range.extract_live_ranges_from_cascaded_passes(
				172	sg,
				173	mem_area,
Patrik Gustavsson	eca2e95	2020-05-27 09:15:11 +0200	[diff] [blame]	174	mem_type_set,
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	175	ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
				176	lr_graph=lr_graph,
Tim Hall	b9b515c	2020-11-01 21:27:19 +0000	[diff] [blame]	177	cpu_tensor_alignment=cpu_tensor_alignment,
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	178	)
				179
				180	if lrs.ranges:
				181	tens_alloc = tensor_allocator
				182	if tens_alloc == TensorAllocator.Greedy:
Tim Hall	b9b515c	2020-11-01 21:27:19 +0000	[diff] [blame]	183	total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment, verbose_allocation)
Louis Verhaard	9bfe0f8	2020-12-03 12:26:25 +0100	[diff] [blame]	184	verify_allocation(lrs, cpu_tensor_alignment)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	185	elif tens_alloc == TensorAllocator.LinearAlloc:
Tim Hall	b9b515c	2020-11-01 21:27:19 +0000	[diff] [blame]	186	total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
Louis Verhaard	d700252	2021-01-20 17:23:54 +0100	[diff] [blame]	187	elif tens_alloc == TensorAllocator.HillClimb:
				188	total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	189	else:
				190	assert 0
Louis Verhaard	0b9c9a3	2020-09-15 14:05:38 +0200	[diff] [blame]	191	alloc_ok = max_size is None or total_sz <= max_size
				192	if dry_test or not alloc_ok:
				193	# Dry test or allocation failed; undo allocation
				194	for lr in lrs.ranges.values():
				195	lr.set_address(None)
				196	return alloc_ok
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	197
Patrik Gustavsson	eca2e95	2020-05-27 09:15:11 +0200	[diff] [blame]	198	if sg.memory_used.get(mem_area, 0) == 0:
				199	sg.memory_used[mem_area] = total_sz
				200	else:
				201	sg.memory_used[mem_area] += total_sz
				202
				203	# Keep track of how much should be used for scratch or permanent storage for NPU
				204	for mem_type in mem_type_set:
				205	if sg.memory_used_per_type.get(mem_type, 0) == 0:
				206	sg.memory_used_per_type[mem_type] = total_sz
				207	else:
				208	sg.memory_used_per_type[mem_type] += total_sz
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	209
Tim Hall	b9b515c	2020-11-01 21:27:19 +0000	[diff] [blame]	210	print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	211
				212	if mem_area == MemArea.Sram:
erik.andersson@arm.com	3438c92	2021-03-24 10:32:09 +0100	[diff] [blame^]	213	sg.min_mem_usage = calculate_allocation_efficiency(lrs)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	214	# Mark Sram usage for all subgraphs
				215	for sg_ in nng.subgraphs:
				216	mark_sram_used_for_cascaded_passes(sg_, lrs)
				217
				218	if sg == nng.get_root_subgraph():
				219	nng.memory_used = sg.memory_used
Diqing Zhong	66d7ec0	2021-02-01 19:07:04 +0100	[diff] [blame]	220	try:
				221	nng.weights_compression_ratio = nng.total_compressed_weights / nng.total_original_weights
				222	except ZeroDivisionError:
				223	nng.weights_compression_ratio = 0.0
Diqing Zhong	db5124c	2021-01-11 12:52:48 +0100	[diff] [blame]	224
Louis Verhaard	0b9c9a3	2020-09-15 14:05:38 +0200	[diff] [blame]	225	return True