blob: 7a7aaa4173e6d628e17e865e80ece7f84b9809af [file] [log] [blame]
Raul Farkas1c54ac12023-04-26 07:49:15 +01001# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Rickard Bolinbc6ee582022-11-04 08:24:29 +000016#
Tim Hall79d07d22020-04-27 18:20:16 +010017# Description:
18# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
19# worked out from the allowable overlaps that are calculated by the live range analysis.
Tim Hall79d07d22020-04-27 18:20:16 +010020import math
Louis Verhaard226ecaf2021-03-30 10:18:28 +020021from typing import List
Tim Hall79d07d22020-04-27 18:20:16 +010022
Diego Russoea6111a2020-04-14 18:41:58 +010023import numpy as np
24
Louis Verhaardd7002522021-01-20 17:23:54 +010025from . import hillclimb_allocation
Diego Russoea6111a2020-04-14 18:41:58 +010026from . import live_range
27from . import numeric_util
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020028from .errors import AllocationError
Tim Hall79d07d22020-04-27 18:20:16 +010029from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
Louis Verhaard226ecaf2021-03-30 10:18:28 +020030from .live_range import LiveRange
Louis Verhaard9bfe0f82020-12-03 12:26:25 +010031from .live_range import LiveRangeGraph
Diego Russoe8a10452020-04-21 17:39:10 +010032from .nn_graph import TensorAllocator
33from .tensor import MemArea
Patrik Gustavssoneca2e952020-05-27 09:15:11 +020034from .tensor import MemType
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020035from .tensor import Tensor
Louis Verhaard0b8268a2020-08-05 16:11:29 +020036from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010037
38
def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
    """Allocate live ranges at strictly increasing addresses.

    Tensors that share a weight compression config, and equivalent LUT
    tensors, are deduplicated onto the address of the first occurrence,
    so they contribute to the total size only once.
    Returns the total allocated size in bytes.
    """
    next_free = 0
    seen_tensors = []

    for tens, lr in live_ranges.ranges.items():
        if tens in seen_tensors:
            continue

        addr = next_free
        if tens.weight_compression_config is not None:
            # Reuse the address of an identically-compressed weight tensor, if any
            dup = next(
                (a for a in seen_tensors if a.weight_compression_config == tens.weight_compression_config),
                None,
            )
            if dup is not None:
                assert dup.scale_compression_config == tens.scale_compression_config
                addr = dup.address
        if tens.purpose == TensorPurpose.LUT:
            # Equivalent LUTs also share a single allocation
            dup = next((a for a in seen_tensors if a.equivalent(tens)), None)
            if dup is not None:
                addr = dup.address

        lr.set_address(addr)
        seen_tensors += lr.tensors
        # Only a fresh (non-deduplicated) allocation consumes new space
        if addr == next_free:
            next_free += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)

    verify_alignment(live_ranges, alloc_granularity)
    return next_free
68
69
def hillclimb_allocate_live_ranges(
    live_ranges: LiveRangeGraph, alloc_granularity: int, max_iterations: int, mem_limit: int
) -> int:
    """Allocate addresses using the hill-climb allocator.

    The allocator returns one address per live range (in order); each is
    applied to its range, the allocation is verified, and the total
    allocated size (highest end address) is returned.
    """
    addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs, max_iterations, mem_limit)
    highest_end = 0
    for rng, addr in zip(live_ranges.lrs, addresses):
        rng.set_address(addr)
        highest_end = max(highest_end, addr + rng.size)
    verify_allocation(live_ranges, alloc_granularity)
    return highest_end
82
83
def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
    """Raise AllocationError if any CPU-accessed tensor is misaligned.

    A tensor counts as CPU-accessed when any of its producing or
    consuming ops is absent or not marked run_on_npu; only those
    tensors must honour the requested byte alignment.
    """
    for lr in live_ranges.lrs:
        for tens in lr.tensors:
            npu_only = all(op and op.run_on_npu for op in tens.ops + tens.consumer_list)
            if npu_only:
                continue
            # CPU tensor: its address must be a multiple of `alignment`
            if tens.address % alignment:
                raise AllocationError(f"Tensor '{tens.name}' not aligned to {alignment} bytes")
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020091
92
def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
    """Verify alignment and that no simultaneously-live buffers overlap.

    Builds a per-timestamp list of active live ranges; every range is
    checked against all ranges active at the moment it becomes live
    (checking later timestamps again would be redundant). Overlap is
    tolerated only for equivalent tensors placed at the same address.
    Raises AllocationError on any violation.
    """
    verify_alignment(live_ranges, alignment)
    nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs)
    # Active live ranges at each timestamp
    lrs_at_time: List[List[LiveRange]] = [[] for _ in range(nr_time_slots)]
    for lr in live_ranges.lrs:
        for t in range(lr.start_time, lr.end_time + 1):
            lrs_at_time[t].append(lr)

    prev_active: List[LiveRange] = []
    for active in lrs_at_time:
        for m in active:
            if m in prev_active:
                # Already checked at the timestamp where it became live
                continue
            for n in active:
                overlap, tens_n, tens_m = n.overlaps_address(m)
                if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
                    raise AllocationError(
                        f"Overlapping buffers: {n.name}: {tens_n.address} -> {tens_n.address + n.size}"
                        f" and {m.name}: {tens_m.address} -> {tens_m.address + m.size}"
                    )
        prev_active = active
111
112
Tim Hall79d07d22020-04-27 18:20:16 +0100113def mark_sram_used_for_cascaded_passes(sg, lrs):
Tim Halld8339a72021-05-27 18:49:40 +0100114 if len(sg.cascaded_passes) < 1:
115 return
Tim Hall79d07d22020-04-27 18:20:16 +0100116 end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
117 mem_usage = np.zeros(end_pos, dtype=np.int64)
118
119 for tens, rng in lrs.ranges.items():
120 storage_size = tens.storage_size()
121 mem_usage[rng.start_time : rng.end_time] += storage_size
122
123 for cps in sg.cascaded_passes:
124 sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
125 cps.sram_used = sram_used
126 for ps in cps.passes:
127 ps.sram_used = sram_used
128
129
def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_mem_usage_for_alloc):
    # Print a human-readable allocation report: a header identifying the memory
    # area/types and allocator, one row per allocated live range, and summary
    # statistics comparing the allocator's result against the theoretical
    # minimum (the peak of the memory-usage histogram).
    #
    # lrs: live range graph whose ranges already have addresses assigned
    # actual_mem_usage_for_alloc: total size the allocator actually used
    print("\n" + "#" * 80)
    # Permanent memory types belong to a single placement; otherwise the
    # allocation spans both Cpu and Npu subgraphs
    sg_placement = (
        sg.placement.name
        if mem_type_set.intersection(
            (
                MemType.Permanent_NPU,
                MemType.Permanent_CPU,
            )
        )
        else "Cpu and Npu"
    )
    print(
        f"Tensor Allocation for mem_area {mem_area.name}, of mem_type_set ("
        f'{", ".join(f"{mem_type.name}" for mem_type in mem_type_set)}'
        f"), using allocator {tensor_allocator}, in {sg_placement} subgraph:"
    )

    # Theoretical minimum: the largest amount of simultaneously-live tensor data
    # NOTE(review): assumes lrs.lrs is non-empty (callers only report when
    # lrs.ranges is non-empty) — max() would raise on an empty histogram
    memory_hist = memory_usage_histogram(lrs.lrs)
    min_mem_usage_for_alloc = max(memory_hist)
    print(
        f"{'Start Time':>10s} - {'End Time':>10s}: {'Start Addr':>10s} - {'End Addr':>10s}: {'Tensor Size':>11s}:"
        f" {'Memory Usage':>12s}: {'Purpose':12s}: Name"
    )
    # One row per tensor, sorted by (start time, end time, size, address, ...)
    for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
        (
            lr.start_time,
            lr.end_time,
            lr.size,
            tens.address,
            tens.address + lr.size,
            tens.purpose,
            tens.name,
        )
        for tens, lr in lrs.ranges.items()
    ):
        print(
            f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
            f" {max(memory_hist[start_time:end_time+1]):12d}: {purpose.display_name():12s}: {name:s}"
        )

    # Overhead: how much extra the allocator used beyond the theoretical minimum
    alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
    print(
        f"Allocation Peak Tensor Size: {min_mem_usage_for_alloc:9d} ({min_mem_usage_for_alloc:#10x})"
        f" Bytes {min_mem_usage_for_alloc/1024.0:8.2f} KiB"
    )
    print(
        f"Allocation Peak Memory Usage: {actual_mem_usage_for_alloc:9d} ({actual_mem_usage_for_alloc:#10x})"
        f" Bytes {actual_mem_usage_for_alloc/1024.0:8.2f} KiB"
    )
    print(
        f"Allocation Overhead: {actual_mem_usage_for_alloc-min_mem_usage_for_alloc:9d}"
        f" Bytes ({100*alloc_overhead_fraction:.2f} %)"
    )
Tim Hall79d07d22020-04-27 18:20:16 +0100184
Tim Hall79d07d22020-04-27 18:20:16 +0100185
def memory_usage_histogram(lrs: List[LiveRange]):
    """Return total live-range size per timestamp, covering 0..max end time."""
    n_slots = 1 + max(lr.end_time for lr in lrs)
    return [
        sum(lr.size for lr in lrs if lr.start_time <= t <= lr.end_time)
        for t in range(n_slots)
    ]
erik.andersson@arm.com3438c922021-03-24 10:32:09 +0100193
194
def allocate(
    sg,
    arch,
    mem_area,
    mem_type_set,
    tensor_allocator=TensorAllocator.Greedy,
    lr_graph=None,
    cpu_tensor_alignment=Tensor.AllocationQuantum,
    hillclimb_max_iterations=None,
    verbose_progress=False,
):
    """Extract live ranges for `sg` and assign addresses to its tensors.

    Dispatches to the allocator selected by `tensor_allocator` and returns
    (lrs, total_sz): the live range graph and the total allocated size
    (0 when there is nothing to allocate).

    Raises AllocationError for an unrecognised allocator.
    """
    lrs = live_range.extract_live_ranges_from_cascaded_passes(
        sg,
        mem_area,
        mem_type_set,
        lr_graph=lr_graph,
        cpu_tensor_alignment=cpu_tensor_alignment,
        verbose_progress=verbose_progress,
    )
    total_sz = 0
    if lrs.ranges:
        tens_alloc = tensor_allocator
        if tens_alloc == TensorAllocator.Greedy:
            total_sz = greedy_allocate_live_ranges(lrs, cpu_tensor_alignment)
            verify_allocation(lrs, cpu_tensor_alignment)
        elif tens_alloc == TensorAllocator.LinearAlloc:
            total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
        elif tens_alloc == TensorAllocator.HillClimb:
            # Hill climb is bounded by the size of the fastest scratch memory if present
            mem_type = MemType.Scratch_fast if MemType.Scratch_fast in mem_type_set else list(mem_type_set)[0]
            mem_size = arch.mem_type_size(mem_type)
            total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment, hillclimb_max_iterations, mem_size)
        else:
            # Fail loudly on an unknown allocator. The previous `assert 0` is
            # stripped under `python -O`, which would silently return an
            # unallocated live range graph.
            raise AllocationError(f"Unsupported tensor allocator: {tens_alloc}")
    return lrs, total_sz
230
231
def allocate_tensors(
    nng,
    sg,
    arch,
    mem_area,
    mem_type_set,
    tensor_allocator=TensorAllocator.Greedy,
    verbose_allocation=False,
    verbose_progress=False,
    lr_graph=None,
    cpu_tensor_alignment=Tensor.AllocationQuantum,
    hillclimb_max_iterations=None,
    max_size=None,
    dry_test=False,
):
    """Allocate addresses to tensors in subgraph `sg`.

    Returns False when the allocation does not fit within `max_size`
    (and undoes the addresses), True otherwise. With `dry_test` set the
    addresses are always undone and only the fit result is returned.
    On success the subgraph's memory accounting is updated and, for
    Sram, the per-cascaded-pass usage is marked on all subgraphs.
    """
    lrs, total_sz = allocate(
        sg,
        arch,
        mem_area,
        mem_type_set,
        tensor_allocator=tensor_allocator,
        lr_graph=lr_graph,
        cpu_tensor_alignment=cpu_tensor_alignment,
        hillclimb_max_iterations=hillclimb_max_iterations,
        verbose_progress=verbose_progress,
    )

    if lrs.ranges:
        alloc_ok = max_size is None or total_sz <= max_size
        if dry_test or not alloc_ok:
            # Dry run, or over budget: roll back all assigned addresses
            for rng in lrs.ranges.values():
                rng.set_address(None)
            return alloc_ok

        # Accumulate this allocation into the subgraph's per-area total
        sg.memory_used[mem_area] = sg.memory_used.get(mem_area, 0) + total_sz

        # Track how much is used for scratch or permanent storage for NPU
        for mem_type in mem_type_set:
            sg.memory_used_per_type[mem_type] = sg.memory_used_per_type.get(mem_type, 0) + total_sz

        if verbose_allocation:
            print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, total_sz)

        if mem_area == MemArea.Sram:
            # Mark Sram usage for all subgraphs
            for subgraph in nng.subgraphs:
                mark_sram_used_for_cascaded_passes(subgraph, lrs)

    if sg == nng.get_root_subgraph():
        nng.memory_used = sg.memory_used
    return True