Blame - ethosu/vela/tensor_allocation.py - ml/ethos-u/ethos-u-vela

blob: e3952df32ccea3e426e15afe0782d28cb02311b9 [file] [log] [blame]

Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	16	# Description:
				17	# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
				18	# worked out from the allowable overlaps that are calculated by the live range analysis.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	19	import math
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	20
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	21	import numpy as np
				22
				23	from . import live_range
				24	from . import numeric_util
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	25	from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	26	from .nn_graph import TensorAllocator
				27	from .tensor import MemArea
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	28
				29
def linear_allocate_live_ranges(live_ranges, alloc_granularity=16):
    """Assign an address to every live range by handing out increasing offsets.

    A tensor whose ``weight_compression_config`` is not None and compares
    equal to that of an already placed tensor reuses that tensor's address
    instead of receiving a fresh one.

    Args:
        live_ranges: object whose ``ranges`` mapping pairs each tensor with
            its live range.
        alloc_granularity: passed as the second argument to
            ``numeric_util.round_up`` when advancing the running offset
            (presumably the alignment of each allocation - confirm against
            ``numeric_util``).

    Returns:
        The running offset after the last fresh allocation.
    """
    next_free = 0
    placed = []

    for tens, lr in live_ranges.ranges.items():
        # Every tensor of an already handled live range was recorded below.
        if tens in placed:
            continue

        address = next_free
        config = tens.weight_compression_config
        if config is not None:
            # First already placed tensor with an equal compression config, if any.
            twin = next((other for other in placed if other.weight_compression_config == config), None)
            if twin is not None:
                address = twin.address

        lr.set_address(address)
        placed.extend(lr.tensors)

        if address != next_free:
            # An existing address was reused, so no new space is consumed.
            continue
        next_free += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)

    return next_free
				52
				53
				54	def mark_sram_used_for_cascaded_passes(sg, lrs):
				55	end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
				56	mem_usage = np.zeros(end_pos, dtype=np.int64)
				57
				58	for tens, rng in lrs.ranges.items():
				59	storage_size = tens.storage_size()
				60	mem_usage[rng.start_time : rng.end_time] += storage_size
				61
				62	for cps in sg.cascaded_passes:
				63	sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
				64	cps.sram_used = sram_used
				65	for ps in cps.passes:
				66	ps.sram_used = sram_used
				67
				68
				69	def print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation):
				70	if verbose_allocation:
				71	if mem_area == MemArea.Sram:
				72	print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
				73	else:
				74	print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
				75	for start_time, start, end, name, end_time in sorted(
				76	(
				77	lr.start_time,
				78	tens.address,
				79	tens.address + int(math.ceil(tens.storage_size())),
				80	tens.name + " " + str(tens.purpose),
				81	lr.end_time,
				82	)
				83	for tens, lr in lrs.ranges.items()
				84	):
				85	name = name.replace("\x00", "")
				86	print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
				87	print()
				88
				89	if show_minimum_possible_allocation and mem_area == MemArea.Sram:
				90	min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
				91	print(
				92	"Min possible allocation %d bytes / %.1f KB / %.1f MB"
				93	% (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
				94	)
				95
				96
				97	def allocate_tensors(
				98	nng,
				99	sg,
				100	arch,
				101	mem_area,
				102	use_ifm_ofm_overlap=True,
				103	tensor_allocator=TensorAllocator.Greedy,
				104	verbose_allocation=False,
				105	show_minimum_possible_allocation=False,
				106	lr_graph=None,
				107	):
				108	ignore_subgraph_input_output_tensors = False
				109	lrs = live_range.extract_live_ranges_from_cascaded_passes(
				110	sg,
				111	mem_area,
				112	mark_output_tensors_overlapping_with_input_tensors=False,
				113	use_ifm_ofm_overlap=use_ifm_ofm_overlap,
				114	ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
				115	lr_graph=lr_graph,
				116	)
				117
				118	if lrs.ranges:
				119	tens_alloc = tensor_allocator
				120	if tens_alloc == TensorAllocator.Greedy:
				121	total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, verbose_allocation)
				122	elif tens_alloc == TensorAllocator.LinearAlloc:
				123	total_sz = linear_allocate_live_ranges(lrs)
				124	else:
				125	assert 0
				126
				127	sg.memory_used[mem_area] = total_sz
				128
				129	nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
				130	nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)
				131
				132	print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation)
				133
				134	if mem_area == MemArea.Sram:
				135	# Mark Sram usage for all subgraphs
				136	for sg_ in nng.subgraphs:
				137	mark_sram_used_for_cascaded_passes(sg_, lrs)
				138
				139	if sg == nng.get_root_subgraph():
				140	nng.memory_used = sg.memory_used
				141	for mem_area in nng.total_elements.keys():
				142	try:
				143	nng.bits_per_element[mem_area] = nng.total_size[mem_area] * 8 / nng.total_elements[mem_area]
				144	except ZeroDivisionError:
				145	nng.bits_per_element[mem_area] = 0.0