# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Contains the main sequencing of the compiler.
import time

from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import high_level_command_to_npu_op
from . import live_range
from . import lut
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import scheduler
from . import tensor_allocation
from .debug_database import DebugDatabase
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .operation import Op
from .rewrite_graph import verify_graph_health
from .rewrite_graph import visit_graph_post_order
from .scheduler import OptimizationStrategy
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .utils import progress_print


class CompilerOptions:
    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.

    Note the difference between ArchitectureFeatures and CompilerOptions
    - ArchitectureFeatures is for changing the Ethos-U and system architecture
    - CompilerOptions is for changing the behaviour of the compiler"""

    def __init__(
        self,
        verbose_graph=False,
        verbose_quantization=False,
        verbose_packing=False,
        verbose_tensor_purpose=False,
        verbose_tensor_format=False,
        verbose_allocation=False,
        verbose_high_level_command_stream=False,
        verbose_register_command_stream=False,
        verbose_operators=False,
        verbose_weights=False,
        verbose_performance=False,
        verbose_progress=False,
        show_cpu_operations=False,
        tensor_allocator=TensorAllocator.Greedy,
        timing=False,
        force_symmetric_int_weights=False,
        output_dir="outputs",
        cpu_tensor_alignment=Tensor.AllocationQuantum,
        hillclimb_max_iterations=None,
    ):

        self.verbose_graph = verbose_graph
        self.verbose_quantization = verbose_quantization
        self.verbose_packing = verbose_packing
        self.verbose_tensor_purpose = verbose_tensor_purpose
        self.verbose_tensor_format = verbose_tensor_format
        self.verbose_allocation = verbose_allocation
        self.verbose_high_level_command_stream = verbose_high_level_command_stream
        self.verbose_register_command_stream = verbose_register_command_stream
        self.verbose_operators = verbose_operators
        self.verbose_weights = verbose_weights
        self.verbose_performance = verbose_performance
        self.verbose_progress = verbose_progress
        self.show_cpu_operations = show_cpu_operations
        self.tensor_allocator = tensor_allocator
        self.timing = timing
        self.force_symmetric_int_weights = force_symmetric_int_weights
        self.output_dir = output_dir
        self.cpu_tensor_alignment = cpu_tensor_alignment
        self.hillclimb_max_iterations = hillclimb_max_iterations

    def __str__(self):
        return type(self).__name__ + ": " + str(self.__dict__)

    __repr__ = __str__

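# A minimal construction sketch for CompilerOptions (illustrative values only;
# all keyword arguments shown are real parameters of __init__ above):
#
#     opts = CompilerOptions(verbose_graph=True, timing=True, output_dir="out")
#     print(opts)  # -> "CompilerOptions: {'verbose_graph': True, ...}"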

def next_sram_factor(alloc_results):
    # Bisects to find the max SRAM usage that can successfully be fitted by the tensor allocator.
    # Returns a tuple (factor, dry_test), where factor is None (stop) or 0 <= factor <= 1
    # (the next SRAM factor to try); dry_test is True while still bisecting.
    upper = 1.0
    lower = 0.7
    MAX_ITERATIONS = 8
    if len(alloc_results) == 0:
        # First iteration, try max SRAM, keep the result if it succeeds
        return (upper, False)
    elif len(alloc_results) == 1:
        if alloc_results[0]:
            # The allocator succeeded on the first try; stop
            return (None, False)
        else:
            # Start bisecting, try the lower bound SRAM
            return (lower, True)
    elif len(alloc_results) > MAX_ITERATIONS:
        # Stop
        return (None, False)
    if not alloc_results[1]:
        # Allocation at lower failed; search the interval 0 - lower
        upper = lower
        lower = 0
    best = lower
    for success in alloc_results[2:]:
        middle = (lower + upper) / 2
        if success:
            best = max(best, middle)
            lower = middle
        else:
            upper = middle
    if len(alloc_results) == MAX_ITERATIONS:
        # Done bisecting; repeat the best match, but not as a dry test
        return (best, False)
    # Next try; run only as a dry test
    return ((lower + upper) / 2, True)

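# Illustrative trace of next_sram_factor (each entry in alloc_results is the
# outcome of one allocation attempt at the factor returned by the previous call):
#
#     []                   -> (1.0, False)   first attempt: full SRAM, for real
#     [False]              -> (0.7, True)    full SRAM failed; dry-run the lower bound
#     [False, True]        -> (0.85, True)   0.7 fitted; bisect the interval 0.7..1.0
#     [False, True, False] -> (0.775, True)  0.85 failed; bisect the interval 0.7..0.85
#     ...after MAX_ITERATIONS results       -> (best, False), re-run the best factor for real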
140
Tim Halle6ccd872020-11-09 16:46:37 +0000141def _record_operator(op, arch):
wilisa0179a89042022-11-02 17:18:43 +0000142 if op.type not in (Op.Const, Op.Placeholder):
Tim Halle6ccd872020-11-09 16:46:37 +0000143 DebugDatabase.add_source(op)
144
145
Tim Halld8339a72021-05-27 18:49:40 +0100146def _check_schedule(nng, arch, scheduler_options):
147 # check sram usage for optimisation strategy
148 sram_usage = nng.get_root_subgraph().memory_used.get(MemArea.Sram)
149 if sram_usage is not None and scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
150 if sram_usage > scheduler_options.optimization_sram_limit:
151 print(
152 f"Warning: SRAM target for arena memory area exceeded."
153 f" Target = {scheduler_options.optimization_sram_limit} Bytes,"
154 f" Actual = {sram_usage} Bytes"
155 )
156
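# Example of the warning emitted by _check_schedule (the numbers are illustrative):
#
#     Warning: SRAM target for arena memory area exceeded. Target = 400000 Bytes, Actual = 412352 Bytes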
157
wilisa0189a8cdd2022-08-22 16:13:06 +0000158def compiler_driver(nng, arch, options, scheduler_options, network_type, output_basename):
Tim Hall79d07d22020-04-27 18:20:16 +0100159 assert verify_graph_health(nng)
Raul Farkas1c54ac12023-04-26 07:49:15 +0100160 verbose_progress = scheduler_options.verbose_progress
Tim Halle6ccd872020-11-09 16:46:37 +0000161
162 # Pre-optimisation operator tracking
163 for sg in nng.subgraphs:
164 visit_graph_post_order(sg.output_tensors, arch, [], [_record_operator])
165
Raul Farkas1c54ac12023-04-26 07:49:15 +0100166 progress_print(verbose_progress, "Performing graph optimisation")
wilisa0146c94772023-02-08 09:56:14 +0000167 nng = graph_optimiser.optimise_graph(
168 nng, arch, network_type, options.verbose_graph, options.force_symmetric_int_weights
169 )
Tim Hall79d07d22020-04-27 18:20:16 +0100170 assert verify_graph_health(nng)
171
172 if options.verbose_quantization:
173 nng.print_graph_with_tensor_quantization()
174
Raul Farkas1c54ac12023-04-26 07:49:15 +0100175 progress_print(verbose_progress, "Defining tensor purpose")
Tim Hall79d07d22020-04-27 18:20:16 +0100176 nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
177 assert verify_graph_health(nng)
Raul Farkas1c54ac12023-04-26 07:49:15 +0100178
179 progress_print(verbose_progress, "Performing pass packing")
Tim Hall79d07d22020-04-27 18:20:16 +0100180 pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
181 assert verify_graph_health(nng)
182
Raul Farkas1c54ac12023-04-26 07:49:15 +0100183 progress_print(verbose_progress, "Extracting npu subgraphs")
Tim Hall79d07d22020-04-27 18:20:16 +0100184 extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)
185
Tim Hall79d07d22020-04-27 18:20:16 +0100186 assert verify_graph_health(nng)
187 if options.timing:
188 start = time.time()
189
Raul Farkas1c54ac12023-04-26 07:49:15 +0100190 progress_print(verbose_progress, "Scheduling passes")
Tim Hall79d07d22020-04-27 18:20:16 +0100191 # Run the scheduler
Tim Halld8339a72021-05-27 18:49:40 +0100192 scheduler.schedule_passes(nng, arch, options, scheduler_options)
193 _check_schedule(nng, arch, scheduler_options)
Tim Hall79d07d22020-04-27 18:20:16 +0100194
195 if options.timing:
196 stop = time.time()
197 print("Scheduling took %f s" % (stop - start))
198 start = time.time()
199
Tim Hall79d07d22020-04-27 18:20:16 +0100200 # LiveRanges for constant tensors for all Npu subgraphs
201 permanent_storage = arch.permanent_storage_mem_area
202 lr_graph_flash = live_range.LiveRangeGraph()
203
204 # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
205 scratch_tens = None
Patrik Gustavsson3ab94522020-06-29 17:36:55 +0200206 scratch_fast_tens = None
Tim Hall79d07d22020-04-27 18:20:16 +0100207 flash_tens = None
208
Dwight Lidman62cdfe52021-10-11 16:39:10 +0200209 # Create list of NPU subgraphs with same order as the list of all subgraphs
210 npu_subgraphs = [sg for sg in nng.subgraphs if sg.placement == PassPlacement.Npu]
Tim Hall79d07d22020-04-27 18:20:16 +0100211
Raul Farkas1c54ac12023-04-26 07:49:15 +0100212 progress_print(verbose_progress, "Calculating live ranges for constant NPU tensors")
Dwight Lidman62cdfe52021-10-11 16:39:10 +0200213 # Calculate live ranges for all constant Npu tensors, in permanent storage
214 for sg in npu_subgraphs:
215 lr_graph_flash = live_range.create_linear_live_range_graph(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200216 sg,
217 permanent_storage,
218 MemType.Permanent_NPU,
219 lr_graph=lr_graph_flash,
Dwight Lidman62cdfe52021-10-11 16:39:10 +0200220 )
221
222 if npu_subgraphs:
Raul Farkas1c54ac12023-04-26 07:49:15 +0100223 progress_print(verbose_progress, "Allocating NPU constant tensors to the first NPU subgraph")
Tim Hall25f605c2020-05-18 18:04:26 +0100224 # Allocate all Npu constant tensors to the first Npu subgraph since it is
225 # processed first during serialization into tensors
Dwight Lidman62cdfe52021-10-11 16:39:10 +0200226 first_npu_sg = npu_subgraphs[0]
Tim Hall25f605c2020-05-18 18:04:26 +0100227 tensor_allocation.allocate_tensors(
228 nng,
229 first_npu_sg,
230 arch,
231 permanent_storage,
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200232 set((MemType.Permanent_NPU,)),
Louis Verhaard0b9c9a32020-09-15 14:05:38 +0200233 tensor_allocator=TensorAllocator.LinearAlloc,
234 verbose_allocation=options.verbose_allocation,
Louis Verhaard0b9c9a32020-09-15 14:05:38 +0200235 lr_graph=lr_graph_flash,
Tim Hall25f605c2020-05-18 18:04:26 +0100236 )
Tim Hall79d07d22020-04-27 18:20:16 +0100237
Tim Hall79d07d22020-04-27 18:20:16 +0100238 root_sg = nng.get_root_subgraph()
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200239
Raul Farkas1c54ac12023-04-26 07:49:15 +0100240 progress_print(verbose_progress, "Generating command stream")
Tim Hall79d07d22020-04-27 18:20:16 +0100241 # Generate command streams and serialise Npu-ops into tensors
Dwight Lidman62cdfe52021-10-11 16:39:10 +0200242 for sg in npu_subgraphs:
243 high_level_command_stream_generator.generate_high_level_command_stream_for_schedule(
244 nng, sg, arch, options.verbose_high_level_command_stream
245 )
246 lut.optimize_high_level_cmd_stream(sg, arch)
247 high_level_command_to_npu_op.generate_register_command_stream_for_sg(
248 nng, sg, arch, options.verbose_register_command_stream
249 )
250 scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
251 sg, arch, scratch_tens, scratch_fast_tens, flash_tens
252 )
Tim Hall79d07d22020-04-27 18:20:16 +0100253
Johan Alfvén673683b2022-09-05 09:39:47 +0200254 # Create list of CPU subgraphs with same order as the list of all subgraphs
255 cpu_subgraphs = [sg for sg in nng.subgraphs if sg.placement == PassPlacement.Cpu]
256 for sg in cpu_subgraphs:
257 npu_serialisation.rewrite_npu_call_ops(sg, arch)
Tim Hall79d07d22020-04-27 18:20:16 +0100258
Jacob Bohlin268394d2020-08-13 13:24:59 +0200259 # Set Scratch and Fast_scratch Tensor size
260 if scratch_tens is not None:
261 scratch_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch, 0)])
262 if scratch_fast_tens is not None:
263 scratch_fast_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)])
264
Raul Farkas1c54ac12023-04-26 07:49:15 +0100265 progress_print(verbose_progress, "Allocating CPU constant tensors")
Tim Hall79d07d22020-04-27 18:20:16 +0100266 # Allocate all Cpu constant tensors, this is done last because the Npu-ops
267 # have to be serialized into flash and scratch tensors first
268 tensor_allocation.allocate_tensors(
269 nng,
270 root_sg,
271 arch,
272 permanent_storage,
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200273 set((MemType.Permanent_CPU,)),
Louis Verhaard0b9c9a32020-09-15 14:05:38 +0200274 tensor_allocator=TensorAllocator.LinearAlloc,
275 verbose_allocation=options.verbose_allocation,
Tim Hallb9b515c2020-11-01 21:27:19 +0000276 cpu_tensor_alignment=options.cpu_tensor_alignment,
Tim Hall79d07d22020-04-27 18:20:16 +0100277 )
Raul Farkas1c54ac12023-04-26 07:49:15 +0100278 progress_print(verbose_progress, "Calculating new performance for the network")
wilisa0189a8cdd2022-08-22 16:13:06 +0000279 npu_performance.calc_new_performance_for_network(
280 nng, arch, network_type, options.verbose_performance, output_basename
281 )
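
# A rough call sketch (the caller wiring is an assumption; in practice the tool's
# top level parses the input network and builds nng/arch/options before this call):
#
#     compiler_driver(nng, arch, options, scheduler_options, network_type, output_basename)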