# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Contains the main sequencing of the compiler.
import time

from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import insert_dma
from . import live_range
from . import lut
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import register_command_stream_generator
from . import scheduler
from . import tensor_allocation
from . import weight_compressor
from .errors import VelaError
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .rewrite_graph import verify_graph_health
from .tensor import MemType


class CompilerOptions:
42 """Set of options to change compiler behaviour - verbosity, targets, turning off passes.
43
44Note the difference between ArchitectureFeatures and CompilerOptions
45- ArchitectureFeatures is for changing the Ethos-U55 and system architecture
46- CompilerOptions is for changing the behaviour of the compiler
47"""
48
49 def __init__(
50 self,
51 verbose_graph=False,
52 verbose_quantization=False,
53 verbose_packing=False,
54 verbose_tensor_purpose=False,
55 verbose_tensor_format=False,
56 verbose_allocation=False,
57 verbose_high_level_command_stream=False,
58 verbose_register_command_stream=False,
59 verbose_operators=False,
60 show_minimum_possible_allocation=False,
61 show_cpu_operations=False,
62 tensor_allocator=TensorAllocator.Greedy,
63 timing=False,
64 output_dir="outputs",
65 ):
66
67 self.verbose_graph = verbose_graph
68 self.verbose_quantization = verbose_quantization
69 self.verbose_packing = verbose_packing
70 self.verbose_tensor_purpose = verbose_tensor_purpose
71 self.verbose_tensor_format = verbose_tensor_format
72 self.verbose_allocation = verbose_allocation
73 self.verbose_high_level_command_stream = verbose_high_level_command_stream
74 self.verbose_register_command_stream = verbose_register_command_stream
75 self.verbose_operators = verbose_operators
76 self.show_minimum_possible_allocation = show_minimum_possible_allocation
77 self.show_cpu_operations = show_cpu_operations
78 self.tensor_allocator = tensor_allocator
79 self.timing = timing
80 self.output_dir = output_dir
81
82 def __str__(self):
83 return type(self).__name__ + ": " + str(self.__dict__)
84
85 __repr__ = __str__
86
87
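# Minimal usage sketch (illustrative only; assumes a parsed network graph `nng`,
# an ArchitectureFeatures instance `arch` and a scheduler options object as
# expected by scheduler.schedule_passes()):
#
#     options = CompilerOptions(verbose_packing=True, timing=True)
#     compiler_driver(nng, arch, options, scheduler_options)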
def compiler_driver(nng, arch, options, scheduler_options):
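    """Sequence the compiler passes for the network nng on the given architecture.

    The main stages are: graph optimisation, tensor purpose and format marking,
    DMA insertion, pass packing, Npu subgraph extraction, scheduling, weight
    compression, tensor allocation, command stream generation and Npu
    serialisation, followed by a performance calculation for the network.
    """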
    assert verify_graph_health(nng)
    nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    if options.verbose_quantization:
        nng.print_graph_with_tensor_quantization()

    nng = graph_optimiser.optimise_graph_b(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
    assert verify_graph_health(nng)
    nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)
    pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
    assert verify_graph_health(nng)

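    # Extract the Npu-placed passes into separate Npu subgraphs, leaving the
    # Cpu operations in the root subgraph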
    extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)

    mark_tensors.mark_tensor_format(nng, arch, options.verbose_tensor_format)
    assert verify_graph_health(nng)
    if options.timing:
        start = time.time()

    # Run the scheduler
    scheduler.schedule_passes(nng, arch, scheduler_options)

    if options.timing:
        stop = time.time()
        print("Scheduling took %f s" % (stop - start))
        start = time.time()

    # Update the compressed weights now that we have determined the
    # block config, and calc and pack the scales and biases
    weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)

    # LiveRanges for constant tensors for all Npu subgraphs
    permanent_storage = arch.permanent_storage_mem_area
    lr_graph_flash = live_range.LiveRangeGraph()

    # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
    scratch_tens = None
    scratch_fast_tens = None
    flash_tens = None

    # Calculate live ranges for all constant Npu tensors, in permanent storage
    for sg in nng.subgraphs:
        if sg.placement == PassPlacement.Npu:
            lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
                sg,
                permanent_storage,
                MemType.Permanent_NPU,
                ignore_subgraph_input_output_tensors=True,
                lr_graph=lr_graph_flash,
            )

    if len(nng.subgraphs) > 1:
        # Allocate all Npu constant tensors to the first Npu subgraph since it is
        # processed first during serialization into tensors
        first_npu_sg = nng.subgraphs[1]
        assert first_npu_sg.placement == PassPlacement.Npu
        tensor_allocation.allocate_tensors(
            nng,
            first_npu_sg,
            arch,
            permanent_storage,
            set((MemType.Permanent_NPU,)),
            scheduler_options.use_ifm_ofm_overlap,
            TensorAllocator.LinearAlloc,
            options.verbose_allocation,
            options.show_minimum_possible_allocation,
            lr_graph_flash,
        )

    # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
    # will start at the root subgraph's input and traverse from top to bottom. When
    # it comes across an Npu-op it will extract live ranges for its corresponding
    # Npu subgraph and add them to the root's live range graph.
    # The non-constant tensors are stored either in arch.feature_map_storage_mem_area or
    # arch.fast_storage_mem_area.
    # When these memory areas are the same, all non-constant tensors are allocated together.
    # Otherwise they are allocated separately.

    root_sg = nng.get_root_subgraph()

    alloc_list = []
    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
        alloc_list.append(mem_alloc_scratch)
        alloc_list.append(mem_alloc_scratch_fast)

    for alloc in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            alloc[0],
            alloc[1],
            scheduler_options.use_ifm_ofm_overlap,
            options.tensor_allocator,
            options.verbose_allocation,
            options.show_minimum_possible_allocation,
        )

    # Generate command streams and serialise Npu-ops into tensors
    for sg in nng.subgraphs:
        high_level_command_stream_generator.generate_high_level_command_stream(
            nng, sg, arch, options.verbose_high_level_command_stream
        )
        lut.optimize_high_level_cmd_stream(sg, arch)
        register_command_stream_generator.generate_register_command_stream(
            nng, sg, arch, options.verbose_register_command_stream
        )
        scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
            nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens
        )

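    # Rewrite the Npu call ops in the root (Cpu) subgraph now that the Npu
    # subgraphs have been serialised into the scratch and flash tensors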
    npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)

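    # When fast storage is a separate memory area, the scratch fast tensor must
    # fit within the SRAM size of the target system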
    if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area):
        if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size:
            raise VelaError(
                "SRAM limit of {} bytes exceeded by the scratch fast tensor ({} bytes)".format(
                    arch.sram_size, root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)
                )
            )

    # Allocate all Cpu constant tensors; this is done last because the Npu-ops
    # have to be serialised into flash and scratch tensors first
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        permanent_storage,
        set((MemType.Permanent_CPU,)),
        scheduler_options.use_ifm_ofm_overlap,
        TensorAllocator.LinearAlloc,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
    )

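    # Calculate a performance estimate for the compiled network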
    npu_performance.calc_performance_for_network(nng, arch)