# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Contains the main sequencing of the compiler.
import time

from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import insert_dma
from . import live_range
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import register_command_stream_generator
from . import scheduler
from . import tensor_allocation
from . import weight_compressor
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .rewrite_graph import verify_graph_health
from .tensor import MemArea


class CompilerOptions:
    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.

    Note the difference between ArchitectureFeatures and CompilerOptions:
    - ArchitectureFeatures is for changing the Ethos-U55 and system architecture
    - CompilerOptions is for changing the behaviour of the compiler
    """

    def __init__(
        self,
        verbose_graph=False,
        verbose_quantization=False,
        verbose_packing=False,
        verbose_tensor_purpose=False,
        verbose_tensor_format=False,
        verbose_allocation=False,
        verbose_high_level_command_stream=False,
        verbose_register_command_stream=False,
        verbose_operators=False,
        show_minimum_possible_allocation=False,
        show_cpu_operations=False,
        tensor_allocator=TensorAllocator.Greedy,
        timing=False,
        output_dir="outputs",
    ):

        self.verbose_graph = verbose_graph
        self.verbose_quantization = verbose_quantization
        self.verbose_packing = verbose_packing
        self.verbose_tensor_purpose = verbose_tensor_purpose
        self.verbose_tensor_format = verbose_tensor_format
        self.verbose_allocation = verbose_allocation
        self.verbose_high_level_command_stream = verbose_high_level_command_stream
        self.verbose_register_command_stream = verbose_register_command_stream
        self.verbose_operators = verbose_operators
        self.show_minimum_possible_allocation = show_minimum_possible_allocation
        self.show_cpu_operations = show_cpu_operations
        self.tensor_allocator = tensor_allocator
        self.timing = timing
        self.output_dir = output_dir

    def __str__(self):
        return type(self).__name__ + ": " + str(self.__dict__)

    __repr__ = __str__


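# Illustrative sketch (hypothetical names; nothing in this module calls this):
# a frontend builds the graph and options, then hands both to the driver:
#
#     options = CompilerOptions(verbose_graph=True, timing=True)
#     compiler_driver(nng, arch, options, scheduler_options)
#     # nng is modified in place by the stages that follow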
def compiler_driver(nng, arch, options, scheduler_options):
    assert verify_graph_health(nng)
    nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    if options.verbose_quantization:
        nng.print_graph_with_tensor_quantization()

    nng = graph_optimiser.optimise_graph_b(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
    assert verify_graph_health(nng)
    nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)
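    # Group operators into passes - the units that the scheduler and the
    # allocators below operate on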
    pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
    assert verify_graph_health(nng)

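    # Pull the Npu-placed passes out into their own subgraphs; Cpu passes stay
    # in the root subgraph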
    extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)

    mark_tensors.mark_tensor_format(nng, arch, options.verbose_tensor_format)
    assert verify_graph_health(nng)
    if options.timing:
        start = time.time()

    # Run the scheduler
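    # (among other things, this fixes the block config that the weight
    # compression step below depends on)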
    scheduler.schedule_passes(nng, arch, scheduler_options)

    if options.timing:
        stop = time.time()
        print("Scheduling took %f s" % (stop - start))
        start = time.time()

    # Update the compressed weights now that we have determined the
    # block config, and calculate and pack the scales and biases
    weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)

    # Memory area for all non-constant tensors (Cpu and Npu)
    non_const_mem_area = MemArea.Sram

    # LiveRanges for constant tensors for all Npu subgraphs
    permanent_storage = arch.permanent_storage_mem_area
    lr_graph_flash = live_range.LiveRangeGraph()

    # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
    scratch_tens = None
    flash_tens = None

    # Calculate live ranges for all constant Npu tensors, in permanent storage
    for sg in nng.subgraphs:
        if sg.placement == PassPlacement.Npu:
            lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
                sg, permanent_storage, ignore_subgraph_input_output_tensors=True, lr_graph=lr_graph_flash
            )

    assert len(nng.subgraphs) > 1, "Error: No operators can be hardware accelerated; cancelling compilation"

    # Allocate all Npu constant tensors to the first Npu subgraph since it is
    # processed first during serialisation into tensors
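    # (index 0 is the root Cpu subgraph; the assert above guarantees at least
    # one Npu subgraph follows it)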
    first_npu_sg = nng.subgraphs[1]
    assert first_npu_sg.placement == PassPlacement.Npu
    tensor_allocation.allocate_tensors(
        nng,
        first_npu_sg,
        arch,
        permanent_storage,
        scheduler_options.use_ifm_ofm_overlap,
        options.tensor_allocator,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
        lr_graph_flash,
    )

    # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
    # will start at the root subgraph's input and traverse from top to bottom. When
    # it comes across an Npu-op it will extract live ranges for its corresponding
    # Npu subgraph and add them to the root's live range graph. Finally, all of the
    # non-constant tensors are allocated together
    root_sg = nng.get_root_subgraph()
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        non_const_mem_area,
        scheduler_options.use_ifm_ofm_overlap,
        options.tensor_allocator,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
    )

    # Generate command streams and serialise Npu-ops into tensors
    for sg in nng.subgraphs:
        high_level_command_stream_generator.generate_high_level_command_stream(
            nng, sg, arch, options.verbose_high_level_command_stream
        )
        register_command_stream_generator.generate_register_command_stream(
            nng, sg, arch, options.verbose_register_command_stream
        )
        scratch_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
            nng, sg, arch, scratch_tens, flash_tens
        )

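    # Point the Cpu graph's Npu call operations at the command stream, scratch
    # and flash tensors serialised above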
    npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)

    # Allocate all Cpu constant tensors; this is done last because the Npu-ops
    # have to be serialised into flash and scratch tensors first
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        permanent_storage,
        scheduler_options.use_ifm_ofm_overlap,
        options.tensor_allocator,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
    )

    npu_performance.calc_performance_for_network(nng, arch)