Blame - ethosu/vela/vela.py - ml/ethos-u/ethos-u-vela

blob: 49f8c26c00f9aa046ca27028f19e3565d6f34100 [file] [log] [blame]

Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	16	# Description:
				17	# Main entry point for the Vela compiler.
				18	#
				19	# Provides command line interface, options parsing, and network loading before calling the compiler driver.
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame^]	20	import argparse
				21	import ast
				22	import configparser
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	23	import os.path
				24	import sys
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	25	import time
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	26
				27	from . import architecture_features
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame^]	28	from . import compiler_driver
				29	from . import model_reader
				30	from . import scheduler
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	31	from . import stats_writer
				32	from . import tflite_writer
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	33	from ._version import __version__
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame^]	34	from .nn_graph import PassPlacement
				35	from .nn_graph import TensorAllocator
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	36	from .scheduler import ParetoMetric
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	37	from .tensor import MemArea
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	38
				39
				40	def process(fname, arch, model_reader_options, compiler_options, scheduler_options):
				41	if compiler_options.timing:
				42	start = time.time()
				43
				44	nng = model_reader.read_model(fname, model_reader_options)
				45
				46	if not nng:
				47	print("reading of", fname, "failed")
				48	assert False
				49
				50	if compiler_options.verbose_operators:
				51	nng.print_operators()
				52
				53	if compiler_options.timing:
				54	stop = time.time()
				55	print("Model reading took %f s" % (stop - start))
				56	start = time.time()
				57
				58	compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options)
				59
				60	passes_csv_file = "%s/%s_pass-breakdown_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
				61	stats_writer.write_pass_metrics_csv(nng, passes_csv_file)
				62
				63	summary_csv_file = "%s/%s_summary_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
				64	stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch)
				65
				66	stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch)
				67
				68	if fname.endswith(".tflite"):
				69	tflite_writer.write_tflite(nng, "%s/%s_vela.tflite" % (compiler_options.output_dir, nng.name))
				70
				71	if compiler_options.timing:
				72	stop = time.time()
				73	print("Compiler driver took %f s" % (stop - start))
				74
				75	return nng
				76
				77
				78	def print_subgraph_io_summary(nng):
				79	"""Print a summary of all the input and output tensor sizes for all subgraphs.
				80	Also displays the total tensor size and the memory used area for sram.
				81	"""
				82
				83	print("Subgraph IO Summary")
				84	print("-------------------")
				85	print("NNG: {0}".format(nng.name))
				86	max_sg_size = 0
				87	for sg in reversed(nng.subgraphs):
				88	print(" Subgraph: {0} = {1}".format(sg.name, sg.placement))
				89	sg_size = 0
				90
				91	if sg.placement == PassPlacement.Npu:
				92	for tens in sg.input_tensors + [sg.scratch_tensor] + sg.output_tensors:
				93	if tens in sg.input_tensors:
				94	tens_dir = "In"
				95	elif tens in sg.output_tensors:
				96	tens_dir = "Out"
				97	else:
				98	tens_dir = "In/Out"
				99
				100	size = tens.elements() * tens.element_size() / 1024.0
				101	sg_size = sg_size + size
				102	print(" Tensor [{0}]: {1} = {2} KiB".format(tens_dir, tens.name, size))
				103
				104	print(" Total Size = {0} KiB".format(sg_size))
				105	print(" SRAM Memory Used = {0} KiB".format(sg.memory_used.get(MemArea.Sram, 0) / 1024.0))
				106	max_sg_size = max(sg_size, max_sg_size)
				107
				108	print(" Maximum Subgraph Size = {0} KiB".format(max_sg_size))
				109
				110
				111	def main(args=None):
				112	if args is None:
				113	args = sys.argv[1:]
				114
				115	parser = argparse.ArgumentParser(prog="vela", description="Neural network model compiler for Ethos-U55")
				116
				117	parser.add_argument(
				118	"network", metavar="NETWORK", type=str, default=None, nargs=None, help="Filename of network to process"
				119	)
				120
				121	parser.add_argument("--version", action="version", version=__version__)
				122	parser.add_argument(
				123	"--output-dir", type=str, default="output", help="Output directory to write files to (default: %(default)s)"
				124	)
				125	parser.add_argument("--config", type=str, help="Location of vela configuration file")
				126	parser.add_argument("--batch-size", type=int, default=1, help="Batch size (default: %(default)s)")
				127
				128	parser.add_argument("--verbose-graph", action="store_true", help="Verbose graph rewriter")
				129	parser.add_argument("--verbose-quantization", action="store_true", help="Verbose quantization")
				130	parser.add_argument("--verbose-packing", action="store_true", help="Verbose pass packing")
				131	parser.add_argument("--verbose-tensor-purpose", action="store_true", help="Verbose tensor purpose")
				132	parser.add_argument("--verbose-tensor-format", action="store_true", help="Verbose tensor format")
				133	parser.add_argument("--verbose-schedule", action="store_true", help="Verbose schedule")
				134	parser.add_argument(
				135	"--verbose-pareto-frontier-schedules",
				136	action="store_true",
				137	help="Show all schedules along the pareto frontier of optimisation criteria",
				138	)
				139	parser.add_argument("--verbose-allocation", action="store_true", help="Verbose tensor allocation")
				140	parser.add_argument(
				141	"--verbose-high-level-command-stream", action="store_true", help="Verbose high level command stream"
				142	)
				143	parser.add_argument(
				144	"--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
				145	)
				146	parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
				147
				148	parser.add_argument(
				149	"--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
				150	)
				151	parser.add_argument(
				152	"--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
				153	)
				154	parser.add_argument(
				155	"--cascading",
				156	type=ast.literal_eval,
				157	default=True,
				158	choices=[True, False],
				159	help="Controls the packing of multiple passes into a cascade (default: %(default)s)",
				160	)
				161	parser.add_argument(
				162	"--ifm-ofm-overlap",
				163	type=ast.literal_eval,
				164	default=True,
				165	choices=[True, False],
				166	help="Controls the overlapping of IFM and OFM buffers (default: %(default)s)",
				167	)
				168	parser.add_argument("--force-block-config", type=str, default="", help="Force a specific block configuration HxWxC")
				169	parser.add_argument(
				170	"--inter-pass-cycle-delay",
				171	type=int,
				172	default=0,
				173	help="Artificial delay between passes, measured in NPU cycles (default: %(default)s)",
				174	)
				175	parser.add_argument("--timing", action="store_true", help="Time the compiler doing operations")
				176	parser.add_argument(
				177	"--accelerator-config",
				178	type=str,
				179	default="ethos-u55-256",
				180	choices=list(architecture_features.ArchitectureFeatures.accelerator_configs.keys()),
				181	help="Accelerator configuration to use (default: %(default)s)",
				182	)
				183	parser.add_argument(
				184	"--system-config",
				185	type=str,
				186	default="internal-default",
				187	help="System configuration to use (default: %(default)s)",
				188	)
				189	parser.add_argument(
				190	"--dram-bandwidth",
				191	type=float,
				192	default=0.0,
				193	help="DRAM memory bandwidth in GB/s, use zero to select the value from system config (default: %(default)s)",
				194	)
				195	parser.add_argument(
				196	"--permanent-storage",
				197	default=MemArea.OffChipFlash,
				198	type=lambda s: MemArea[s],
				199	choices=list(MemArea)[3:-1],
				200	help=(
				201	"Memory area for permanent storage. To store the weights and other constant data in SRAM select "
				202	"'OnChipFlash' (default: %(default)s)"
				203	),
				204	)
				205	parser.add_argument(
				206	"--tensor-allocator",
				207	default=TensorAllocator.Greedy,
				208	type=lambda s: TensorAllocator[s],
				209	choices=list(TensorAllocator),
				210	help="Tensor Allocator algorithm (default: %(default)s)",
				211	)
				212	parser.add_argument(
				213	"--show-subgraph-io-summary",
				214	action="store_true",
				215	help="Shows a summary of all the subgraphs and their inputs and outputs",
				216	)
				217	parser.add_argument(
				218	"--ifm-streaming",
				219	type=ast.literal_eval,
				220	default=True,
				221	choices=[True, False],
				222	help="Controls scheduler IFM streaming search (default: %(default)s)",
				223	)
				224	parser.add_argument(
				225	"--block-config-limit",
				226	type=int,
				227	default=16,
				228	help="Limit block config search space, use zero for unlimited (default: %(default)s)",
				229	)
				230	parser.add_argument(
				231	"--global-memory-clock-scale",
				232	type=float,
				233	default=1.0,
				234	help=(
				235	"Performs an additional scaling of the individual memory clock scales specified by the system config "
				236	"(default: %(default)s)"
				237	),
				238	)
				239	parser.add_argument(
				240	"--pareto-metric",
				241	default=ParetoMetric.BwCycMem,
				242	type=lambda s: ParetoMetric[s],
				243	choices=list(ParetoMetric),
				244	help="Controls the calculation of the pareto metric (default: %(default)s)",
				245	)
				246	parser.add_argument(
				247	"--recursion-limit",
				248	type=int,
				249	default=10000,
				250	help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
				251	)
				252	parser.add_argument(
				253	"--max-block-dependency",
				254	type=int,
				255	default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
				256	choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
				257	help=(
				258	"Set the maximum value that can be used for the block dependency between npu kernel operations "
				259	"(default: %(default)s)"
				260	),
				261	)
				262
				263	args = parser.parse_args(args=args)
				264
				265	# Read configuration file
				266	config_file = args.config
				267	config = None
				268	if config_file is not None:
				269	with open(config_file) as f:
				270	config = configparser.ConfigParser()
				271	config.read_file(f)
				272
				273	if args.network is None:
				274	parser.error("the following argument is required: NETWORK")
				275
				276	sys.setrecursionlimit(args.recursion_limit)
				277
				278	if args.force_block_config:
				279	force_block_config = architecture_features.Block.from_string(args.force_block_config)
				280	else:
				281	force_block_config = None
				282
				283	arch = architecture_features.ArchitectureFeatures(
				284	vela_config=config,
				285	system_config=args.system_config,
				286	accelerator_config=args.accelerator_config,
				287	permanent_storage=args.permanent_storage,
				288	inter_pass_cycle_delay=args.inter_pass_cycle_delay,
				289	dram_bandwidth=args.dram_bandwidth,
				290	override_block_config=force_block_config,
				291	block_config_limit=args.block_config_limit,
				292	global_memory_clock_scale=args.global_memory_clock_scale,
				293	max_blockdep=args.max_block_dependency,
				294	)
				295
				296	compiler_options = compiler_driver.CompilerOptions(
				297	verbose_graph=args.verbose_graph,
				298	verbose_quantization=args.verbose_quantization,
				299	verbose_packing=args.verbose_packing,
				300	verbose_tensor_purpose=args.verbose_tensor_purpose,
				301	verbose_tensor_format=args.verbose_tensor_format,
				302	verbose_allocation=args.verbose_allocation,
				303	verbose_high_level_command_stream=args.verbose_high_level_command_stream,
				304	verbose_register_command_stream=args.verbose_register_command_stream,
				305	verbose_operators=args.verbose_operators,
				306	show_minimum_possible_allocation=args.show_minimum_possible_allocation,
				307	show_cpu_operations=args.show_cpu_operations,
				308	tensor_allocator=args.tensor_allocator,
				309	timing=args.timing,
				310	output_dir=args.output_dir,
				311	)
				312
				313	scheduler_options = scheduler.SchedulerOptions(
				314	use_cascading=args.cascading,
				315	use_ifm_ofm_overlap=args.ifm_ofm_overlap,
				316	verbose_schedule=args.verbose_schedule,
				317	verbose_pareto_frontier_schedules=args.verbose_pareto_frontier_schedules,
				318	use_ifm_streaming=args.ifm_streaming,
				319	pareto_metric=args.pareto_metric,
				320	)
				321
				322	model_reader_options = model_reader.ModelReaderOptions(batch_size=args.batch_size)
				323
				324	os.makedirs(args.output_dir, exist_ok=True)
				325
				326	nng = process(args.network, arch, model_reader_options, compiler_options, scheduler_options)
				327
				328	if args.show_subgraph_io_summary:
				329	print_subgraph_io_summary(nng)
				330
				331	return 0