Blame - ethosu/vela/vela.py - ml/ethos-u/ethos-u-vela

blob: f07aec896950fea271c27d0f05316ae737b36e6c [file] [log] [blame]

Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame^]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16
				17
				18	# Description:
				19	# Main entry point for the Vela compiler.
				20	#
				21	# Provides command line interface, options parsing, and network loading. Before calling the compiler driver.
				22
				23	import sys
				24	import os.path
				25	import os
				26	import time
				27	import subprocess
				28	import configparser
				29	import argparse
				30	import ast
				31
				32	from . import architecture_features
				33	from . import stats_writer
				34	from . import tflite_writer
				35	from . import model_reader
				36	from . import compiler_driver
				37	from . import scheduler
				38	from ._version import __version__
				39	from .scheduler import ParetoMetric
				40	from .nn_graph import MemArea, TensorFormat, TensorAllocator, PassPlacement
				41
				42
				43	def process(fname, arch, model_reader_options, compiler_options, scheduler_options):
				44	if compiler_options.timing:
				45	start = time.time()
				46
				47	nng = model_reader.read_model(fname, model_reader_options)
				48
				49	if not nng:
				50	print("reading of", fname, "failed")
				51	assert False
				52
				53	if compiler_options.verbose_operators:
				54	nng.print_operators()
				55
				56	if compiler_options.timing:
				57	stop = time.time()
				58	print("Model reading took %f s" % (stop - start))
				59	start = time.time()
				60
				61	compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options)
				62
				63	passes_csv_file = "%s/%s_pass-breakdown_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
				64	stats_writer.write_pass_metrics_csv(nng, passes_csv_file)
				65
				66	summary_csv_file = "%s/%s_summary_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
				67	stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch)
				68
				69	stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch)
				70
				71	if fname.endswith(".tflite"):
				72	tflite_writer.write_tflite(nng, "%s/%s_vela.tflite" % (compiler_options.output_dir, nng.name))
				73
				74	if compiler_options.timing:
				75	stop = time.time()
				76	print("Compiler driver took %f s" % (stop - start))
				77
				78	return nng
				79
				80
				81	def print_subgraph_io_summary(nng):
				82	"""Print a summary of all the input and output tensor sizes for all subgraphs.
				83	Also displays the total tensor size and the memory used area for sram.
				84	"""
				85
				86	print("Subgraph IO Summary")
				87	print("-------------------")
				88	print("NNG: {0}".format(nng.name))
				89	max_sg_size = 0
				90	for sg in reversed(nng.subgraphs):
				91	print(" Subgraph: {0} = {1}".format(sg.name, sg.placement))
				92	sg_size = 0
				93
				94	if sg.placement == PassPlacement.Npu:
				95	for tens in sg.input_tensors + [sg.scratch_tensor] + sg.output_tensors:
				96	if tens in sg.input_tensors:
				97	tens_dir = "In"
				98	elif tens in sg.output_tensors:
				99	tens_dir = "Out"
				100	else:
				101	tens_dir = "In/Out"
				102
				103	size = tens.elements() * tens.element_size() / 1024.0
				104	sg_size = sg_size + size
				105	print(" Tensor [{0}]: {1} = {2} KiB".format(tens_dir, tens.name, size))
				106
				107	print(" Total Size = {0} KiB".format(sg_size))
				108	print(" SRAM Memory Used = {0} KiB".format(sg.memory_used.get(MemArea.Sram, 0) / 1024.0))
				109	max_sg_size = max(sg_size, max_sg_size)
				110
				111	print(" Maximum Subgraph Size = {0} KiB".format(max_sg_size))
				112
				113
				114	def main(args=None):
				115	if args is None:
				116	args = sys.argv[1:]
				117
				118	parser = argparse.ArgumentParser(prog="vela", description="Neural network model compiler for Ethos-U55")
				119
				120	parser.add_argument(
				121	"network", metavar="NETWORK", type=str, default=None, nargs=None, help="Filename of network to process"
				122	)
				123
				124	parser.add_argument("--version", action="version", version=__version__)
				125	parser.add_argument(
				126	"--output-dir", type=str, default="output", help="Output directory to write files to (default: %(default)s)"
				127	)
				128	parser.add_argument("--config", type=str, help="Location of vela configuration file")
				129	parser.add_argument("--batch-size", type=int, default=1, help="Batch size (default: %(default)s)")
				130
				131	parser.add_argument("--verbose-graph", action="store_true", help="Verbose graph rewriter")
				132	parser.add_argument("--verbose-quantization", action="store_true", help="Verbose quantization")
				133	parser.add_argument("--verbose-packing", action="store_true", help="Verbose pass packing")
				134	parser.add_argument("--verbose-tensor-purpose", action="store_true", help="Verbose tensor purpose")
				135	parser.add_argument("--verbose-tensor-format", action="store_true", help="Verbose tensor format")
				136	parser.add_argument("--verbose-schedule", action="store_true", help="Verbose schedule")
				137	parser.add_argument(
				138	"--verbose-pareto-frontier-schedules",
				139	action="store_true",
				140	help="Show all schedules along the pareto frontier of optimisation criteria",
				141	)
				142	parser.add_argument("--verbose-allocation", action="store_true", help="Verbose tensor allocation")
				143	parser.add_argument(
				144	"--verbose-high-level-command-stream", action="store_true", help="Verbose high level command stream"
				145	)
				146	parser.add_argument(
				147	"--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
				148	)
				149	parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
				150
				151	parser.add_argument(
				152	"--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
				153	)
				154	parser.add_argument(
				155	"--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
				156	)
				157	parser.add_argument(
				158	"--cascading",
				159	type=ast.literal_eval,
				160	default=True,
				161	choices=[True, False],
				162	help="Controls the packing of multiple passes into a cascade (default: %(default)s)",
				163	)
				164	parser.add_argument(
				165	"--ifm-ofm-overlap",
				166	type=ast.literal_eval,
				167	default=True,
				168	choices=[True, False],
				169	help="Controls the overlapping of IFM and OFM buffers (default: %(default)s)",
				170	)
				171	parser.add_argument("--force-block-config", type=str, default="", help="Force a specific block configuration HxWxC")
				172	parser.add_argument(
				173	"--inter-pass-cycle-delay",
				174	type=int,
				175	default=0,
				176	help="Artificial delay between passes, measured in NPU cycles (default: %(default)s)",
				177	)
				178	parser.add_argument("--timing", action="store_true", help="Time the compiler doing operations")
				179	parser.add_argument(
				180	"--accelerator-config",
				181	type=str,
				182	default="ethos-u55-256",
				183	choices=list(architecture_features.ArchitectureFeatures.accelerator_configs.keys()),
				184	help="Accelerator configuration to use (default: %(default)s)",
				185	)
				186	parser.add_argument(
				187	"--system-config",
				188	type=str,
				189	default="internal-default",
				190	help="System configuration to use (default: %(default)s)",
				191	)
				192	parser.add_argument(
				193	"--dram-bandwidth",
				194	type=float,
				195	default=0.0,
				196	help="DRAM memory bandwidth in GB/s, use zero to select the value from system config (default: %(default)s)",
				197	)
				198	parser.add_argument(
				199	"--permanent-storage",
				200	default=MemArea.OffChipFlash,
				201	type=lambda s: MemArea[s],
				202	choices=list(MemArea)[3:-1],
				203	help=(
				204	"Memory area for permanent storage. To store the weights and other constant data in SRAM select "
				205	"'OnChipFlash' (default: %(default)s)"
				206	),
				207	)
				208	parser.add_argument(
				209	"--tensor-allocator",
				210	default=TensorAllocator.Greedy,
				211	type=lambda s: TensorAllocator[s],
				212	choices=list(TensorAllocator),
				213	help="Tensor Allocator algorithm (default: %(default)s)",
				214	)
				215	parser.add_argument(
				216	"--show-subgraph-io-summary",
				217	action="store_true",
				218	help="Shows a summary of all the subgraphs and their inputs and outputs",
				219	)
				220	parser.add_argument(
				221	"--ifm-streaming",
				222	type=ast.literal_eval,
				223	default=True,
				224	choices=[True, False],
				225	help="Controls scheduler IFM streaming search (default: %(default)s)",
				226	)
				227	parser.add_argument(
				228	"--block-config-limit",
				229	type=int,
				230	default=16,
				231	help="Limit block config search space, use zero for unlimited (default: %(default)s)",
				232	)
				233	parser.add_argument(
				234	"--global-memory-clock-scale",
				235	type=float,
				236	default=1.0,
				237	help=(
				238	"Performs an additional scaling of the individual memory clock scales specified by the system config "
				239	"(default: %(default)s)"
				240	),
				241	)
				242	parser.add_argument(
				243	"--pareto-metric",
				244	default=ParetoMetric.BwCycMem,
				245	type=lambda s: ParetoMetric[s],
				246	choices=list(ParetoMetric),
				247	help="Controls the calculation of the pareto metric (default: %(default)s)",
				248	)
				249	parser.add_argument(
				250	"--recursion-limit",
				251	type=int,
				252	default=10000,
				253	help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
				254	)
				255	parser.add_argument(
				256	"--max-block-dependency",
				257	type=int,
				258	default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
				259	choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
				260	help=(
				261	"Set the maximum value that can be used for the block dependency between npu kernel operations "
				262	"(default: %(default)s)"
				263	),
				264	)
				265
				266	args = parser.parse_args(args=args)
				267
				268	# Read configuration file
				269	config_file = args.config
				270	config = None
				271	if config_file is not None:
				272	with open(config_file) as f:
				273	config = configparser.ConfigParser()
				274	config.read_file(f)
				275
				276	if args.network is None:
				277	parser.error("the following argument is required: NETWORK")
				278
				279	sys.setrecursionlimit(args.recursion_limit)
				280
				281	if args.force_block_config:
				282	force_block_config = architecture_features.Block.from_string(args.force_block_config)
				283	else:
				284	force_block_config = None
				285
				286	arch = architecture_features.ArchitectureFeatures(
				287	vela_config=config,
				288	system_config=args.system_config,
				289	accelerator_config=args.accelerator_config,
				290	permanent_storage=args.permanent_storage,
				291	inter_pass_cycle_delay=args.inter_pass_cycle_delay,
				292	dram_bandwidth=args.dram_bandwidth,
				293	override_block_config=force_block_config,
				294	block_config_limit=args.block_config_limit,
				295	global_memory_clock_scale=args.global_memory_clock_scale,
				296	max_blockdep=args.max_block_dependency,
				297	)
				298
				299	compiler_options = compiler_driver.CompilerOptions(
				300	verbose_graph=args.verbose_graph,
				301	verbose_quantization=args.verbose_quantization,
				302	verbose_packing=args.verbose_packing,
				303	verbose_tensor_purpose=args.verbose_tensor_purpose,
				304	verbose_tensor_format=args.verbose_tensor_format,
				305	verbose_allocation=args.verbose_allocation,
				306	verbose_high_level_command_stream=args.verbose_high_level_command_stream,
				307	verbose_register_command_stream=args.verbose_register_command_stream,
				308	verbose_operators=args.verbose_operators,
				309	show_minimum_possible_allocation=args.show_minimum_possible_allocation,
				310	show_cpu_operations=args.show_cpu_operations,
				311	tensor_allocator=args.tensor_allocator,
				312	timing=args.timing,
				313	output_dir=args.output_dir,
				314	)
				315
				316	scheduler_options = scheduler.SchedulerOptions(
				317	use_cascading=args.cascading,
				318	use_ifm_ofm_overlap=args.ifm_ofm_overlap,
				319	verbose_schedule=args.verbose_schedule,
				320	verbose_pareto_frontier_schedules=args.verbose_pareto_frontier_schedules,
				321	use_ifm_streaming=args.ifm_streaming,
				322	pareto_metric=args.pareto_metric,
				323	)
				324
				325	model_reader_options = model_reader.ModelReaderOptions(batch_size=args.batch_size)
				326
				327	os.makedirs(args.output_dir, exist_ok=True)
				328
				329	nng = process(args.network, arch, model_reader_options, compiler_options, scheduler_options)
				330
				331	if args.show_subgraph_io_summary:
				332	print_subgraph_io_summary(nng)
				333
				334	return 0