Blame - ethosu/vela/stats_writer.py - ml/ethos-u/ethos-u-vela

blob: 3fd29d127bc9702027d1f115f64c7cbc359a675f [file] [log] [blame]

Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16
				17
				18	# Description:
				19	# Writes out per-pass and summary performance statistics to CSV files.
				20
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	21	import csv
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	22	import sys
				23
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame^]	24	import numpy as np
				25
				26	from .tensor import MemArea, TensorPurpose
				27	from .nn_graph import PassPlacement
				28	from .npu_performance import PassCycles, MacCount, BandwidthDirection
				29	from .numeric_util import round_up_to_int
				30
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	31
				32	def write_summary_metrics_csv(nng, summary_filename, arch):
				33	with open(summary_filename, "w") as f:
				34	writer = csv.writer(f)
				35
				36	labels = [
				37	"experiment",
				38	"network",
				39	]
				40
				41	labels += (
				42	["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
				43	+ [area.identifier_name() + "_bandwidth" for area in MemArea.all()]
				44	+ ["weights_storage_area", "feature_map_storage_area"]
				45	)
				46
				47	labels += [
				48	"inferences_per_second",
				49	"batch_size",
				50	"inference_time",
				51	"passes_before_fusing",
				52	"passes_after_fusing",
				53	]
				54	labels += [area.identifier_name() + "_memory_used" for area in MemArea.all()]
				55	labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]
				56
				57	for mem_area in MemArea.all():
				58	labels += [
				59	mem_area.identifier_name() + "_feature_map_read_bytes",
				60	mem_area.identifier_name() + "_feature_map_write_bytes",
				61	mem_area.identifier_name() + "_weight_read_bytes",
				62	mem_area.identifier_name() + "_weight_write_bytes",
				63	mem_area.identifier_name() + "_total_bytes",
				64	]
				65
				66	labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
				67
				68	labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]
				69
				70	writer.writerow(labels)
				71
				72	data_items = [
				73	"default",
				74	nng.name,
				75	]
				76
				77	if arch:
				78	data_items += (
				79	[arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
				80	+ [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in MemArea.all()]
				81	+ [
				82	arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
				83	arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
				84	]
				85	)
				86
				87	midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
				88	midpoint_fps = 1 / midpoint_inference_time
				89
				90	n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
				91	n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
				92
				93	data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
				94	data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in MemArea.all()]
				95
				96	data_items += [
				97	nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
				98	nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
				99	]
				100
				101	for mem_area in MemArea.all():
				102	bws = nng.bandwidths[mem_area]
				103	total_bw = np.sum(bws)
				104	weight_bws = bws[TensorPurpose.Weights]
				105	fm_bws = bws[TensorPurpose.FeatureMap]
				106	data_items += [
				107	fm_bws[BandwidthDirection.Read],
				108	fm_bws[BandwidthDirection.Write],
				109	weight_bws[BandwidthDirection.Read],
				110	weight_bws[BandwidthDirection.Write],
				111	total_bw,
				112	]
				113
				114	data_items += [
				115	nng.macs[MacCount.NeuralNetworkMacs],
				116	nng.macs[MacCount.HardwareMacs],
				117	nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12,
				118	nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12,
				119	]
				120
				121	data_items += [nng.cycles[kind] for kind in PassCycles.all()]
				122
				123	writer.writerow(data_items)
				124
				125
				126	def write_pass_metrics_csv(nng, pass_filename):
				127
				128	with open(pass_filename, "w") as f:
				129	writer = csv.writer(f)
				130
				131	purpose_list = (
				132	("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
				133	("weights", (TensorPurpose.Weights,)),
				134	("feature_map", (TensorPurpose.FeatureMap,)),
				135	)
				136
				137	direction_list = (
				138	("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
				139	("read", (BandwidthDirection.Read,)),
				140	("write", (BandwidthDirection.Write,)),
				141	)
				142	bandwidth_names = []
				143	bandwidth_indices = []
				144	for mem_area in MemArea.all():
				145	for purpose, purpose_candidates in purpose_list:
				146	for direction, direction_candidates in direction_list:
				147	label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
				148	bandwidth_names.append(label)
				149	bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))
				150
				151	all_macs = MacCount.all()
				152	all_cycles = (
				153	PassCycles.Total,
				154	PassCycles.Dpu,
				155	PassCycles.ElementWise,
				156	PassCycles.Cpu,
				157	PassCycles.SramAccess,
				158	PassCycles.DramAccess,
				159	PassCycles.OnChipFlashAccess,
				160	PassCycles.OffChipFlashAccess,
				161	)
				162	writer.writerow(
				163	[
				164	"name",
				165	"operators",
				166	"placement",
				167	"streaming_strategy",
				168	"block_config_height",
				169	"block_config_width",
				170	"block_config_input_channels",
				171	"block_config_output_channels",
				172	"n_blocks_in_pass",
				173	]
				174	+ ["cycles_" + v.identifier_name() for v in all_cycles]
				175	+ [v.identifier_name() for v in all_macs]
				176	+ bandwidth_names
				177	+ ["sram_used"]
				178	)
				179
				180	def write_subgraph(sg):
				181	for cps in sg.cascaded_passes:
				182	if cps.placement == PassPlacement.StartupInit:
				183	continue # skip the dummy init pass
				184
				185	for ps in cps.passes:
				186	if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp":
				187	# just treat this as a call, unroll it
				188	write_subgraph(ps.ops[0].attrs["subgraph"])
				189	continue
				190	stats = [ps.name, " ".join(op.type for op in ps.ops)]
				191	stats += [ps.placement.name]
				192	stats += [cps.strategy.name]
				193	stats += list(ps.block_config)
				194	stats += [ps.n_blocks]
				195	stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
				196	stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
				197	for indices in bandwidth_indices:
				198	res = 0
				199	i = indices[0]
				200	for j in indices[1]:
				201	for k in indices[2]:
				202	res += round_up_to_int(ps.bandwidths[i, j, k])
				203	stats.append(res)
				204	stats += [ps.sram_used]
				205
				206	writer.writerow(stats)
				207
				208	write_subgraph(nng.get_root_subgraph())
				209
				210
				211	def print_performance_metrics_for_strat(
				212	arch,
				213	name,
				214	cycles,
				215	macs,
				216	bandwidths,
				217	batch_size,
				218	memory_used,
				219	num_passes,
				220	num_cascaded_passes,
				221	n_operations=0,
				222	cpu_operations=[],
				223	bits_per_element=None,
				224	show_cpu_operations=False,
				225	f=sys.stdout,
				226	):
				227
				228	orig_mem_areas_labels = [(v, v.display_name()) for v in MemArea.all()]
				229
				230	midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
				231	midpoint_fps = 1 / midpoint_inference_time
				232
				233	mem_area_labels = [
				234	(mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
				235	]
				236
				237	if name:
				238	print("", file=f)
				239	print("Network summary for", name, file=f)
				240	print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f)
				241	print("System configuration %20s" % (arch.system_config,), file=f)
				242	print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f)
				243	for mem_area, label in mem_area_labels:
				244	print(
				245	"Design peak %-25s %12.2f GB/s"
				246	% (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
				247	file=f,
				248	)
				249
				250	print(file=f)
				251	for mem_area, label in mem_area_labels:
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame^]	252	if mem_area not in memory_used:
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	253	continue
				254
				255	aug_label = label + " used"
				256
				257	extra = ""
				258	if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
				259	extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)
				260
				261	print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)
				262
				263	print(file=f)
				264	print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)
				265
				266	n_cpu_operations = len(cpu_operations)
				267	if n_operations > 0:
				268	print(
				269	"%d/%d (%4.1f %%) operations falling back to the CPU"
				270	% (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
				271	file=f,
				272	)
				273
				274	if show_cpu_operations:
				275	for op in cpu_operations:
				276
				277	def format_tens_list(lst):
				278	return " ".join(str(list(tens.shape)) for tens in lst)
				279
				280	print(
				281	"CPU operation: %s, inputs %s, outputs %s"
				282	% (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
				283	file=f,
				284	)
				285
				286	print("", file=f)
				287
				288	for mem_area, label in mem_area_labels:
				289	bws = bandwidths[mem_area]
				290	total_bw = np.sum(bws)
				291	weight_bws = bws[TensorPurpose.Weights]
				292	fm_bws = bws[TensorPurpose.FeatureMap]
				293	aug_label = label + " bandwidth"
				294	print(
				295	"Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
				296	file=f,
				297	)
				298	print(
				299	"Input %-25s %12.2f MB/batch"
				300	% (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
				301	file=f,
				302	)
				303	print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
				304	print(
				305	"Output %-25s %12.2f MB/batch"
				306	% (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
				307	file=f,
				308	)
				309	print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
				310	print(
				311	"Total %-25s per input %9.2f MB/inference (batch size %d)"
				312	% (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
				313	file=f,
				314	)
				315	print(file=f)
				316
				317	print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
				318	print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
				319	print(
				320	"Network Tops/s %12.2f Tops/s"
				321	% (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
				322	file=f,
				323	)
				324	print(
				325	"Hardware Tops/s %12.2f Tops/s"
				326	% (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
				327	file=f,
				328	)
				329	print(file=f)
				330
				331	for kind in PassCycles.all():
				332	aug_label = kind.display_name() + " cycles"
				333	cyc = cycles[kind]
				334	print("%-30s %12d cycles/batch" % (aug_label, cyc,), file=f)
				335	print(file=f)
				336
				337	print(
				338	"Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)"
				339	% (midpoint_inference_time * 1000, midpoint_fps, batch_size),
				340	file=f,
				341	)
				342	print(file=f)
				343
				344
				345	def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
				346	n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
				347	n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
				348	n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
				349	cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
				350	return print_performance_metrics_for_strat(
				351	arch,
				352	nng.name,
				353	nng.cycles,
				354	nng.macs,
				355	nng.bandwidths,
				356	nng.batch_size,
				357	nng.memory_used,
				358	n_passes,
				359	n_cascaded_passes,
				360	n_operations,
				361	cpu_operations,
				362	nng.bits_per_element,
				363	show_cpu_operations,
				364	f,
				365	)
				366
				367
				368	def write_human_friendly_metrics(nng, arch, filename):
				369	f = open(filename, "w")
				370	print_performance_metrics(nng, arch, f=f)