ethosu/vela/stats_writer.py - ml/ethos-u/ethos-u-vela - Gitiles

 # Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # Description:
 # Writes out per-pass and summary performance statistics to CSV files.
 import csv
 import sys

 import numpy as np

 from .nn_graph import PassPlacement
 from .npu_performance import BandwidthDirection
 from .npu_performance import MacCount
 from .npu_performance import PassCycles
 from .numeric_util import round_up_to_int
 from .operation import Op
 from .tensor import MemArea
 from .tensor import TensorPurpose


 def mem_areas_to_report():
     """Return the memory areas that are included in the statistics output.

     Every member of ``MemArea.all()`` is kept, in its original order, except
     ``MemArea.Shram``.
     """
     reported = []
     for candidate in MemArea.all():
         # SHRAM is left out, as the SHRAM performance numbers only cover LUT usage
         if candidate != MemArea.Shram:
             reported.append(candidate)
     return reported


 def write_summary_metrics_csv(nng, summary_filename, arch):
     """Write a two-row CSV file (header, then values) summarising the network.

     Args:
         nng: Network graph object. This function reads its ``name``,
             ``cycles``, ``macs``, ``bandwidths``, ``memory_used``,
             ``bits_per_element``, ``batch_size`` and ``subgraphs`` attributes.
         summary_filename: Path of the CSV file to create (overwritten if it
             already exists).
         arch: Architecture object providing ``accelerator_config``,
             ``system_config``, ``memory_mode``, ``core_clock``, ``sram_size``,
             ``memory_bandwidths_per_second`` and ``tensor_storage_mem_area``.
     """
     # csv.writer emits its own line terminators, so the file is opened with
     # newline="" as the csv documentation requires; without it an extra "\r"
     # is written on platforms that translate "\n" to "\r\n".
     with open(summary_filename, "w", newline="") as f:
         writer = csv.writer(f)
         mem_areas = mem_areas_to_report()

         # ---- header row ----
         labels = ["experiment", "network"]
         labels += ["accelerator_configuration", "system_config", "memory_mode", "core_clock", "sram_size"]
         labels += [area.identifier_name() + "_bandwidth" for area in mem_areas]
         labels += ["weights_storage_area", "feature_map_storage_area"]
         labels += [
             "inferences_per_second",
             "batch_size",
             "inference_time",
             "passes_before_fusing",
             "passes_after_fusing",
         ]
         labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
         labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]

         for mem_area in mem_areas:
             prefix = mem_area.identifier_name()
             labels += [
                 prefix + "_feature_map_read_bytes",
                 prefix + "_feature_map_write_bytes",
                 prefix + "_weight_read_bytes",
                 prefix + "_weight_write_bytes",
                 prefix + "_total_bytes",
             ]

         labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
         labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]

         writer.writerow(labels)

         # ---- data row (same column order as the header) ----
         data_items = ["default", nng.name]

         # NOTE(review): arch is dereferenced unconditionally a few lines below
         # (arch.core_clock), so a falsy arch is not actually usable here and
         # would also leave the row misaligned with the header. The original
         # guard is kept unchanged.
         if arch:
             data_items += [
                 arch.accelerator_config.name,
                 arch.system_config,
                 arch.memory_mode,
                 arch.core_clock,
                 arch.sram_size / 1024,
             ]
             # Bandwidths are scaled down by 1e9 before being written.
             data_items += [
                 arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas
             ]
             data_items += [
                 arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
                 arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
             ]

         midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.core_clock
         if midpoint_inference_time > 0:
             midpoint_fps = 1 / midpoint_inference_time
         else:
             # No cycles recorded: a rate cannot be derived, so report NaN.
             midpoint_fps = np.nan

         n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
         n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

         data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
         # Areas without a recorded usage are written as 0.
         data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]

         data_items += [
             nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
             nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
         ]

         for mem_area in mem_areas:
             bws = nng.bandwidths[mem_area]
             weight_bws = bws[TensorPurpose.Weights]
             fm_bws = bws[TensorPurpose.FeatureMap]
             data_items += [
                 fm_bws[BandwidthDirection.Read],
                 fm_bws[BandwidthDirection.Write],
                 weight_bws[BandwidthDirection.Read],
                 weight_bws[BandwidthDirection.Write],
                 np.sum(bws),
             ]

         nn_macs = nng.macs[MacCount.NeuralNetworkMacs]
         hw_macs = nng.macs[MacCount.HardwareMacs]
         data_items += [
             nn_macs,
             hw_macs,
             # Each MAC is counted as two operations when deriving the Tops figures.
             nn_macs * 2 * midpoint_fps / 1e12,
             hw_macs * 2 * midpoint_fps / 1e12,
         ]

         data_items += [nng.cycles[kind] for kind in PassCycles.all()]

         writer.writerow(data_items)


 def write_pass_metrics_csv(nng, pass_filename):
     """Write a CSV file with one row of cycle, MAC and bandwidth data per pass.

     Args:
         nng: Network graph object; traversal starts at
             ``nng.get_root_subgraph()``.
         pass_filename: Path of the CSV file to create (overwritten if it
             already exists).
     """
     # csv.writer emits its own line terminators, so the file is opened with
     # newline="" as the csv documentation requires; without it an extra "\r"
     # is written on platforms that translate "\n" to "\r\n".
     with open(pass_filename, "w", newline="") as f:
         writer = csv.writer(f)

         purpose_list = (
             ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
             ("weights", (TensorPurpose.Weights,)),
             ("feature_map", (TensorPurpose.FeatureMap,)),
         )

         direction_list = (
             ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
             ("read", (BandwidthDirection.Read,)),
             ("write", (BandwidthDirection.Write,)),
         )

         # One column per (memory area, purpose group, direction group). Each
         # entry of bandwidth_indices records which bandwidth cells are summed
         # to fill the column of the same position in bandwidth_names.
         bandwidth_names = []
         bandwidth_indices = []
         for mem_area in mem_areas_to_report():
             for purpose, purpose_candidates in purpose_list:
                 for direction, direction_candidates in direction_list:
                     bandwidth_names.append(
                         "bytes_{}_{}_{}".format(mem_area.identifier_name(), purpose, direction)
                     )
                     bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

         all_macs = MacCount.all()
         all_cycles = (
             PassCycles.Total,
             PassCycles.Npu,
             PassCycles.SramAccess,
             PassCycles.DramAccess,
             PassCycles.OnChipFlashAccess,
             PassCycles.OffChipFlashAccess,
         )
         writer.writerow(
             [
                 "name",
                 "operators",
                 "placement",
                 "streaming_strategy",
                 "block_config_height",
                 "block_config_width",
                 "block_config_input_channels",
                 "block_config_output_channels",
                 "n_blocks_in_pass",
             ]
             + ["cycles_" + v.identifier_name() for v in all_cycles]
             + [v.identifier_name() for v in all_macs]
             + bandwidth_names
             + ["sram_used"]
         )

         def write_subgraph(sg):
             """Write the rows for ``sg``, recursing into custom NPU op subgraphs."""
             for cps in sg.cascaded_passes:
                 if cps.placement == PassPlacement.StartupInit:
                     continue  # skip the dummy init pass

                 for ps in cps.passes:
                     if len(ps.ops) == 1 and ps.ops[0].type == Op.CustomNpuOp:
                         # just treat this as a call, unroll it
                         write_subgraph(ps.ops[0].attrs["subgraph"])
                         continue

                     stats = [ps.name, " ".join(op.type.name for op in ps.ops)]
                     stats += [ps.placement.name, cps.strategy.name]
                     # presumably four entries matching the block_config_* columns - TODO confirm
                     stats += list(ps.block_config)
                     stats += [ps.n_blocks]
                     stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
                     stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
                     for mem_area, purposes, directions in bandwidth_indices:
                         # Each cell is rounded up individually before summing.
                         stats.append(
                             sum(
                                 round_up_to_int(ps.bandwidths[mem_area, purpose, direction])
                                 for purpose in purposes
                                 for direction in directions
                             )
                         )
                     # A pass without an sram_used attribute is reported as 0.
                     stats.append(getattr(ps, "sram_used", 0))

                     writer.writerow(stats)

         write_subgraph(nng.get_root_subgraph())


 def print_performance_metrics_for_strat(
     arch,
     name,
     cycles,
     macs,
     bandwidths,
     batch_size,
     memory_used,
     num_passes,
     num_cascaded_passes,
     n_operations=0,
     cpu_operations=None,
     bits_per_element=None,
     show_cpu_operations=False,
     f=sys.stdout,
 ):
     """Print a human-readable performance report to ``f``.

     Args:
         arch: Architecture object; its ``accelerator_config``,
             ``system_config``, ``memory_mode``, ``core_clock`` and
             ``memory_bandwidths_per_second`` are read.
         name: Network name. When truthy, a "Network summary for" heading is
             printed first.
         cycles: Cycle counts, indexed by ``PassCycles`` members.
         macs: MAC counts, indexed by ``MacCount`` members.
         bandwidths: Bandwidth figures, indexed by memory area, then
             ``TensorPurpose``, then ``BandwidthDirection``.
         batch_size: Batch size, used for the per-inference bandwidth line and
             the final inference-time line.
         memory_used: Mapping from memory area to the amount used (printed in
             KiB after dividing by 1024).
         num_passes: Number of passes before fusing.
         num_cascaded_passes: Number of passes after fusing.
         n_operations: Total operation count; the CPU fallback line is only
             printed when this is greater than zero.
         cpu_operations: Operations that fall back to the CPU. ``None`` (the
             default) is treated as an empty list.
         bits_per_element: Optional mapping from memory area to bits per
             element; shown next to the two flash areas when given.
         show_cpu_operations: When true, list every CPU operation with the
             shapes of its input and output tensors.
         f: Text stream the report is printed to.
     """
     # None stands in for "no CPU operations"; avoids a shared mutable default.
     if cpu_operations is None:
         cpu_operations = []

     def format_tens_list(lst):
         """Render the shapes of the tensors in ``lst`` as space-separated lists."""
         return " ".join(str(list(tens.shape)) for tens in lst)

     orig_mem_areas_labels = [(v, v.display_name()) for v in mem_areas_to_report()]

     midpoint_inference_time = cycles[PassCycles.Total] / arch.core_clock
     if midpoint_inference_time > 0:
         midpoint_fps = 1 / midpoint_inference_time
     else:
         # No cycles recorded: a rate cannot be derived, so report NaN.
         midpoint_fps = np.nan

     # Only memory areas with a non-zero bandwidth total are reported.
     mem_area_labels = [
         (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
     ]

     if name:
         print("", file=f)
         print("Network summary for", name, file=f)
     print("Accelerator configuration        {:>20}".format(arch.accelerator_config.name), file=f)
     print("System configuration             {:>20}".format(arch.system_config), file=f)
     print("Memory mode                      {:>20}".format(arch.memory_mode), file=f)
     print("Accelerator clock                        {:12d} MHz".format(int(arch.core_clock / 1e6)), file=f)
     for mem_area, label in mem_area_labels:
         print(
             "Design peak {:25}    {:12.2f} GB/s".format(
                 label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000
             ),
             file=f,
         )
     print(file=f)

     for mem_area, label in mem_area_labels:
         if mem_area not in memory_used:
             continue

         aug_label = label + " used"

         extra = ""
         if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
             extra = " ({:.2f} bits per element)".format(bits_per_element[mem_area])

         print("Total {:25}          {:12.2f} KiB{}".format(aug_label, memory_used[mem_area] / 1024.0, extra), file=f)

     print(file=f)
     print("{:d} passes fused into {:d}".format(num_passes, num_cascaded_passes), file=f)

     n_cpu_operations = len(cpu_operations)
     if n_operations > 0:
         # The "%" presentation type already multiplies by 100, so the plain
         # ratio is passed; pre-scaling it printed values 100 times too large.
         print(
             "{:d}/{:d} ({:4.1%}) operations falling back to the CPU".format(
                 n_cpu_operations, n_operations, n_cpu_operations / n_operations
             ),
             file=f,
         )

     if show_cpu_operations:
         for op in cpu_operations:
             print(
                 "CPU operation: {} inputs {}, outputs {}".format(
                     op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)
                 ),
                 file=f,
             )

         print("", file=f)

     for mem_area, label in mem_area_labels:
         bws = bandwidths[mem_area]
         total_bw = np.sum(bws)
         weight_bws = bws[TensorPurpose.Weights]
         fm_bws = bws[TensorPurpose.FeatureMap]
         aug_label = label + " bandwidth"
         print(
             "Average {:25}        {:12.2f} GB/s".format(aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0),
             file=f,
         )
         print(
             "Input   {:25}        {:12.2f} MB/batch".format(
                 aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0
             ),
             file=f,
         )
         print("Weight  {:25}        {:12.2f} MB/batch".format(aug_label, np.sum(weight_bws) / 1000.0 / 1000.0), file=f)
         print(
             "Output  {:25}        {:12.2f} MB/batch".format(
                 aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0
             ),
             file=f,
         )
         print("Total   {:25}        {:12.2f} MB/batch".format(aug_label, total_bw / 1000.0 / 1000.0), file=f)
         print(
             "Total   {:25} per input {:9.2f} MB/inference (batch size {:d})".format(
                 aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size
             ),
             file=f,
         )
         print(file=f)

     print(
         "Neural network macs                      {:12d} MACs/batch".format(int(macs[MacCount.NeuralNetworkMacs])),
         file=f,
     )
     print("Hardware macs                            {:12d} MACs/batch".format(int(macs[MacCount.HardwareMacs])), file=f)
     # Each MAC is counted as two operations when deriving the Tops/s figures.
     print(
         "Network Tops/s                           {:12.2f} Tops/s".format(
             macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12
         ),
         file=f,
     )
     print(
         "Hardware Tops/s                          {:12.2f} Tops/s".format(
             macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12
         ),
         file=f,
     )
     print(file=f)

     for kind in PassCycles.all():
         aug_label = kind.display_name() + " cycles"
         cyc = cycles[kind]
         print("{:30}           {:12d} cycles/batch".format(aug_label, int(cyc)), file=f)
     print(file=f)

     print(
         "Batch Inference time              {:7.2f} ms, {:7.2f} inferences/s (batch size {:d})".format(
             midpoint_inference_time * 1000, midpoint_fps, batch_size
         ),
         file=f,
     )
     print(file=f)


 def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
     """Collect network-wide totals from ``nng`` and print the performance report.

     The pass count, cascaded pass count, operation count and the list of
     operations in CPU-placed passes are gathered over all subgraphs and
     forwarded, together with the statistics stored on ``nng``, to
     ``print_performance_metrics_for_strat``. Its return value is returned.
     """
     pass_total = 0
     cascaded_total = 0
     op_total = 0
     cpu_ops = []
     for subgraph in nng.subgraphs:
         pass_total += len(subgraph.passes)
         cascaded_total += len(subgraph.cascaded_passes)
         for one_pass in subgraph.passes:
             op_total += len(one_pass.ops)
             if one_pass.placement == PassPlacement.Cpu:
                 cpu_ops.extend(one_pass.ops)

     return print_performance_metrics_for_strat(
         arch,
         nng.name,
         nng.cycles,
         nng.macs,
         nng.bandwidths,
         nng.batch_size,
         nng.memory_used,
         pass_total,
         cascaded_total,
         n_operations=op_total,
         cpu_operations=cpu_ops,
         bits_per_element=nng.bits_per_element,
         show_cpu_operations=show_cpu_operations,
         f=f,
     )


 def write_human_friendly_metrics(nng, arch, filename):
     """Write the human-readable performance report for ``nng`` to ``filename``.

     Args:
         nng: Network graph object passed on to ``print_performance_metrics``.
         arch: Architecture object passed on to ``print_performance_metrics``.
         filename: Path of the text file to create (overwritten if it already
             exists).
     """
     # The context manager makes sure the report is flushed and the handle is
     # closed, even if printing raises; the handle was previously left open.
     with open(filename, "w") as f:
         print_performance_metrics(nng, arch, f=f)
	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
	#
	# SPDX-License-Identifier: Apache-2.0
	#
	# Licensed under the Apache License, Version 2.0 (the License); you may
	# not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	# Description:
	# Writes out per-pass and summary performance statistics to CSV files.
	import csv
	import sys

	import numpy as np

	from .nn_graph import PassPlacement
	from .npu_performance import BandwidthDirection
	from .npu_performance import MacCount
	from .npu_performance import PassCycles
	from .numeric_util import round_up_to_int
	from .operation import Op
	from .tensor import MemArea
	from .tensor import TensorPurpose


	def mem_areas_to_report():
	    """Return the memory areas that are included in the statistics output.

	    Every member of ``MemArea.all()`` is kept, in its original order, except
	    ``MemArea.Shram``.
	    """
	    reported = []
	    for candidate in MemArea.all():
	        # SHRAM is left out, as the SHRAM performance numbers only cover LUT usage
	        if candidate != MemArea.Shram:
	            reported.append(candidate)
	    return reported


	def write_summary_metrics_csv(nng, summary_filename, arch):
	    """Write a two-row CSV file (header, then values) summarising the network.

	    Args:
	        nng: Network graph object. This function reads its ``name``,
	            ``cycles``, ``macs``, ``bandwidths``, ``memory_used``,
	            ``bits_per_element``, ``batch_size`` and ``subgraphs`` attributes.
	        summary_filename: Path of the CSV file to create (overwritten if it
	            already exists).
	        arch: Architecture object providing ``accelerator_config``,
	            ``system_config``, ``memory_mode``, ``core_clock``, ``sram_size``,
	            ``memory_bandwidths_per_second`` and ``tensor_storage_mem_area``.
	    """
	    # csv.writer emits its own line terminators, so the file is opened with
	    # newline="" as the csv documentation requires; without it an extra "\r"
	    # is written on platforms that translate "\n" to "\r\n".
	    with open(summary_filename, "w", newline="") as f:
	        writer = csv.writer(f)
	        mem_areas = mem_areas_to_report()

	        # ---- header row ----
	        labels = ["experiment", "network"]
	        labels += ["accelerator_configuration", "system_config", "memory_mode", "core_clock", "sram_size"]
	        labels += [area.identifier_name() + "_bandwidth" for area in mem_areas]
	        labels += ["weights_storage_area", "feature_map_storage_area"]
	        labels += [
	            "inferences_per_second",
	            "batch_size",
	            "inference_time",
	            "passes_before_fusing",
	            "passes_after_fusing",
	        ]
	        labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
	        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]

	        for mem_area in mem_areas:
	            prefix = mem_area.identifier_name()
	            labels += [
	                prefix + "_feature_map_read_bytes",
	                prefix + "_feature_map_write_bytes",
	                prefix + "_weight_read_bytes",
	                prefix + "_weight_write_bytes",
	                prefix + "_total_bytes",
	            ]

	        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
	        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]

	        writer.writerow(labels)

	        # ---- data row (same column order as the header) ----
	        data_items = ["default", nng.name]

	        # NOTE(review): arch is dereferenced unconditionally a few lines below
	        # (arch.core_clock), so a falsy arch is not actually usable here and
	        # would also leave the row misaligned with the header. The original
	        # guard is kept unchanged.
	        if arch:
	            data_items += [
	                arch.accelerator_config.name,
	                arch.system_config,
	                arch.memory_mode,
	                arch.core_clock,
	                arch.sram_size / 1024,
	            ]
	            # Bandwidths are scaled down by 1e9 before being written.
	            data_items += [
	                arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas
	            ]
	            data_items += [
	                arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
	                arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
	            ]

	        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.core_clock
	        if midpoint_inference_time > 0:
	            midpoint_fps = 1 / midpoint_inference_time
	        else:
	            # No cycles recorded: a rate cannot be derived, so report NaN.
	            midpoint_fps = np.nan

	        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
	        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

	        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
	        # Areas without a recorded usage are written as 0.
	        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]

	        data_items += [
	            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
	            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
	        ]

	        for mem_area in mem_areas:
	            bws = nng.bandwidths[mem_area]
	            weight_bws = bws[TensorPurpose.Weights]
	            fm_bws = bws[TensorPurpose.FeatureMap]
	            data_items += [
	                fm_bws[BandwidthDirection.Read],
	                fm_bws[BandwidthDirection.Write],
	                weight_bws[BandwidthDirection.Read],
	                weight_bws[BandwidthDirection.Write],
	                np.sum(bws),
	            ]

	        nn_macs = nng.macs[MacCount.NeuralNetworkMacs]
	        hw_macs = nng.macs[MacCount.HardwareMacs]
	        data_items += [
	            nn_macs,
	            hw_macs,
	            # Each MAC is counted as two operations when deriving the Tops figures.
	            nn_macs * 2 * midpoint_fps / 1e12,
	            hw_macs * 2 * midpoint_fps / 1e12,
	        ]

	        data_items += [nng.cycles[kind] for kind in PassCycles.all()]

	        writer.writerow(data_items)


	def write_pass_metrics_csv(nng, pass_filename):
	    """Write a CSV file with one row of cycle, MAC and bandwidth data per pass.

	    Args:
	        nng: Network graph object; traversal starts at
	            ``nng.get_root_subgraph()``.
	        pass_filename: Path of the CSV file to create (overwritten if it
	            already exists).
	    """
	    # csv.writer emits its own line terminators, so the file is opened with
	    # newline="" as the csv documentation requires; without it an extra "\r"
	    # is written on platforms that translate "\n" to "\r\n".
	    with open(pass_filename, "w", newline="") as f:
	        writer = csv.writer(f)

	        purpose_list = (
	            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
	            ("weights", (TensorPurpose.Weights,)),
	            ("feature_map", (TensorPurpose.FeatureMap,)),
	        )

	        direction_list = (
	            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
	            ("read", (BandwidthDirection.Read,)),
	            ("write", (BandwidthDirection.Write,)),
	        )

	        # One column per (memory area, purpose group, direction group). Each
	        # entry of bandwidth_indices records which bandwidth cells are summed
	        # to fill the column of the same position in bandwidth_names.
	        bandwidth_names = []
	        bandwidth_indices = []
	        for mem_area in mem_areas_to_report():
	            for purpose, purpose_candidates in purpose_list:
	                for direction, direction_candidates in direction_list:
	                    bandwidth_names.append(
	                        "bytes_{}_{}_{}".format(mem_area.identifier_name(), purpose, direction)
	                    )
	                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

	        all_macs = MacCount.all()
	        all_cycles = (
	            PassCycles.Total,
	            PassCycles.Npu,
	            PassCycles.SramAccess,
	            PassCycles.DramAccess,
	            PassCycles.OnChipFlashAccess,
	            PassCycles.OffChipFlashAccess,
	        )
	        writer.writerow(
	            [
	                "name",
	                "operators",
	                "placement",
	                "streaming_strategy",
	                "block_config_height",
	                "block_config_width",
	                "block_config_input_channels",
	                "block_config_output_channels",
	                "n_blocks_in_pass",
	            ]
	            + ["cycles_" + v.identifier_name() for v in all_cycles]
	            + [v.identifier_name() for v in all_macs]
	            + bandwidth_names
	            + ["sram_used"]
	        )

	        def write_subgraph(sg):
	            """Write the rows for ``sg``, recursing into custom NPU op subgraphs."""
	            for cps in sg.cascaded_passes:
	                if cps.placement == PassPlacement.StartupInit:
	                    continue  # skip the dummy init pass

	                for ps in cps.passes:
	                    if len(ps.ops) == 1 and ps.ops[0].type == Op.CustomNpuOp:
	                        # just treat this as a call, unroll it
	                        write_subgraph(ps.ops[0].attrs["subgraph"])
	                        continue

	                    stats = [ps.name, " ".join(op.type.name for op in ps.ops)]
	                    stats += [ps.placement.name, cps.strategy.name]
	                    # presumably four entries matching the block_config_* columns - TODO confirm
	                    stats += list(ps.block_config)
	                    stats += [ps.n_blocks]
	                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
	                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
	                    for mem_area, purposes, directions in bandwidth_indices:
	                        # Each cell is rounded up individually before summing.
	                        stats.append(
	                            sum(
	                                round_up_to_int(ps.bandwidths[mem_area, purpose, direction])
	                                for purpose in purposes
	                                for direction in directions
	                            )
	                        )
	                    # A pass without an sram_used attribute is reported as 0.
	                    stats.append(getattr(ps, "sram_used", 0))

	                    writer.writerow(stats)

	        write_subgraph(nng.get_root_subgraph())


	def print_performance_metrics_for_strat(
	    arch,
	    name,
	    cycles,
	    macs,
	    bandwidths,
	    batch_size,
	    memory_used,
	    num_passes,
	    num_cascaded_passes,
	    n_operations=0,
	    cpu_operations=None,
	    bits_per_element=None,
	    show_cpu_operations=False,
	    f=sys.stdout,
	):
	    """Print a human-readable performance report to ``f``.

	    Args:
	        arch: Architecture object; its ``accelerator_config``,
	            ``system_config``, ``memory_mode``, ``core_clock`` and
	            ``memory_bandwidths_per_second`` are read.
	        name: Network name. When truthy, a "Network summary for" heading is
	            printed first.
	        cycles: Cycle counts, indexed by ``PassCycles`` members.
	        macs: MAC counts, indexed by ``MacCount`` members.
	        bandwidths: Bandwidth figures, indexed by memory area, then
	            ``TensorPurpose``, then ``BandwidthDirection``.
	        batch_size: Batch size, used for the per-inference bandwidth line and
	            the final inference-time line.
	        memory_used: Mapping from memory area to the amount used (printed in
	            KiB after dividing by 1024).
	        num_passes: Number of passes before fusing.
	        num_cascaded_passes: Number of passes after fusing.
	        n_operations: Total operation count; the CPU fallback line is only
	            printed when this is greater than zero.
	        cpu_operations: Operations that fall back to the CPU. ``None`` (the
	            default) is treated as an empty list.
	        bits_per_element: Optional mapping from memory area to bits per
	            element; shown next to the two flash areas when given.
	        show_cpu_operations: When true, list every CPU operation with the
	            shapes of its input and output tensors.
	        f: Text stream the report is printed to.
	    """
	    # None stands in for "no CPU operations"; avoids a shared mutable default.
	    if cpu_operations is None:
	        cpu_operations = []

	    def format_tens_list(lst):
	        """Render the shapes of the tensors in ``lst`` as space-separated lists."""
	        return " ".join(str(list(tens.shape)) for tens in lst)

	    orig_mem_areas_labels = [(v, v.display_name()) for v in mem_areas_to_report()]

	    midpoint_inference_time = cycles[PassCycles.Total] / arch.core_clock
	    if midpoint_inference_time > 0:
	        midpoint_fps = 1 / midpoint_inference_time
	    else:
	        # No cycles recorded: a rate cannot be derived, so report NaN.
	        midpoint_fps = np.nan

	    # Only memory areas with a non-zero bandwidth total are reported.
	    mem_area_labels = [
	        (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
	    ]

	    if name:
	        print("", file=f)
	        print("Network summary for", name, file=f)
	    print("Accelerator configuration {:>20}".format(arch.accelerator_config.name), file=f)
	    print("System configuration {:>20}".format(arch.system_config), file=f)
	    print("Memory mode {:>20}".format(arch.memory_mode), file=f)
	    print("Accelerator clock {:12d} MHz".format(int(arch.core_clock / 1e6)), file=f)
	    for mem_area, label in mem_area_labels:
	        print(
	            "Design peak {:25} {:12.2f} GB/s".format(
	                label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000
	            ),
	            file=f,
	        )
	    print(file=f)

	    for mem_area, label in mem_area_labels:
	        if mem_area not in memory_used:
	            continue

	        aug_label = label + " used"

	        extra = ""
	        if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
	            extra = " ({:.2f} bits per element)".format(bits_per_element[mem_area])

	        print("Total {:25} {:12.2f} KiB{}".format(aug_label, memory_used[mem_area] / 1024.0, extra), file=f)

	    print(file=f)
	    print("{:d} passes fused into {:d}".format(num_passes, num_cascaded_passes), file=f)

	    n_cpu_operations = len(cpu_operations)
	    if n_operations > 0:
	        # The "%" presentation type already multiplies by 100, so the plain
	        # ratio is passed; pre-scaling it printed values 100 times too large.
	        print(
	            "{:d}/{:d} ({:4.1%}) operations falling back to the CPU".format(
	                n_cpu_operations, n_operations, n_cpu_operations / n_operations
	            ),
	            file=f,
	        )

	    if show_cpu_operations:
	        for op in cpu_operations:
	            print(
	                "CPU operation: {} inputs {}, outputs {}".format(
	                    op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)
	                ),
	                file=f,
	            )

	        print("", file=f)

	    for mem_area, label in mem_area_labels:
	        bws = bandwidths[mem_area]
	        total_bw = np.sum(bws)
	        weight_bws = bws[TensorPurpose.Weights]
	        fm_bws = bws[TensorPurpose.FeatureMap]
	        aug_label = label + " bandwidth"
	        print(
	            "Average {:25} {:12.2f} GB/s".format(aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0),
	            file=f,
	        )
	        print(
	            "Input {:25} {:12.2f} MB/batch".format(
	                aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0
	            ),
	            file=f,
	        )
	        print("Weight {:25} {:12.2f} MB/batch".format(aug_label, np.sum(weight_bws) / 1000.0 / 1000.0), file=f)
	        print(
	            "Output {:25} {:12.2f} MB/batch".format(
	                aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0
	            ),
	            file=f,
	        )
	        print("Total {:25} {:12.2f} MB/batch".format(aug_label, total_bw / 1000.0 / 1000.0), file=f)
	        print(
	            "Total {:25} per input {:9.2f} MB/inference (batch size {:d})".format(
	                aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size
	            ),
	            file=f,
	        )
	        print(file=f)

	    print(
	        "Neural network macs {:12d} MACs/batch".format(int(macs[MacCount.NeuralNetworkMacs])),
	        file=f,
	    )
	    print("Hardware macs {:12d} MACs/batch".format(int(macs[MacCount.HardwareMacs])), file=f)
	    # Each MAC is counted as two operations when deriving the Tops/s figures.
	    print(
	        "Network Tops/s {:12.2f} Tops/s".format(
	            macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12
	        ),
	        file=f,
	    )
	    print(
	        "Hardware Tops/s {:12.2f} Tops/s".format(
	            macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12
	        ),
	        file=f,
	    )
	    print(file=f)

	    for kind in PassCycles.all():
	        aug_label = kind.display_name() + " cycles"
	        cyc = cycles[kind]
	        print("{:30} {:12d} cycles/batch".format(aug_label, int(cyc)), file=f)
	    print(file=f)

	    print(
	        "Batch Inference time {:7.2f} ms, {:7.2f} inferences/s (batch size {:d})".format(
	            midpoint_inference_time * 1000, midpoint_fps, batch_size
	        ),
	        file=f,
	    )
	    print(file=f)


	def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
	    """Collect network-wide totals from ``nng`` and print the performance report.

	    The pass count, cascaded pass count, operation count and the list of
	    operations in CPU-placed passes are gathered over all subgraphs and
	    forwarded, together with the statistics stored on ``nng``, to
	    ``print_performance_metrics_for_strat``. Its return value is returned.
	    """
	    pass_total = 0
	    cascaded_total = 0
	    op_total = 0
	    cpu_ops = []
	    for subgraph in nng.subgraphs:
	        pass_total += len(subgraph.passes)
	        cascaded_total += len(subgraph.cascaded_passes)
	        for one_pass in subgraph.passes:
	            op_total += len(one_pass.ops)
	            if one_pass.placement == PassPlacement.Cpu:
	                cpu_ops.extend(one_pass.ops)

	    return print_performance_metrics_for_strat(
	        arch,
	        nng.name,
	        nng.cycles,
	        nng.macs,
	        nng.bandwidths,
	        nng.batch_size,
	        nng.memory_used,
	        pass_total,
	        cascaded_total,
	        n_operations=op_total,
	        cpu_operations=cpu_ops,
	        bits_per_element=nng.bits_per_element,
	        show_cpu_operations=show_cpu_operations,
	        f=f,
	    )


	def write_human_friendly_metrics(nng, arch, filename):
	    """Write the human-readable performance report for ``nng`` to ``filename``.

	    Args:
	        nng: Network graph object passed on to ``print_performance_metrics``.
	        arch: Architecture object passed on to ``print_performance_metrics``.
	        filename: Path of the text file to create (overwritten if it already
	            exists).
	    """
	    # The context manager makes sure the report is flushed and the handle is
	    # closed, even if printing raises; the handle was previously left open.
	    with open(filename, "w") as f:
	        print_performance_metrics(nng, arch, f=f)