blob: 597fd151653d1d4d0e9def767d9b1689acea5f06 [file] [log] [blame]
# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Writes out per-pass and summary performance statistics to CSV files.
Tim Hall79d07d22020-04-27 18:20:16 +010018import csv
Tim Hall79d07d22020-04-27 18:20:16 +010019import sys
20
Diego Russoea6111a2020-04-14 18:41:58 +010021import numpy as np
22
Diego Russoea6111a2020-04-14 18:41:58 +010023from .nn_graph import PassPlacement
Diego Russoe8a10452020-04-21 17:39:10 +010024from .npu_performance import BandwidthDirection
Diego Russoe8a10452020-04-21 17:39:10 +010025from .npu_performance import PassCycles
Diego Russoea6111a2020-04-14 18:41:58 +010026from .numeric_util import round_up_to_int
Louis Verhaardaee5d752020-09-30 09:01:52 +020027from .operation import Op
Diego Russoe8a10452020-04-21 17:39:10 +010028from .tensor import MemArea
29from .tensor import TensorPurpose
Diego Russoea6111a2020-04-14 18:41:58 +010030
Tim Hall79d07d22020-04-27 18:20:16 +010031
def mem_areas_to_report():
    """Return the memory areas that are included in the generated reports.

    Every area produced by ``MemArea.all()`` is kept, in order, except
    ``MemArea.Shram``: the SHRAM performance numbers only cover LUT usage.
    """
    reported = []
    for candidate in MemArea.all():
        if candidate != MemArea.Shram:
            reported.append(candidate)
    return reported
35
36
def write_summary_metrics_csv(nng, summary_filename, arch):
    """Write the network-wide performance summary to a CSV file.

    The file receives one header row followed by one data row.

    Args:
        nng: Network graph object. This function reads its ``name``,
            ``cycles``, ``subgraphs``, ``batch_size``, ``memory_used``,
            ``weights_compression_ratio``, ``bandwidths`` and ``macs``.
        summary_filename: Path of the CSV file to create or overwrite.
        arch: Architecture description. Supplies the configuration columns
            and ``core_clock``, which is used to derive the inference time.
    """
    # The csv module emits its own line terminators, so the file has to be
    # opened with newline="" as the csv documentation requires; otherwise
    # platforms that translate "\n" end up with "\r\r\n" row endings.
    with open(summary_filename, "w", newline="") as f:
        writer = csv.writer(f)
        mem_areas = mem_areas_to_report()

        # Header row
        labels = ["experiment", "network"]
        labels += ["accelerator_configuration", "system_config", "memory_mode", "core_clock", "sram_size"]
        labels += [area.identifier_name() + "_bandwidth" for area in mem_areas]
        labels += ["weights_storage_area", "feature_map_storage_area"]
        labels += [
            "inferences_per_second",
            "batch_size",
            "inference_time",
            "passes_before_fusing",
            "passes_after_fusing",
        ]
        labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
        labels += ["weights_compression_ratio"]

        for mem_area in mem_areas:
            prefix = mem_area.identifier_name()
            labels += [
                prefix + "_feature_map_read_bytes",
                prefix + "_feature_map_write_bytes",
                prefix + "_weight_read_bytes",
                prefix + "_weight_write_bytes",
                prefix + "_total_bytes",
            ]

        labels += ["nn_macs", "nn_tops"]
        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]

        writer.writerow(labels)

        # Data row; values are appended in the same order as the labels above
        data_items = ["default", nng.name]

        # NOTE(review): this guard does not make `arch` optional, because
        # arch.core_clock is dereferenced unconditionally just below. It is
        # kept to preserve the existing behaviour.
        if arch:
            data_items += [
                arch.accelerator_config.name,
                arch.system_config,
                arch.memory_mode,
                arch.core_clock,
                arch.sram_size / 1024,
            ]
            data_items += [
                arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas
            ]
            data_items += [
                arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
                arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
            ]

        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.core_clock
        # A non-positive inference time has no meaningful rate; report NaN
        if midpoint_inference_time > 0:
            midpoint_fps = 1 / midpoint_inference_time
        else:
            midpoint_fps = np.nan

        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
        # Areas absent from nng.memory_used are reported as zero
        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]
        data_items += [nng.weights_compression_ratio]

        for mem_area in mem_areas:
            bws = nng.bandwidths[mem_area]
            total_bw = np.sum(bws)
            weight_bws = bws[TensorPurpose.Weights]
            fm_bws = bws[TensorPurpose.FeatureMap]
            data_items += [
                fm_bws[BandwidthDirection.Read],
                fm_bws[BandwidthDirection.Write],
                weight_bws[BandwidthDirection.Read],
                weight_bws[BandwidthDirection.Write],
                total_bw,
            ]

        data_items += [
            nng.macs,
            nng.macs * 2 * midpoint_fps / 1e12,
        ]

        data_items += [nng.cycles[kind] for kind in PassCycles.all()]

        writer.writerow(data_items)
133
134
def write_pass_metrics_csv(nng, pass_filename):
    """Write one CSV row of performance statistics per pass.

    Starting from the root subgraph, every pass of every cascaded pass is
    written as a row. Cascaded passes placed as ``PassPlacement.StartupInit``
    are skipped, and a pass consisting of a single ``Op.CustomNpuOp`` is
    replaced by the rows of the subgraph stored in that op's ``attrs``.

    Args:
        nng: Network graph object providing ``get_root_subgraph()``.
        pass_filename: Path of the CSV file to create or overwrite.
    """
    # The csv module emits its own line terminators, so the file has to be
    # opened with newline="" as the csv documentation requires; otherwise
    # platforms that translate "\n" end up with "\r\r\n" row endings.
    with open(pass_filename, "w", newline="") as f:
        writer = csv.writer(f)

        # (column name fragment, tensor purposes summed into that column)
        purpose_list = (
            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
            ("weights", (TensorPurpose.Weights,)),
            ("feature_map", (TensorPurpose.FeatureMap,)),
        )

        # (column name fragment, directions summed into that column)
        direction_list = (
            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
            ("read", (BandwidthDirection.Read,)),
            ("write", (BandwidthDirection.Write,)),
        )

        # bandwidth_names[n] is the header of the column whose value is the
        # sum of the bandwidth entries selected by bandwidth_indices[n]
        bandwidth_names = []
        bandwidth_indices = []
        for mem_area in mem_areas_to_report():
            for purpose, purpose_candidates in purpose_list:
                for direction, direction_candidates in direction_list:
                    label = "bytes_{}_{}_{}".format(mem_area.identifier_name(), purpose, direction)
                    bandwidth_names.append(label)
                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

        all_cycles = (
            PassCycles.Total,
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
        )
        writer.writerow(
            [
                "name",
                "operators",
                "placement",
                "streaming_strategy",
                "block_config_height",
                "block_config_width",
                "block_config_input_channels",
                "block_config_output_channels",
            ]
            + ["cycles_" + v.identifier_name() for v in all_cycles]
            + ["nn_macs"]
            + bandwidth_names
            + ["sram_used"]
        )

        def write_subgraph(sg):
            # Emit the rows for one subgraph, recursing into NPU subgraphs
            for cps in sg.cascaded_passes:
                if cps.placement == PassPlacement.StartupInit:
                    continue  # skip the dummy init pass

                for ps in cps.passes:
                    if len(ps.ops) == 1 and ps.ops[0].type == Op.CustomNpuOp:
                        # just treat this as a call, unroll it
                        write_subgraph(ps.ops[0].attrs["subgraph"])
                        continue

                    stats = [ps.name, " ".join(op.type.name for op in ps.ops)]
                    stats += [ps.placement.name]
                    stats += [cps.strategy.name]
                    stats += list(ps.block_config)
                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
                    stats += [round_up_to_int(ps.macs)]
                    for area, purposes, directions in bandwidth_indices:
                        # Each entry is rounded up individually before summing
                        stats.append(
                            sum(round_up_to_int(ps.bandwidths[area, j, k]) for j in purposes for k in directions)
                        )
                    # Passes without an sram_used attribute are reported as 0
                    stats += [getattr(ps, "sram_used", 0)]

                    writer.writerow(stats)

        write_subgraph(nng.get_root_subgraph())
216
217
def print_performance_metrics_for_strat(
    arch,
    name,
    cycles,
    macs,
    bandwidths,
    batch_size,
    memory_used,
    min_mem_usage,
    num_passes,
    num_cascaded_passes,
    n_operations=0,
    cpu_operations=None,
    weights_compression_ratio=None,
    show_cpu_operations=False,
    f=sys.stdout,
):
    """Print a human-readable performance report to the stream ``f``.

    The report covers, in order: the architecture configuration, design peak
    bandwidths, memory usage, pass fusing and CPU fallback counts, per-area
    bandwidth figures, MAC and Tops/s figures, cycle counts and the batch
    inference time. Only memory areas whose summed bandwidth is greater than
    zero are listed.

    Args:
        arch: Architecture description (configuration names, ``core_clock``
            and ``memory_bandwidths_per_second``).
        name: Network name; when truthy a "Network summary" heading is printed.
        cycles: Cycle counts indexed by ``PassCycles`` kind.
        macs: MAC count per batch.
        bandwidths: Bandwidth data indexed by memory area, then by
            ``TensorPurpose`` and ``BandwidthDirection``.
        batch_size: Batch size used for the per-inference figures.
        memory_used: Mapping from memory area to the amount of memory used.
        min_mem_usage: Theoretical minimum SRAM usage; the allocator overhead
            lines are only printed when this is truthy.
        num_passes: Number of passes before fusing.
        num_cascaded_passes: Number of passes after fusing.
        n_operations: Total operation count; the CPU fallback line is only
            printed when this is greater than zero.
        cpu_operations: Operations falling back to the CPU (``None`` is
            treated as an empty list).
        weights_compression_ratio: Printed only when truthy.
        show_cpu_operations: When true, each CPU operation is listed with its
            input and output tensor shapes.
        f: Output stream.
    """

    def format_tens_list(lst):
        # Space-separated shapes of the given tensors
        return " ".join(str(list(tens.shape)) for tens in lst)

    all_area_labels = [(area, area.display_name()) for area in mem_areas_to_report()]

    midpoint_inference_time = cycles[PassCycles.Total] / arch.core_clock
    # A non-positive inference time has no meaningful rate; report NaN
    midpoint_fps = 1 / midpoint_inference_time if midpoint_inference_time > 0 else np.nan

    # Restrict the report to areas that actually carry traffic
    mem_area_labels = [(area, text) for area, text in all_area_labels if np.sum(bandwidths[area]) > 0]

    if name:
        print("", file=f)
        print(f"Network summary for {name}", file=f)
    print(f"Accelerator configuration {arch.accelerator_config.name:>20}", file=f)
    print(f"System configuration {arch.system_config:>20}", file=f)
    print(f"Memory mode {arch.memory_mode:>20}", file=f)
    print(f"Accelerator clock {int(arch.core_clock / 1e6):12d} MHz", file=f)
    for mem_area, label in mem_area_labels:
        label = label + " bandwidth"
        bandwidth = arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000
        print(f"Design peak {label:25} {bandwidth:12.2f} GB/s", file=f)
    print(file=f)

    for mem_area, label in mem_area_labels:
        if mem_area in memory_used:
            aug_label = label + " used"
            print(f"Total {aug_label:25} {memory_used[mem_area] / 1024.0:12.2f} KiB", file=f)
            if mem_area == MemArea.Sram and min_mem_usage:
                # Usage is looked up via the first listed area whose display
                # label contains "SRAM"
                sram_areas = [area for area, text in mem_area_labels if "SRAM" in text]
                mem_used = memory_used[sram_areas[0]] / 1024.0
                fraction = (mem_used - min_mem_usage / 1024.0) / (min_mem_usage / 1024.0)
                print(f"Theoretical minimum SRAM usage{min_mem_usage/1024.0:23.2F} KiB", file=f)
                print(f"Allocator overhead{100*fraction:35.2F} %", file=f)

    print(file=f)
    print(f"{num_passes:d} passes fused into {num_cascaded_passes:d}", file=f)

    if cpu_operations is None:
        cpu_operations = []

    n_cpu_operations = len(cpu_operations)
    if n_operations > 0:
        print(
            f"{n_cpu_operations:d}/{n_operations:d}"
            f" ({n_cpu_operations / n_operations:4.1%}) operations falling back to the CPU",
            file=f,
        )

    if show_cpu_operations:
        for op in cpu_operations:
            print(
                f"CPU operation: {op.type}"
                f" inputs {format_tens_list(op.inputs)}, outputs {format_tens_list(op.outputs)}",
                file=f,
            )

        print("", file=f)

    for mem_area, label in mem_area_labels:
        area_bws = bandwidths[mem_area]
        total_bw = np.sum(area_bws)
        weight_bws = area_bws[TensorPurpose.Weights]
        fm_bws = area_bws[TensorPurpose.FeatureMap]
        aug_label = label + " bandwidth"
        print(
            f"Average {aug_label:25} {total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0:12.2f} GB/s",
            file=f,
        )
        print(
            f"Input {aug_label:25} {np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0:12.2f} MB/batch",
            file=f,
        )
        print(f"Weight {aug_label:25} {np.sum(weight_bws) / 1000.0 / 1000.0:12.2f} MB/batch", file=f)
        print(
            f"Output {aug_label:25} "
            f"{np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0:12.2f} MB/batch",
            file=f,
        )
        print(f"Total {aug_label:25} {total_bw / 1000.0 / 1000.0:12.2f} MB/batch", file=f)
        print(
            f"Total {aug_label:25} per input "
            f"{total_bw / 1000.0 / 1000.0 / batch_size:9.2f} MB/inference (batch size {batch_size:d})",
            file=f,
        )
        print(file=f)

    if weights_compression_ratio:
        print(f"Weights Compression Ratio {weights_compression_ratio:12.2f}", file=f)

    print(f"Neural network macs {int(macs):12d} MACs/batch", file=f)
    print(f"Network Tops/s {macs * 2 * midpoint_fps / 1e12:12.2f} Tops/s", file=f)
    print(file=f)

    for kind in PassCycles.all():
        aug_label = kind.display_name() + " cycles"
        cyc = cycles[kind]
        print(f"{aug_label:30} {int(cyc):12d} cycles/batch", file=f)
    print(file=f)

    print(
        f"Batch Inference time {midpoint_inference_time * 1000:7.2f} ms,"
        f" {midpoint_fps:7.2f} inferences/s (batch size {batch_size:d})",
        file=f,
    )
    print(file=f)
355
356
def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
    """Print the performance report for a whole network graph.

    Gathers the pass and operation counts, the CPU-placed operations and the
    largest per-subgraph ``min_mem_usage`` from ``nng``, then delegates the
    printing to ``print_performance_metrics_for_strat``.

    Args:
        nng: Network graph object providing ``subgraphs`` and the network
            level statistics (``name``, ``cycles``, ``macs``, ``bandwidths``,
            ``batch_size``, ``memory_used``, ``weights_compression_ratio``).
        arch: Architecture description, passed through unchanged.
        show_cpu_operations: When true, each CPU operation is listed.
        f: Output stream.

    Returns:
        Whatever ``print_performance_metrics_for_strat`` returns.
    """
    n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
    n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
    # Flatten the ops of all CPU-placed passes in a single pass over the data;
    # sum(list_of_lists, []) would copy the accumulated list at every step.
    cpu_operations = [
        op
        for sg in nng.subgraphs
        for ps in sg.passes
        if ps.placement == PassPlacement.Cpu
        for op in ps.ops
    ]
    min_mem_usage = max(sg.min_mem_usage for sg in nng.subgraphs)
    return print_performance_metrics_for_strat(
        arch,
        nng.name,
        nng.cycles,
        nng.macs,
        nng.bandwidths,
        nng.batch_size,
        nng.memory_used,
        min_mem_usage,
        n_passes,
        n_cascaded_passes,
        n_operations,
        cpu_operations,
        nng.weights_compression_ratio,
        show_cpu_operations,
        f,
    )
380
381
def write_human_friendly_metrics(nng, arch, filename):
    """Write the human-readable performance report for ``nng`` to a file.

    Args:
        nng: Network graph object to report on.
        arch: Architecture description, passed through to the report printer.
        filename: Path of the text file to create or overwrite.
    """
    # The context manager makes sure the report is flushed and the handle is
    # closed even if printing raises part-way through.
    with open(filename, "w") as f:
        print_performance_metrics(nng, arch, f=f)