blob: 494b25e794a0d46d471e8d0cc2948cb2a92803b4 [file] [log] [blame]
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Writes out per-pass and summary performance statistics to CSV files.
Tim Hall79d07d22020-04-27 18:20:16 +010018import csv
Tim Hall79d07d22020-04-27 18:20:16 +010019import sys
20
Diego Russoea6111a2020-04-14 18:41:58 +010021import numpy as np
22
Diego Russoea6111a2020-04-14 18:41:58 +010023from .nn_graph import PassPlacement
Diego Russoe8a10452020-04-21 17:39:10 +010024from .npu_performance import BandwidthDirection
25from .npu_performance import MacCount
26from .npu_performance import PassCycles
Diego Russoea6111a2020-04-14 18:41:58 +010027from .numeric_util import round_up_to_int
Louis Verhaardaee5d752020-09-30 09:01:52 +020028from .operation import Op
Diego Russoe8a10452020-04-21 17:39:10 +010029from .tensor import MemArea
30from .tensor import TensorPurpose
Diego Russoea6111a2020-04-14 18:41:58 +010031
Tim Hall79d07d22020-04-27 18:20:16 +010032
def mem_areas_to_report():
    """Return every memory area from ``MemArea.all()`` except ``MemArea.Shram``."""
    # SHRAM is left out because its performance numbers only cover LUT usage.
    reportable = []
    for candidate in MemArea.all():
        if candidate != MemArea.Shram:
            reportable.append(candidate)
    return reportable
36
37
def write_summary_metrics_csv(nng, summary_filename, arch):
    """Write a two-row CSV (header row + one data row) of whole-network statistics.

    Args:
        nng: Network graph object. This function reads its ``name``, ``cycles``,
            ``subgraphs``, ``batch_size``, ``memory_used``, ``bits_per_element``,
            ``bandwidths`` and ``macs`` attributes.
        summary_filename: Path of the CSV file to create (an existing file is
            overwritten).
        arch: Architecture description. This function reads its
            ``accelerator_config``, ``system_config``, ``memory_mode``,
            ``core_clock``, ``sram_size``, ``memory_bandwidths_per_second`` and
            ``tensor_storage_mem_area`` attributes.
    """
    # The csv writer emits its own line terminators, so the file has to be opened
    # with newline="" to stop the text layer from translating them a second time
    # (which shows up as blank rows on platforms that translate "\n").
    with open(summary_filename, "w", newline="") as f:
        writer = csv.writer(f)
        mem_areas = mem_areas_to_report()

        # ---- header row ----
        labels = [
            "experiment",
            "network",
        ]

        labels += (
            ["accelerator_configuration", "system_config", "memory_mode", "core_clock", "sram_size"]
            + [area.identifier_name() + "_bandwidth" for area in mem_areas]
            + ["weights_storage_area", "feature_map_storage_area"]
        )

        labels += [
            "inferences_per_second",
            "batch_size",
            "inference_time",
            "passes_before_fusing",
            "passes_after_fusing",
        ]
        labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]

        for mem_area in mem_areas:
            area_name = mem_area.identifier_name()
            labels += [
                area_name + "_feature_map_read_bytes",
                area_name + "_feature_map_write_bytes",
                area_name + "_weight_read_bytes",
                area_name + "_weight_write_bytes",
                area_name + "_total_bytes",
            ]

        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]

        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]

        writer.writerow(labels)

        # ---- data row (must stay in the same column order as the header) ----
        data_items = [
            "default",
            nng.name,
        ]

        # NOTE(review): this guard only protects the configuration columns.
        # arch.core_clock is read unconditionally just below, so a falsy arch is
        # not really supported, and skipping these columns would also misalign
        # the data row against the header.
        if arch:
            data_items += (
                [
                    arch.accelerator_config.name,
                    arch.system_config,
                    arch.memory_mode,
                    arch.core_clock,
                    arch.sram_size / 1024,
                ]
                + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas]
                + [
                    arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
                    arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
                ]
            )

        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.core_clock
        if midpoint_inference_time > 0:
            midpoint_fps = 1 / midpoint_inference_time
        else:
            # Zero cycles: report NaN rather than dividing by zero.
            midpoint_fps = np.nan

        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]

        data_items += [
            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
        ]

        for mem_area in mem_areas:
            bws = nng.bandwidths[mem_area]
            total_bw = np.sum(bws)
            weight_bws = bws[TensorPurpose.Weights]
            fm_bws = bws[TensorPurpose.FeatureMap]
            data_items += [
                fm_bws[BandwidthDirection.Read],
                fm_bws[BandwidthDirection.Write],
                weight_bws[BandwidthDirection.Read],
                weight_bws[BandwidthDirection.Write],
                total_bw,
            ]

        data_items += [
            nng.macs[MacCount.NeuralNetworkMacs],
            nng.macs[MacCount.HardwareMacs],
            # Two operations per MAC, scaled to tera-operations per second.
            nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12,
            nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12,
        ]

        data_items += [nng.cycles[kind] for kind in PassCycles.all()]

        writer.writerow(data_items)
140
141
def write_pass_metrics_csv(nng, pass_filename):
    """Write a CSV with a header row followed by one row of statistics per pass.

    Passes are visited by walking the cascaded passes of the root subgraph.
    A pass that consists solely of a ``CustomNpuOp`` is not written itself;
    the subgraph it refers to is walked in its place.

    Args:
        nng: Network graph object; its root subgraph is obtained through
            ``nng.get_root_subgraph()``.
        pass_filename: Path of the CSV file to create (an existing file is
            overwritten).
    """
    # The csv writer emits its own line terminators, so the file has to be opened
    # with newline="" to stop the text layer from translating them a second time.
    with open(pass_filename, "w", newline="") as f:
        writer = csv.writer(f)

        # Each entry is (column label fragment, tensor purposes summed for it).
        purpose_list = (
            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
            ("weights", (TensorPurpose.Weights,)),
            ("feature_map", (TensorPurpose.FeatureMap,)),
        )

        # Each entry is (column label fragment, directions summed for it).
        direction_list = (
            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
            ("read", (BandwidthDirection.Read,)),
            ("write", (BandwidthDirection.Write,)),
        )
        # bandwidth_names[n] is the header for the sum described by bandwidth_indices[n].
        bandwidth_names = []
        bandwidth_indices = []
        for mem_area in mem_areas_to_report():
            for purpose, purpose_candidates in purpose_list:
                for direction, direction_candidates in direction_list:
                    label = "bytes_{}_{}_{}".format(mem_area.identifier_name(), purpose, direction)
                    bandwidth_names.append(label)
                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

        all_macs = MacCount.all()
        all_cycles = (
            PassCycles.Total,
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
        )
        writer.writerow(
            [
                "name",
                "operators",
                "placement",
                "streaming_strategy",
                "block_config_height",
                "block_config_width",
                "block_config_input_channels",
                "block_config_output_channels",
                "n_blocks_in_pass",
            ]
            + ["cycles_" + v.identifier_name() for v in all_cycles]
            + [v.identifier_name() for v in all_macs]
            + bandwidth_names
            + ["sram_used"]
        )

        def write_subgraph(sg):
            """Write one row per pass of ``sg``, recursing into NPU subgraphs."""
            for cps in sg.cascaded_passes:
                if cps.placement == PassPlacement.StartupInit:
                    continue  # skip the dummy init pass

                for ps in cps.passes:
                    if len(ps.ops) == 1 and ps.ops[0].type == Op.CustomNpuOp:
                        # just treat this as a call, unroll it
                        write_subgraph(ps.ops[0].attrs["subgraph"])
                        continue
                    stats = [ps.name, " ".join(op.type.name for op in ps.ops)]
                    stats += [ps.placement.name]
                    stats += [cps.strategy.name]
                    # presumably four values matching the block_config_* header
                    # columns - TODO confirm
                    stats += list(ps.block_config)
                    stats += [ps.n_blocks]
                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
                    for mem_area, purposes, directions in bandwidth_indices:
                        # Each term is rounded up before summing.
                        stats.append(
                            sum(
                                round_up_to_int(ps.bandwidths[mem_area, purpose, direction])
                                for purpose in purposes
                                for direction in directions
                            )
                        )
                    # Passes without an sram_used attribute are reported as 0.
                    stats += [getattr(ps, "sram_used", 0)]

                    writer.writerow(stats)

        write_subgraph(nng.get_root_subgraph())
226
227
def print_performance_metrics_for_strat(
    arch,
    name,
    cycles,
    macs,
    bandwidths,
    batch_size,
    memory_used,
    num_passes,
    num_cascaded_passes,
    n_operations=0,
    cpu_operations=None,
    bits_per_element=None,
    show_cpu_operations=False,
    f=sys.stdout,
):
    """Print a human-readable performance summary to ``f``.

    Only memory areas whose summed bandwidth is greater than zero are listed.

    Args:
        arch: Architecture description; ``accelerator_config``, ``system_config``,
            ``memory_mode``, ``core_clock`` and ``memory_bandwidths_per_second``
            are read.
        name: Network name for the heading; a falsy value suppresses the heading.
        cycles: Cycle counts, indexed by ``PassCycles`` members.
        macs: MAC counts, indexed by ``MacCount`` members.
        bandwidths: Bandwidth figures, indexed by memory area, then
            ``TensorPurpose``, then ``BandwidthDirection``.
        batch_size: Integer batch size; used to derive the per-inference bandwidth.
        memory_used: Container mapping memory area to the amount used; areas
            not present in it are skipped.
        num_passes: Number of passes before fusing.
        num_cascaded_passes: Number of passes after fusing.
        n_operations: Total number of operations; the CPU fallback line is only
            printed when this is greater than zero.
        cpu_operations: Operations that run on the CPU; ``None`` means none.
        bits_per_element: Optional mapping from memory area to bits per element,
            shown next to the on-chip/off-chip flash usage lines.
        show_cpu_operations: When true, list every CPU operation with the shapes
            of its inputs and outputs.
        f: Text stream to print to.
    """

    def format_tens_list(lst):
        # Space-separated shapes, e.g. "[1, 8, 8, 3] [3]".
        return " ".join(str(list(tens.shape)) for tens in lst)

    orig_mem_areas_labels = [(v, v.display_name()) for v in mem_areas_to_report()]

    midpoint_inference_time = cycles[PassCycles.Total] / arch.core_clock
    if midpoint_inference_time > 0:
        midpoint_fps = 1 / midpoint_inference_time
    else:
        # Zero cycles: report NaN rather than dividing by zero.
        midpoint_fps = np.nan

    # Drop memory areas that saw no traffic at all.
    mem_area_labels = [
        (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
    ]

    if name:
        print("", file=f)
        print("Network summary for", name, file=f)
    print("Accelerator configuration {:>20}".format(arch.accelerator_config.name), file=f)
    print("System configuration {:>20}".format(arch.system_config), file=f)
    print("Memory mode {:>20}".format(arch.memory_mode), file=f)
    print("Accelerator clock {:12d} MHz".format(int(arch.core_clock / 1e6)), file=f)
    for mem_area, label in mem_area_labels:
        print(
            "Design peak {:25} {:12.2f} GB/s".format(
                label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000
            ),
            file=f,
        )
    print(file=f)
    for mem_area, label in mem_area_labels:
        if mem_area not in memory_used:
            continue

        aug_label = label + " used"

        extra = ""
        if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
            extra = " ({:.2f} bits per element)".format(bits_per_element[mem_area])

        print("Total {:25} {:12.2f} KiB{}".format(aug_label, memory_used[mem_area] / 1024.0, extra), file=f)

    print(file=f)
    print("{:d} passes fused into {:d}".format(num_passes, num_cascaded_passes), file=f)

    if cpu_operations is None:
        cpu_operations = []

    n_cpu_operations = len(cpu_operations)
    if n_operations > 0:
        # The "%" presentation type multiplies by 100 itself, so it must be given
        # the plain ratio; scaling it here as well would print e.g. 5000.0% for 50%.
        print(
            "{:d}/{:d} ({:4.1%}) operations falling back to the CPU".format(
                n_cpu_operations, n_operations, n_cpu_operations / n_operations
            ),
            file=f,
        )

    if show_cpu_operations:
        for op in cpu_operations:
            print(
                "CPU operation: {} inputs {}, outputs {}".format(
                    op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)
                ),
                file=f,
            )

        print("", file=f)

    for mem_area, label in mem_area_labels:
        bws = bandwidths[mem_area]
        total_bw = np.sum(bws)
        weight_bws = bws[TensorPurpose.Weights]
        fm_bws = bws[TensorPurpose.FeatureMap]
        aug_label = label + " bandwidth"
        print(
            "Average {:25} {:12.2f} GB/s".format(aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0),
            file=f,
        )
        print(
            "Input {:25} {:12.2f} MB/batch".format(
                aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0
            ),
            file=f,
        )
        print("Weight {:25} {:12.2f} MB/batch".format(aug_label, np.sum(weight_bws) / 1000.0 / 1000.0), file=f)
        print(
            "Output {:25} {:12.2f} MB/batch".format(
                aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0
            ),
            file=f,
        )
        print("Total {:25} {:12.2f} MB/batch".format(aug_label, total_bw / 1000.0 / 1000.0), file=f)
        print(
            "Total {:25} per input {:9.2f} MB/inference (batch size {:d})".format(
                aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size
            ),
            file=f,
        )
        print(file=f)

    print(
        "Neural network macs {:12d} MACs/batch".format(int(macs[MacCount.NeuralNetworkMacs])),
        file=f,
    )
    print("Hardware macs {:12d} MACs/batch".format(int(macs[MacCount.HardwareMacs])), file=f)
    # Two operations per MAC, scaled to tera-operations per second.
    print(
        "Network Tops/s {:12.2f} Tops/s".format(
            macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12
        ),
        file=f,
    )
    print(
        "Hardware Tops/s {:12.2f} Tops/s".format(
            macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12
        ),
        file=f,
    )
    print(file=f)

    for kind in PassCycles.all():
        aug_label = kind.display_name() + " cycles"
        cyc = cycles[kind]
        print("{:30} {:12d} cycles/batch".format(aug_label, int(cyc)), file=f)
    print(file=f)

    print(
        "Batch Inference time {:7.2f} ms, {:7.2f} inferences/s (batch size {:d})".format(
            midpoint_inference_time * 1000, midpoint_fps, batch_size
        ),
        file=f,
    )
    print(file=f)
378
379
def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
    """Collect whole-network totals from ``nng`` and print the performance summary.

    Args:
        nng: Network graph object; its subgraphs' ``passes`` and
            ``cascaded_passes`` are counted, and its ``name``, ``cycles``,
            ``macs``, ``bandwidths``, ``batch_size``, ``memory_used`` and
            ``bits_per_element`` attributes are forwarded.
        arch: Architecture description, forwarded unchanged.
        show_cpu_operations: When true, every operation placed on the CPU is listed.
        f: Text stream to print to.

    Returns:
        Whatever ``print_performance_metrics_for_strat`` returns.
    """
    # Materialise the passes once; they are scanned several times below.
    all_passes = [ps for sg in nng.subgraphs for ps in sg.passes]

    n_passes = len(all_passes)
    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
    n_operations = sum(len(ps.ops) for ps in all_passes)
    # Flatten with a comprehension; sum(list_of_lists, []) re-copies the
    # accumulated list at every step and is quadratic in the number of ops.
    cpu_operations = [op for ps in all_passes if ps.placement == PassPlacement.Cpu for op in ps.ops]
    return print_performance_metrics_for_strat(
        arch,
        nng.name,
        nng.cycles,
        nng.macs,
        nng.bandwidths,
        nng.batch_size,
        nng.memory_used,
        n_passes,
        n_cascaded_passes,
        n_operations,
        cpu_operations,
        nng.bits_per_element,
        show_cpu_operations,
        f,
    )
401
402
def write_human_friendly_metrics(nng, arch, filename):
    """Write the human-readable performance summary for ``nng`` to ``filename``.

    Args:
        nng: Network graph object, forwarded to ``print_performance_metrics``.
        arch: Architecture description, forwarded to ``print_performance_metrics``.
        filename: Path of the text file to create (an existing file is overwritten).
    """
    # The context manager guarantees the file is flushed and closed, even if
    # printing raises part-way through.
    with open(filename, "w") as f:
        print_performance_metrics(nng, arch, f=f)