blob: 9db6a97cd37590f8012e8c62b96236365f31a077 [file] [log] [blame]
# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Writes out per-pass and summary performance statistics to CSV files.
Tim Hall79d07d22020-04-27 18:20:16 +010018import csv
Tim Hall79d07d22020-04-27 18:20:16 +010019import sys
20
Diego Russoea6111a2020-04-14 18:41:58 +010021import numpy as np
22
Diego Russoea6111a2020-04-14 18:41:58 +010023from .nn_graph import PassPlacement
Diego Russoe8a10452020-04-21 17:39:10 +010024from .npu_performance import BandwidthDirection
Diego Russoe8a10452020-04-21 17:39:10 +010025from .npu_performance import PassCycles
Diego Russoea6111a2020-04-14 18:41:58 +010026from .numeric_util import round_up_to_int
Louis Verhaardaee5d752020-09-30 09:01:52 +020027from .operation import Op
Diego Russoe8a10452020-04-21 17:39:10 +010028from .tensor import MemArea
29from .tensor import TensorPurpose
Diego Russoea6111a2020-04-14 18:41:58 +010030
Tim Hall79d07d22020-04-27 18:20:16 +010031
def mem_areas_to_report():
    """Return every memory area from ``MemArea.all()`` except ``MemArea.Shram``."""
    reported = []
    for candidate in MemArea.all():
        # SHRAM is left out because its performance numbers only cover LUT usage
        if candidate != MemArea.Shram:
            reported.append(candidate)
    return reported
35
36
def write_summary_metrics_csv(nng, summary_filename, arch):
    """Write a two-row CSV file (header row, then one data row) summarising a network.

    Args:
        nng: Network graph object. This function reads its ``name``, ``cycles``,
            ``subgraphs``, ``batch_size``, ``memory_used``, ``bandwidths``,
            ``macs``, ``total_original_weights`` and ``total_npu_encoded_weights``.
        summary_filename: Path of the CSV file to create or overwrite.
        arch: Architecture description. Supplies the accelerator configuration,
            system config, memory mode, core clock, arena cache size, the
            per-memory-area bandwidths and the tensor storage areas.
    """
    # The csv module expects files it writes to be opened with newline="";
    # otherwise platforms that translate "\n" produce "\r\r\n" row endings.
    with open(summary_filename, "w", newline="") as f:
        writer = csv.writer(f)
        mem_areas = mem_areas_to_report()

        # ---- header row -------------------------------------------------
        labels = [
            "experiment",
            "network",
        ]

        labels += (
            ["accelerator_configuration", "system_config", "memory_mode", "core_clock", "arena_cache_size"]
            + [area.identifier_name() + "_bandwidth" for area in mem_areas]
            + ["weights_storage_area", "feature_map_storage_area"]
        )

        labels += [
            "inferences_per_second",
            "batch_size",
            "inference_time",
            "passes_before_fusing",
            "passes_after_fusing",
        ]
        labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
        labels += ["total_original_weights"]
        labels += ["total_npu_encoded_weights"]

        for mem_area in mem_areas:
            labels += [
                mem_area.identifier_name() + "_feature_map_read_bytes",
                mem_area.identifier_name() + "_feature_map_write_bytes",
                mem_area.identifier_name() + "_weight_read_bytes",
                mem_area.identifier_name() + "_weight_write_bytes",
                mem_area.identifier_name() + "_total_bytes",
            ]

        labels += ["nn_macs", "nn_tops"]

        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]

        writer.writerow(labels)

        # ---- data row (same column order as the header) -----------------
        data_items = [
            "default",
            nng.name,
        ]

        # NOTE(review): this guard only protects the architecture columns;
        # arch.core_clock is dereferenced unconditionally just below, and the
        # header always contains the architecture columns, so a falsy arch is
        # not really supported here - confirm whether the guard is still needed.
        if arch:
            data_items += (
                [
                    arch.accelerator_config.name,
                    arch.system_config,
                    arch.memory_mode,
                    arch.core_clock,
                    arch.arena_cache_size / 1024,
                ]
                + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas]
                + [
                    arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
                    arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
                ]
            )

        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.core_clock
        if midpoint_inference_time > 0:
            midpoint_fps = 1 / midpoint_inference_time
        else:
            # No cycles recorded: report NaN rather than dividing by zero
            midpoint_fps = np.nan

        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
        # Memory areas without an entry in nng.memory_used are reported as 0
        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]
        data_items += [nng.total_original_weights]
        data_items += [nng.total_npu_encoded_weights]

        for mem_area in mem_areas:
            bws = nng.bandwidths[mem_area]
            total_bw = np.sum(bws)
            weight_bws = bws[TensorPurpose.Weights]
            fm_bws = bws[TensorPurpose.FeatureMap]
            data_items += [
                fm_bws[BandwidthDirection.Read],
                fm_bws[BandwidthDirection.Write],
                weight_bws[BandwidthDirection.Read],
                weight_bws[BandwidthDirection.Write],
                total_bw,
            ]

        data_items += [
            nng.macs,
            # nn_tops: two operations per MAC, scaled by inferences per second
            nng.macs * 2 * midpoint_fps / 1e12,
        ]

        data_items += [nng.cycles[kind] for kind in PassCycles.all()]

        writer.writerow(data_items)
135
136
def write_pass_metrics_csv(nng, pass_filename):
    """Write a CSV file with one row of statistics per pass of the network.

    Starting at ``nng.get_root_subgraph()``, every cascaded pass is visited.
    Cascaded passes placed as ``PassPlacement.StartupInit`` are skipped, and a
    pass consisting of a single ``Op.CustomNpuOp`` is expanded by recursing
    into the subgraph stored in that op's ``attrs["subgraph"]``.

    Args:
        nng: Network graph object providing ``get_root_subgraph()``.
        pass_filename: Path of the CSV file to create or overwrite.
    """
    # The csv module expects files it writes to be opened with newline="";
    # otherwise platforms that translate "\n" produce "\r\r\n" row endings.
    with open(pass_filename, "w", newline="") as f:
        writer = csv.writer(f)

        # Each entry: (column-name fragment, members that are summed for that column)
        purpose_list = (
            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
            ("weights", (TensorPurpose.Weights,)),
            ("feature_map", (TensorPurpose.FeatureMap,)),
        )

        direction_list = (
            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
            ("read", (BandwidthDirection.Read,)),
            ("write", (BandwidthDirection.Write,)),
        )

        # bandwidth_names[n] is the header for the sum selected by bandwidth_indices[n]
        bandwidth_names = []
        bandwidth_indices = []
        for mem_area in mem_areas_to_report():
            for purpose, purpose_candidates in purpose_list:
                for direction, direction_candidates in direction_list:
                    bandwidth_names.append(f"bytes_{mem_area.identifier_name()}_{purpose}_{direction}")
                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

        all_cycles = (
            PassCycles.Total,
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
        )
        writer.writerow(
            [
                "name",
                "operators",
                "placement",
                "streaming_strategy",
                "block_config_height",
                "block_config_width",
                "block_config_input_channels",
                "block_config_output_channels",
            ]
            + ["cycles_" + v.identifier_name() for v in all_cycles]
            + ["nn_macs"]
            + bandwidth_names
            + ["sram_used"]
        )

        def write_subgraph(sg):
            """Emit one CSV row per pass of ``sg``, descending into NPU subgraphs."""
            for cps in sg.cascaded_passes:
                if cps.placement == PassPlacement.StartupInit:
                    continue  # skip the dummy init pass

                for ps in cps.passes:
                    if len(ps.ops) == 1 and ps.ops[0].type == Op.CustomNpuOp:
                        # just treat this as a call, unroll it
                        write_subgraph(ps.ops[0].attrs["subgraph"])
                        continue

                    stats = [ps.name, " ".join(op.type.name for op in ps.ops)]
                    stats += [ps.placement.name]
                    stats += [cps.strategy.name]
                    # Expected to fill the four block_config_* header columns
                    stats += list(ps.block_config)
                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
                    stats += [round_up_to_int(ps.macs)]
                    for mem_area, purposes, directions in bandwidth_indices:
                        # Each contribution is rounded up before being added
                        stats.append(
                            sum(
                                round_up_to_int(ps.bandwidths[mem_area, purpose, direction])
                                for purpose in purposes
                                for direction in directions
                            )
                        )
                    # Passes without an sram_used attribute are reported as 0
                    stats += [getattr(ps, "sram_used", 0)]

                    writer.writerow(stats)

        write_subgraph(nng.get_root_subgraph())
218
219
def print_performance_metrics_for_strat(
    arch,
    name,
    cycles,
    macs,
    bandwidths,
    batch_size,
    memory_used,
    cpu_operations=None,
    npu_operations=None,
    show_cpu_operations=False,
    weights_data=None,
    f=sys.stdout,
):
    """Print a human-readable performance report to ``f``.

    Args:
        arch: Architecture description (accelerator config, system config,
            memory mode, core clock, per-memory-area bandwidths).
        name: Network name; when truthy a "Network summary" heading is printed.
        cycles: Cycle counts indexed by ``PassCycles``.
        macs: Number of MACs per batch.
        bandwidths: Bandwidth data indexed by memory area, then
            ``TensorPurpose``, then ``BandwidthDirection``.
        batch_size: Batch size used for the per-inference figures.
        memory_used: Mapping from memory area to bytes used; areas that are
            absent are not listed.
        cpu_operations: Operations counted as CPU operators (default: none).
        npu_operations: Operations counted as NPU operators (default: none).
        show_cpu_operations: When true, list every operation individually.
            Despite the name this applies to both the CPU and the NPU list.
        weights_data: Optional mapping with ``"original"`` and
            ``"npu_encoded"`` weight sizes in bytes; printed when truthy.
        f: Text stream that receives the whole report.
    """

    orig_mem_areas_labels = [(v, v.display_name()) for v in mem_areas_to_report()]

    midpoint_inference_time = cycles[PassCycles.Total] / arch.core_clock
    if midpoint_inference_time > 0:
        midpoint_fps = 1 / midpoint_inference_time
    else:
        # No cycles recorded: report NaN rather than dividing by zero
        midpoint_fps = np.nan

    # Only report memory areas that actually carried some traffic
    mem_area_labels = [
        (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
    ]

    if name:
        print("", file=f)
        print(f"Network summary for {name}", file=f)
    print(f"Accelerator configuration {arch.accelerator_config.name:>20}", file=f)
    print(f"System configuration {arch.system_config:>20}", file=f)
    print(f"Memory mode {arch.memory_mode:>20}", file=f)
    print(f"Accelerator clock {int(arch.core_clock / 1e6):12d} MHz", file=f)
    for mem_area, label in mem_area_labels:
        label += " bandwidth"
        bandwidth = arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000
        print(
            f"Design peak {label:25} {bandwidth:12.2f} GB/s",
            file=f,
        )
    print(file=f)
    for mem_area, label in mem_area_labels:
        if mem_area not in memory_used:
            continue

        aug_label = label + " used"

        print(f"Total {aug_label:25} {memory_used[mem_area] / 1024.0:12.2f} KiB", file=f)

    print(file=f)

    if cpu_operations is None:
        cpu_operations = []
    if npu_operations is None:
        npu_operations = []

    n_cpu_operations = len(cpu_operations)
    n_npu_operations = len(npu_operations)
    n_total_operations = max(n_cpu_operations + n_npu_operations, 1)  # avoid potential divide by zero

    def format_tens_list(lst):
        """Render the shapes of the tensors in ``lst`` as space-separated lists."""
        return " ".join(str(list(tens.shape)) for tens in lst)

    for str_ops_type, n_ops, ops in (
        ("CPU", n_cpu_operations, cpu_operations),
        ("NPU", n_npu_operations, npu_operations),
    ):
        print(f"{str_ops_type} operators = {n_ops:d} ({n_ops / n_total_operations:4.1%})", file=f)
        if show_cpu_operations:
            for op in ops:
                # Must go to f like the rest of the report; without file=f the
                # per-operator lines ended up on stdout regardless of f.
                print(
                    f" {str_ops_type}: {op.type} = {op.name}"
                    f" (inputs {format_tens_list(op.inputs)}, outputs {format_tens_list(op.outputs)})",
                    file=f,
                )

    print("", file=f)

    for mem_area, label in mem_area_labels:
        bws = bandwidths[mem_area]
        total_bw = np.sum(bws)
        weight_bws = bws[TensorPurpose.Weights]
        fm_bws = bws[TensorPurpose.FeatureMap]
        aug_label = label + " bandwidth"
        print(
            f"Average {aug_label:25} {total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0:12.2f} GB/s",
            file=f,
        )
        print(
            f"Input {aug_label:25} {np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0:12.2f} MB/batch",
            file=f,
        )
        print(f"Weight {aug_label:25} {np.sum(weight_bws) / 1000.0 / 1000.0:12.2f} MB/batch", file=f)
        print(
            f"Output {aug_label:25} "
            f"{np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0:12.2f} MB/batch",
            file=f,
        )
        print(f"Total {aug_label:25} {total_bw / 1000.0 / 1000.0:12.2f} MB/batch", file=f)
        print(
            f"Total {aug_label:25} per input "
            f"{total_bw / 1000.0 / 1000.0 / batch_size:9.2f} MB/inference (batch size {batch_size:d})",
            file=f,
        )
        print(file=f)

    if weights_data:
        print(f"Original Weights Size {weights_data['original'] / 1024.0:12.2f} KiB", file=f)
        print(f"NPU Encoded Weights Size {weights_data['npu_encoded'] / 1024.0:12.2f} KiB", file=f)
        print(file=f)

    print(
        f"Neural network macs {int(macs):12d} MACs/batch",
        file=f,
    )
    print(
        # Two operations are counted per MAC
        f"Network Tops/s {macs * 2 * midpoint_fps / 1e12:12.2f} Tops/s",
        file=f,
    )
    print(file=f)

    for kind in PassCycles.all():
        aug_label = kind.display_name() + " cycles"
        cyc = cycles[kind]
        print(f"{aug_label:30} {int(cyc):12d} cycles/batch", file=f)
    print(file=f)

    print(
        f"Batch Inference time {midpoint_inference_time * 1000:7.2f} ms,"
        f" {midpoint_fps:7.2f} inferences/s (batch size {batch_size:d})",
        file=f,
    )
    print(file=f)
353
354
def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weights=False, f=sys.stdout):
    """Collect the CPU and NPU operations of ``nng`` and print its performance report.

    Operations are gathered from every subgraph placed on the CPU or the NPU,
    leaving out the op types listed in ``skipped_types``. The weight sizes are
    only passed on when ``verbose_weights`` is set. Printing is delegated to
    ``print_performance_metrics_for_strat``, whose result is returned.
    """
    # Op types that are left out of the operator counts
    skipped_types = (
        Op.Const,
        Op.Placeholder,
        Op.CustomNpuOp,
        Op.SubgraphInput,
    )

    on_cpu = []
    on_npu = []
    for subgraph in nng.subgraphs:
        if subgraph.placement == PassPlacement.Cpu:
            bucket = on_cpu
        elif subgraph.placement == PassPlacement.Npu:
            bucket = on_npu
        else:
            # Subgraphs with any other placement contribute no operators
            continue
        bucket.extend(op for op in subgraph.get_all_ops() if op.type not in skipped_types)

    weights_data = None
    if verbose_weights:
        weights_data = {
            "original": nng.total_original_weights,
            "npu_encoded": nng.total_npu_encoded_weights,
        }

    return print_performance_metrics_for_strat(
        arch,
        nng.name,
        nng.cycles,
        nng.macs,
        nng.bandwidths,
        nng.batch_size,
        nng.memory_used,
        on_cpu,
        on_npu,
        show_cpu_operations,
        weights_data,
        f,
    )