blob: 6fd68f85b2f0f57aa480f67691c9927a05319af4 [file] [log] [blame]
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Writes out per-pass and summary performance statistics to CSV files.
Tim Hall79d07d22020-04-27 18:20:16 +010018import csv
Tim Hall79d07d22020-04-27 18:20:16 +010019import sys
20
Diego Russoea6111a2020-04-14 18:41:58 +010021import numpy as np
22
Diego Russoea6111a2020-04-14 18:41:58 +010023from .nn_graph import PassPlacement
Diego Russoe8a10452020-04-21 17:39:10 +010024from .npu_performance import BandwidthDirection
25from .npu_performance import MacCount
26from .npu_performance import PassCycles
Diego Russoea6111a2020-04-14 18:41:58 +010027from .numeric_util import round_up_to_int
Louis Verhaardaee5d752020-09-30 09:01:52 +020028from .operation import Op
Diego Russoe8a10452020-04-21 17:39:10 +010029from .tensor import MemArea
30from .tensor import TensorPurpose
Diego Russoea6111a2020-04-14 18:41:58 +010031
Tim Hall79d07d22020-04-27 18:20:16 +010032
def mem_areas_to_report():
    """Return the memory areas that the performance reports should cover.

    Every area produced by ``MemArea.all()`` is kept, in the same order,
    except ``MemArea.Shram``.
    """
    reported = []
    for candidate in MemArea.all():
        # SHRAM is left out because its performance numbers only cover LUT usage
        if candidate != MemArea.Shram:
            reported.append(candidate)
    return reported
36
37
def write_summary_metrics_csv(nng, summary_filename, arch):
    """Write a header row and one row of whole-network metrics to a CSV file.

    Args:
        nng: Network graph object. The attributes read here are ``name``,
            ``cycles``, ``macs``, ``bandwidths``, ``memory_used``,
            ``bits_per_element``, ``batch_size`` and ``subgraphs``.
        summary_filename: Path of the CSV file to create; an existing file is
            overwritten.
        arch: Architecture description. The attributes read here are
            ``accelerator_config``, ``system_config``, ``npu_clock``,
            ``sram_size``, ``memory_bandwidths_per_second`` and
            ``tensor_storage_mem_area``.
    """
    # newline="" hands line-ending control to the csv module, as its
    # documentation requires; without it, platforms that translate "\n" on
    # write would emit an extra carriage return after every row.
    with open(summary_filename, "w", newline="") as f:
        writer = csv.writer(f)
        mem_areas = mem_areas_to_report()
        area_names = [area.identifier_name() for area in mem_areas]

        # ---- header row ----
        labels = ["experiment", "network"]
        labels += ["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
        labels += [area_name + "_bandwidth" for area_name in area_names]
        labels += ["weights_storage_area", "feature_map_storage_area"]
        labels += [
            "inferences_per_second",
            "batch_size",
            "inference_time",
            "passes_before_fusing",
            "passes_after_fusing",
        ]
        labels += [area_name + "_memory_used" for area_name in area_names]
        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]

        for area_name in area_names:
            labels += [
                area_name + "_feature_map_read_bytes",
                area_name + "_feature_map_write_bytes",
                area_name + "_weight_read_bytes",
                area_name + "_weight_write_bytes",
                area_name + "_total_bytes",
            ]

        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]

        writer.writerow(labels)

        # ---- data row (same column order as the header) ----
        data_items = ["default", nng.name]

        # NOTE(review): the architecture columns are skipped when arch is
        # falsy, but the header always contains them and arch.npu_clock is read
        # unconditionally just below, so a real arch is effectively required.
        if arch:
            data_items += [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
            data_items += [
                arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas
            ]
            data_items += [
                arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
                arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
            ]

        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
        if midpoint_inference_time > 0:
            midpoint_fps = 1 / midpoint_inference_time
        else:
            # No cycles recorded: report NaN instead of dividing by zero
            midpoint_fps = np.nan

        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
        # Areas without a recorded usage are reported as 0
        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]

        data_items += [
            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
        ]

        for mem_area in mem_areas:
            bws = nng.bandwidths[mem_area]
            total_bw = np.sum(bws)
            weight_bws = bws[TensorPurpose.Weights]
            fm_bws = bws[TensorPurpose.FeatureMap]
            data_items += [
                fm_bws[BandwidthDirection.Read],
                fm_bws[BandwidthDirection.Write],
                weight_bws[BandwidthDirection.Read],
                weight_bws[BandwidthDirection.Write],
                total_bw,
            ]

        nn_macs = nng.macs[MacCount.NeuralNetworkMacs]
        hw_macs = nng.macs[MacCount.HardwareMacs]
        data_items += [
            nn_macs,
            hw_macs,
            # Each MAC is counted as two operations; 1e12 scales to tera-ops
            nn_macs * 2 * midpoint_fps / 1e12,
            hw_macs * 2 * midpoint_fps / 1e12,
        ]

        data_items += [nng.cycles[kind] for kind in PassCycles.all()]

        writer.writerow(data_items)
134
135
def write_pass_metrics_csv(nng, pass_filename):
    """Write one CSV row of cycle, MAC and bandwidth statistics per pass.

    The passes of the root subgraph are walked in cascaded-pass order. A pass
    consisting of a single ``Op.CustomNpuOp`` is not written itself; the
    subgraph stored in that op's ``attrs["subgraph"]`` is written in its place.
    Cascaded passes placed at ``PassPlacement.StartupInit`` are skipped.

    Args:
        nng: Network graph object; ``get_root_subgraph()`` supplies the
            starting subgraph.
        pass_filename: Path of the CSV file to create; an existing file is
            overwritten.
    """
    # newline="" hands line-ending control to the csv module, as its
    # documentation requires; without it, platforms that translate "\n" on
    # write would emit an extra carriage return after every row.
    with open(pass_filename, "w", newline="") as f:
        writer = csv.writer(f)

        # (column-name fragment, tensor purposes summed into that column)
        purpose_list = (
            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
            ("weights", (TensorPurpose.Weights,)),
            ("feature_map", (TensorPurpose.FeatureMap,)),
        )

        # (column-name fragment, bandwidth directions summed into that column)
        direction_list = (
            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
            ("read", (BandwidthDirection.Read,)),
            ("write", (BandwidthDirection.Write,)),
        )

        # Column names and, in matching order, the index sets used to compute them
        bandwidth_names = []
        bandwidth_indices = []
        for mem_area in mem_areas_to_report():
            for purpose, purpose_candidates in purpose_list:
                for direction, direction_candidates in direction_list:
                    label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
                    bandwidth_names.append(label)
                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

        all_macs = MacCount.all()
        all_cycles = (
            PassCycles.Total,
            PassCycles.Dpu,
            PassCycles.ElementWise,
            PassCycles.Cpu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
        )

        header = [
            "name",
            "operators",
            "placement",
            "streaming_strategy",
            "block_config_height",
            "block_config_width",
            "block_config_input_channels",
            "block_config_output_channels",
            "n_blocks_in_pass",
        ]
        header += ["cycles_" + v.identifier_name() for v in all_cycles]
        header += [v.identifier_name() for v in all_macs]
        header += bandwidth_names
        header += ["sram_used"]
        writer.writerow(header)

        def write_subgraph(sg):
            """Write the rows for every pass in sg, recursing into NPU subgraphs."""
            for cps in sg.cascaded_passes:
                if cps.placement == PassPlacement.StartupInit:
                    continue  # skip the dummy init pass

                for ps in cps.passes:
                    if len(ps.ops) == 1 and ps.ops[0].type == Op.CustomNpuOp:
                        # just treat this as a call, unroll it
                        write_subgraph(ps.ops[0].attrs["subgraph"])
                        continue

                    stats = [ps.name, " ".join(op.type.name for op in ps.ops)]
                    stats.append(ps.placement.name)
                    stats.append(cps.strategy.name)
                    stats += list(ps.block_config)
                    stats.append(ps.n_blocks)
                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]

                    for mem_area, purposes, directions in bandwidth_indices:
                        # Each entry is rounded up individually before summing
                        stats.append(
                            sum(
                                round_up_to_int(ps.bandwidths[mem_area, purpose, direction])
                                for purpose in purposes
                                for direction in directions
                            )
                        )

                    # Passes that never had sram_used assigned are reported as 0
                    stats.append(getattr(ps, "sram_used", 0))

                    writer.writerow(stats)

        write_subgraph(nng.get_root_subgraph())
222
223
def print_performance_metrics_for_strat(
    arch,
    name,
    cycles,
    macs,
    bandwidths,
    batch_size,
    memory_used,
    num_passes,
    num_cascaded_passes,
    n_operations=0,
    cpu_operations=None,
    bits_per_element=None,
    show_cpu_operations=False,
    f=sys.stdout,
):
    """Print a human-readable performance summary to the stream f.

    Args:
        arch: Architecture description; ``accelerator_config``,
            ``system_config``, ``npu_clock`` and
            ``memory_bandwidths_per_second`` are read.
        name: Network name; when falsy the "Network summary for" heading is
            omitted.
        cycles: Cycle counts indexed by ``PassCycles`` members.
        macs: MAC counts indexed by ``MacCount`` members.
        bandwidths: Bandwidth data indexed by memory area, then tensor purpose,
            then bandwidth direction.
        batch_size: Batch size used to derive the per-inference figures.
        memory_used: Mapping from memory area to the amount used; areas that
            are missing from it are not listed.
        num_passes: Number of passes before fusing.
        num_cascaded_passes: Number of passes after fusing.
        n_operations: Total operation count; the CPU fallback line is only
            printed when this is greater than zero.
        cpu_operations: Operations that fall back to the CPU. ``None`` (the
            default) is treated as an empty list.
        bits_per_element: Optional mapping from memory area to bits per
            element, shown for the on-chip and off-chip flash areas.
        show_cpu_operations: When true, list every CPU operation with its
            input and output tensor shapes.
        f: Text stream that receives the report.
    """
    # A None default avoids sharing one mutable list object between calls
    if cpu_operations is None:
        cpu_operations = []

    def format_tens_list(lst):
        """Render the shapes of the tensors in lst as space-separated lists."""
        return " ".join(str(list(tens.shape)) for tens in lst)

    orig_mem_areas_labels = [(v, v.display_name()) for v in mem_areas_to_report()]

    midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
    if midpoint_inference_time > 0:
        midpoint_fps = 1 / midpoint_inference_time
    else:
        # No cycles recorded: report NaN instead of dividing by zero
        midpoint_fps = np.nan

    # Only memory areas that actually saw traffic are reported
    mem_area_labels = [
        (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
    ]

    if name:
        print("", file=f)
        print("Network summary for", name, file=f)
    print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f)
    print("System configuration %20s" % (arch.system_config,), file=f)
    print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f)
    for mem_area, label in mem_area_labels:
        print(
            "Design peak %-25s %12.2f GB/s"
            % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
            file=f,
        )

    print(file=f)
    for mem_area, label in mem_area_labels:
        if mem_area not in memory_used:
            continue

        aug_label = label + " used"

        extra = ""
        if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
            extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)

        print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)

    print(file=f)
    print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)

    n_cpu_operations = len(cpu_operations)
    if n_operations > 0:
        print(
            "%d/%d (%4.1f %%) operations falling back to the CPU"
            % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
            file=f,
        )

    if show_cpu_operations:
        for op in cpu_operations:
            print(
                "CPU operation: %s, inputs %s, outputs %s"
                % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
                file=f,
            )

        print("", file=f)

    for mem_area, label in mem_area_labels:
        bws = bandwidths[mem_area]
        total_bw = np.sum(bws)
        weight_bws = bws[TensorPurpose.Weights]
        fm_bws = bws[TensorPurpose.FeatureMap]
        aug_label = label + " bandwidth"
        print(
            "Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
            file=f,
        )
        print(
            "Input %-25s %12.2f MB/batch"
            % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
            file=f,
        )
        print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
        print(
            "Output %-25s %12.2f MB/batch"
            % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
            file=f,
        )
        print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
        print(
            "Total %-25s per input %9.2f MB/inference (batch size %d)"
            % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
            file=f,
        )
        print(file=f)

    print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
    print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
    # Each MAC is counted as two operations; 1e12 scales to tera-ops
    print(
        "Network Tops/s %12.2f Tops/s"
        % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
        file=f,
    )
    print(
        "Hardware Tops/s %12.2f Tops/s"
        % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
        file=f,
    )
    print(file=f)

    for kind in PassCycles.all():
        aug_label = kind.display_name() + " cycles"
        cyc = cycles[kind]
        print("%-30s %12d cycles/batch" % (aug_label, cyc,), file=f)
    print(file=f)

    print(
        "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)"
        % (midpoint_inference_time * 1000, midpoint_fps, batch_size),
        file=f,
    )
    print(file=f)
359
360
def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
    """Print the performance summary of a whole network graph.

    Pass and operation counts are gathered from every subgraph of nng, then
    the report is delegated to ``print_performance_metrics_for_strat``.

    Args:
        nng: Network graph object; ``subgraphs``, ``name``, ``cycles``,
            ``macs``, ``bandwidths``, ``batch_size``, ``memory_used`` and
            ``bits_per_element`` are read.
        arch: Architecture description, forwarded unchanged.
        show_cpu_operations: When true, each CPU-placed operation is listed.
        f: Text stream that receives the report.

    Returns:
        Whatever ``print_performance_metrics_for_strat`` returns.
    """
    n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
    n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
    # Flatten in one pass; sum(list_of_lists, []) would re-copy the growing
    # result for every pass, which is quadratic in the number of operations.
    cpu_operations = [
        op
        for sg in nng.subgraphs
        for ps in sg.passes
        if ps.placement == PassPlacement.Cpu
        for op in ps.ops
    ]
    return print_performance_metrics_for_strat(
        arch=arch,
        name=nng.name,
        cycles=nng.cycles,
        macs=nng.macs,
        bandwidths=nng.bandwidths,
        batch_size=nng.batch_size,
        memory_used=nng.memory_used,
        num_passes=n_passes,
        num_cascaded_passes=n_cascaded_passes,
        n_operations=n_operations,
        cpu_operations=cpu_operations,
        bits_per_element=nng.bits_per_element,
        show_cpu_operations=show_cpu_operations,
        f=f,
    )
382
383
def write_human_friendly_metrics(nng, arch, filename):
    """Write the human-readable performance summary of nng to a text file.

    Args:
        nng: Network graph object, forwarded to ``print_performance_metrics``.
        arch: Architecture description, forwarded unchanged.
        filename: Path of the text file to create; an existing file is
            overwritten.
    """
    # The context manager guarantees the file is flushed and closed, even if
    # producing the report raises part-way through.
    with open(filename, "w") as f:
        print_performance_metrics(nng, arch, f=f)