blob: af7b699736eaf3d4883fa13dc5d3cacdd08f7034 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Writes out per-pass and summary performance statistics to CSV files.
Tim Hall79d07d22020-04-27 18:20:16 +010018import csv
Tim Hall79d07d22020-04-27 18:20:16 +010019import sys
20
Diego Russoea6111a2020-04-14 18:41:58 +010021import numpy as np
22
Diego Russoea6111a2020-04-14 18:41:58 +010023from .nn_graph import PassPlacement
Diego Russoe8a10452020-04-21 17:39:10 +010024from .npu_performance import BandwidthDirection
25from .npu_performance import MacCount
26from .npu_performance import PassCycles
Diego Russoea6111a2020-04-14 18:41:58 +010027from .numeric_util import round_up_to_int
Diego Russoe8a10452020-04-21 17:39:10 +010028from .tensor import MemArea
29from .tensor import TensorPurpose
Diego Russoea6111a2020-04-14 18:41:58 +010030
Tim Hall79d07d22020-04-27 18:20:16 +010031
def write_summary_metrics_csv(nng, summary_filename, arch):
    """Write a network-level performance summary to a CSV file.

    The file contains exactly two rows: a header row of column labels and a
    single row of values for the whole network.

    Args:
        nng: Network graph. This function reads its ``name``, ``cycles``,
            ``subgraphs``, ``batch_size``, ``memory_used``,
            ``bits_per_element``, ``bandwidths`` and ``macs`` attributes.
        summary_filename: Path of the CSV file to create (opened in "w" mode,
            so an existing file is overwritten).
        arch: Architecture description. This function reads
            ``accelerator_config``, ``system_config``, ``npu_clock``,
            ``sram_size``, ``memory_bandwidths_per_second`` and
            ``tensor_storage_mem_area``.
    """
    # The csv module requires files passed to csv.writer to be opened with
    # newline=""; otherwise platforms that translate "\n" on write turn the
    # writer's "\r\n" row terminator into "\r\r\n".
    with open(summary_filename, "w", newline="") as f:
        writer = csv.writer(f)

        # ---- header row ----
        labels = [
            "experiment",
            "network",
        ]

        labels += (
            ["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
            + [area.identifier_name() + "_bandwidth" for area in MemArea.all()]
            + ["weights_storage_area", "feature_map_storage_area"]
        )

        labels += [
            "inferences_per_second",
            "batch_size",
            "inference_time",
            "passes_before_fusing",
            "passes_after_fusing",
        ]
        labels += [area.identifier_name() + "_memory_used" for area in MemArea.all()]
        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]

        for mem_area in MemArea.all():
            labels += [
                mem_area.identifier_name() + "_feature_map_read_bytes",
                mem_area.identifier_name() + "_feature_map_write_bytes",
                mem_area.identifier_name() + "_weight_read_bytes",
                mem_area.identifier_name() + "_weight_write_bytes",
                mem_area.identifier_name() + "_total_bytes",
            ]

        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]

        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]

        writer.writerow(labels)

        # ---- data row: must stay in the same column order as the labels above ----
        data_items = [
            "default",
            nng.name,
        ]

        # NOTE(review): arch.npu_clock is read unconditionally just below, so a
        # falsy arch is not really supported; if this branch were skipped the
        # data row would also be shorter than the header. Guard kept as-is.
        if arch:
            data_items += (
                # sram_size is divided by 1024; bandwidths are divided by 1e9.
                [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
                + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in MemArea.all()]
                + [
                    arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
                    arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
                ]
            )

        # Total cycles divided by the clock gives the time for one batch.
        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
        if midpoint_inference_time > 0:
            midpoint_fps = 1 / midpoint_inference_time
        else:
            # Avoid dividing by zero; NaN propagates into the derived columns.
            midpoint_fps = np.nan

        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
        # Memory areas with no recorded usage are reported as 0.
        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in MemArea.all()]

        data_items += [
            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
        ]

        for mem_area in MemArea.all():
            bws = nng.bandwidths[mem_area]
            total_bw = np.sum(bws)
            weight_bws = bws[TensorPurpose.Weights]
            fm_bws = bws[TensorPurpose.FeatureMap]
            data_items += [
                fm_bws[BandwidthDirection.Read],
                fm_bws[BandwidthDirection.Write],
                weight_bws[BandwidthDirection.Read],
                weight_bws[BandwidthDirection.Write],
                total_bw,
            ]

        data_items += [
            nng.macs[MacCount.NeuralNetworkMacs],
            nng.macs[MacCount.HardwareMacs],
            # The "tops" columns: 2 * MACs * batches-per-second, scaled by 1e12.
            nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12,
            nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12,
        ]

        data_items += [nng.cycles[kind] for kind in PassCycles.all()]

        writer.writerow(data_items)
127
128
def write_pass_metrics_csv(nng, pass_filename):
    """Write per-pass performance statistics to a CSV file.

    A header row is written first, then one row per pass. Passes are visited
    by walking the cascaded passes of the root subgraph; a pass that consists
    of a single op of type "NpuOp" is not written itself but expanded by
    recursing into the subgraph stored in that op's ``attrs["subgraph"]``.

    Args:
        nng: Network graph providing ``get_root_subgraph()``.
        pass_filename: Path of the CSV file to create (opened in "w" mode,
            so an existing file is overwritten).
    """
    # The csv module requires files passed to csv.writer to be opened with
    # newline=""; otherwise platforms that translate "\n" on write turn the
    # writer's "\r\n" row terminator into "\r\r\n".
    with open(pass_filename, "w", newline="") as f:
        writer = csv.writer(f)

        # Each entry: (column-name fragment, purposes summed into that column).
        purpose_list = (
            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
            ("weights", (TensorPurpose.Weights,)),
            ("feature_map", (TensorPurpose.FeatureMap,)),
        )

        # Each entry: (column-name fragment, directions summed into that column).
        direction_list = (
            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
            ("read", (BandwidthDirection.Read,)),
            ("write", (BandwidthDirection.Write,)),
        )

        # bandwidth_names[n] is the header of the column whose value is built
        # from bandwidth_indices[n]; the two lists are kept in lockstep.
        bandwidth_names = []
        bandwidth_indices = []
        for mem_area in MemArea.all():
            for purpose, purpose_candidates in purpose_list:
                for direction, direction_candidates in direction_list:
                    label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
                    bandwidth_names.append(label)
                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

        all_macs = MacCount.all()
        # Explicit list of the cycle kinds reported per pass, in column order.
        all_cycles = (
            PassCycles.Total,
            PassCycles.Dpu,
            PassCycles.ElementWise,
            PassCycles.Cpu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
        )
        writer.writerow(
            [
                "name",
                "operators",
                "placement",
                "streaming_strategy",
                "block_config_height",
                "block_config_width",
                "block_config_input_channels",
                "block_config_output_channels",
                "n_blocks_in_pass",
            ]
            + ["cycles_" + v.identifier_name() for v in all_cycles]
            + [v.identifier_name() for v in all_macs]
            + bandwidth_names
            + ["sram_used"]
        )

        def write_subgraph(sg):
            """Write one CSV row per pass of ``sg``, recursing into NpuOp subgraphs."""
            for cps in sg.cascaded_passes:
                if cps.placement == PassPlacement.StartupInit:
                    continue  # skip the dummy init pass

                for ps in cps.passes:
                    if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp":
                        # just treat this as a call, unroll it
                        write_subgraph(ps.ops[0].attrs["subgraph"])
                        continue
                    stats = [ps.name, " ".join(op.type for op in ps.ops)]
                    stats += [ps.placement.name]
                    stats += [cps.strategy.name]
                    # The header reserves four block_config_* columns;
                    # presumably block_config has four entries - TODO confirm.
                    stats += list(ps.block_config)
                    stats += [ps.n_blocks]
                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
                    for mem_area, purposes, directions in bandwidth_indices:
                        # Each term is rounded individually before being summed.
                        res = 0
                        for purpose in purposes:
                            for direction in directions:
                                res += round_up_to_int(ps.bandwidths[mem_area, purpose, direction])
                        stats.append(res)
                    # Passes that carry no sram_used attribute are reported as 0.
                    stats += [getattr(ps, "sram_used", 0)]

                    writer.writerow(stats)

        write_subgraph(nng.get_root_subgraph())
215
216
def print_performance_metrics_for_strat(
    arch,
    name,
    cycles,
    macs,
    bandwidths,
    batch_size,
    memory_used,
    num_passes,
    num_cascaded_passes,
    n_operations=0,
    cpu_operations=None,
    bits_per_element=None,
    show_cpu_operations=False,
    f=sys.stdout,
):
    """Print a human-readable performance report to ``f``.

    Only memory areas whose summed bandwidth is greater than zero appear in
    the per-memory-area sections of the report.

    Args:
        arch: Architecture description; ``accelerator_config``,
            ``system_config``, ``npu_clock`` and
            ``memory_bandwidths_per_second`` are read.
        name: Network name; when falsy the "Network summary for" heading is
            omitted.
        cycles: Cycle counts indexed by ``PassCycles`` kind.
        macs: MAC counts indexed by ``MacCount`` kind.
        bandwidths: Bandwidths indexed by memory area, then tensor purpose,
            then direction.
        batch_size: Batch size, used to derive the per-inference figures.
        memory_used: Mapping from memory area to bytes used; areas that are
            not present are skipped.
        num_passes: Number of passes before fusing.
        num_cascaded_passes: Number of passes after fusing.
        n_operations: Total operation count; the CPU fallback line is printed
            only when this is greater than zero.
        cpu_operations: Operations placed on the CPU. ``None`` (the default)
            is treated as an empty list.
        bits_per_element: Optional mapping from memory area to bits per
            element, shown for the two flash memory areas.
        show_cpu_operations: When true, list every CPU operation with its
            input and output tensor shapes.
        f: Writable text stream that receives the report.
    """
    # A None sentinel replaces the previous mutable "[]" default so that no
    # list object is shared between calls.
    if cpu_operations is None:
        cpu_operations = []

    def format_tens_list(lst):
        # Space-separated shapes, each rendered as a Python list.
        return " ".join(str(list(tens.shape)) for tens in lst)

    orig_mem_areas_labels = [(v, v.display_name()) for v in MemArea.all()]

    # Total cycles divided by the clock gives the time for one batch.
    midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
    if midpoint_inference_time > 0:
        midpoint_fps = 1 / midpoint_inference_time
    else:
        # Avoid dividing by zero; NaN propagates into the derived figures.
        midpoint_fps = np.nan

    # Keep only the memory areas that actually carry traffic.
    mem_area_labels = [
        (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
    ]

    if name:
        print("", file=f)
        print("Network summary for", name, file=f)
    print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f)
    print("System configuration %20s" % (arch.system_config,), file=f)
    print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f)
    for mem_area, label in mem_area_labels:
        print(
            "Design peak %-25s %12.2f GB/s"
            % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
            file=f,
        )

    print(file=f)
    for mem_area, label in mem_area_labels:
        if mem_area not in memory_used:
            continue

        aug_label = label + " used"

        extra = ""
        if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
            extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)

        print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)

    print(file=f)
    print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)

    n_cpu_operations = len(cpu_operations)
    if n_operations > 0:
        print(
            "%d/%d (%4.1f %%) operations falling back to the CPU"
            % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
            file=f,
        )

    if show_cpu_operations:
        for op in cpu_operations:
            print(
                "CPU operation: %s, inputs %s, outputs %s"
                % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
                file=f,
            )

        print("", file=f)

    for mem_area, label in mem_area_labels:
        bws = bandwidths[mem_area]
        total_bw = np.sum(bws)
        weight_bws = bws[TensorPurpose.Weights]
        fm_bws = bws[TensorPurpose.FeatureMap]
        aug_label = label + " bandwidth"
        print(
            "Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
            file=f,
        )
        print(
            "Input %-25s %12.2f MB/batch"
            % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
            file=f,
        )
        print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
        print(
            "Output %-25s %12.2f MB/batch"
            % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
            file=f,
        )
        print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
        print(
            "Total %-25s per input %9.2f MB/inference (batch size %d)"
            % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
            file=f,
        )
        print(file=f)

    print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
    print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
    print(
        "Network Tops/s %12.2f Tops/s"
        % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
        file=f,
    )
    print(
        "Hardware Tops/s %12.2f Tops/s"
        % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
        file=f,
    )
    print(file=f)

    for kind in PassCycles.all():
        aug_label = kind.display_name() + " cycles"
        cyc = cycles[kind]
        print("%-30s %12d cycles/batch" % (aug_label, cyc,), file=f)
    print(file=f)

    print(
        "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)"
        % (midpoint_inference_time * 1000, midpoint_fps, batch_size),
        file=f,
    )
    print(file=f)
352
353
def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
    """Print the performance report for a whole network graph.

    Gathers the pass and operation counts from ``nng.subgraphs`` and forwards
    them, together with the network-level statistics stored on ``nng``, to
    ``print_performance_metrics_for_strat``.

    Args:
        nng: Network graph; ``subgraphs``, ``name``, ``cycles``, ``macs``,
            ``bandwidths``, ``batch_size``, ``memory_used`` and
            ``bits_per_element`` are read.
        arch: Architecture description, passed through unchanged.
        show_cpu_operations: When true, each CPU operation is listed.
        f: Writable text stream that receives the report.

    Returns:
        Whatever ``print_performance_metrics_for_strat`` returns.
    """
    n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
    n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
    # Flatten the ops of every CPU-placed pass into one list. A single
    # comprehension avoids the repeated list copying of sum(lists, []).
    cpu_operations = [
        op
        for sg in nng.subgraphs
        for ps in sg.passes
        if ps.placement == PassPlacement.Cpu
        for op in ps.ops
    ]
    return print_performance_metrics_for_strat(
        arch,
        nng.name,
        nng.cycles,
        nng.macs,
        nng.bandwidths,
        nng.batch_size,
        nng.memory_used,
        n_passes,
        n_cascaded_passes,
        n_operations,
        cpu_operations,
        nng.bits_per_element,
        show_cpu_operations,
        f,
    )
375
376
def write_human_friendly_metrics(nng, arch, filename):
    """Write the human-readable performance report for ``nng`` to a text file.

    Args:
        nng: Network graph, passed through to ``print_performance_metrics``.
        arch: Architecture description, passed through unchanged.
        filename: Path of the text file to create (opened in "w" mode, so an
            existing file is overwritten).
    """
    # The context manager guarantees the file is flushed and closed even if
    # report generation raises; previously the handle was never closed.
    with open(filename, "w") as f:
        print_performance_metrics(nng, arch, f=f)