blob: c4b4cd9e4c1036e33a8b9ebcb5f0f87d47837ac8 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17
18# Description:
19# Writes out per-pass and summary performance statistics to CSV files.
20
21import numpy as np
22from .nn_graph import MemArea, TensorPurpose, PassPlacement
23from .npu_performance import PassCycles, MacCount, BandwidthDirection
24import csv
25from .numeric_util import round_up_to_int
26import sys
27
28
def write_summary_metrics_csv(nng, summary_filename, arch):
    """Write a header row and one summary data row for a network to a CSV file.

    Args:
        nng: Network graph object. Its ``name``, ``cycles``, ``subgraphs``,
            ``batch_size``, ``memory_used``, ``bits_per_element``,
            ``bandwidths`` and ``macs`` attributes are read.
        summary_filename: Path of the CSV file to create (overwritten if present).
        arch: Architecture description. Its ``accelerator_config``,
            ``system_config``, ``npu_clock``, ``sram_size``,
            ``memory_bandwidths_per_second`` and ``tensor_storage_mem_area``
            attributes are read.
    """
    mem_areas = MemArea.all()
    cycle_kinds = PassCycles.all()

    # The csv module requires files to be opened with newline=""; without it,
    # platforms that translate "\n" on write emit "\r\r\n" row terminators.
    with open(summary_filename, "w", newline="") as f:
        writer = csv.writer(f)

        # ---- header row -------------------------------------------------
        labels = [
            "experiment",
            "network",
        ]

        labels += (
            ["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
            + [area.identifier_name() + "_bandwidth" for area in mem_areas]
            + ["weights_storage_area", "feature_map_storage_area"]
        )

        labels += [
            "inferences_per_second",
            "batch_size",
            "inference_time",
            "passes_before_fusing",
            "passes_after_fusing",
        ]
        labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]

        for mem_area in mem_areas:
            prefix = mem_area.identifier_name()
            labels += [
                prefix + "_feature_map_read_bytes",
                prefix + "_feature_map_write_bytes",
                prefix + "_weight_read_bytes",
                prefix + "_weight_write_bytes",
                prefix + "_total_bytes",
            ]

        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
        labels += ["cycles_" + kind.identifier_name() for kind in cycle_kinds]

        writer.writerow(labels)

        # ---- data row (same column order as the header) -----------------
        data_items = [
            "default",
            nng.name,
        ]

        # NOTE(review): the header always contains the architecture columns and
        # arch.npu_clock is read unconditionally below, so a falsy `arch` is not
        # really supported despite this guard - confirm intended behaviour.
        if arch:
            data_items += (
                [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
                + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas]
                + [
                    arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
                    arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
                ]
            )

        # Inference time is total cycles divided by the clock; its reciprocal
        # is the inferences-per-second figure.
        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
        midpoint_fps = 1 / midpoint_inference_time

        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
        # Memory areas missing from memory_used are reported as 0.
        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]

        data_items += [
            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
        ]

        for mem_area in mem_areas:
            bws = nng.bandwidths[mem_area]
            total_bw = np.sum(bws)
            weight_bws = bws[TensorPurpose.Weights]
            fm_bws = bws[TensorPurpose.FeatureMap]
            data_items += [
                fm_bws[BandwidthDirection.Read],
                fm_bws[BandwidthDirection.Write],
                weight_bws[BandwidthDirection.Read],
                weight_bws[BandwidthDirection.Write],
                total_bw,
            ]

        nn_macs = nng.macs[MacCount.NeuralNetworkMacs]
        hw_macs = nng.macs[MacCount.HardwareMacs]
        data_items += [
            nn_macs,
            hw_macs,
            # Each MAC is counted as two operations; 1e12 scales to tera-ops.
            nn_macs * 2 * midpoint_fps / 1e12,
            hw_macs * 2 * midpoint_fps / 1e12,
        ]

        data_items += [nng.cycles[kind] for kind in cycle_kinds]

        writer.writerow(data_items)
121
122
def write_pass_metrics_csv(nng, pass_filename):
    """Write one CSV row of cycle, MAC and bandwidth statistics per pass.

    Passes are visited starting from ``nng.get_root_subgraph()``. A pass that
    consists of a single "NpuOp" operation is not written itself; instead the
    subgraph stored in that operation's ``attrs["subgraph"]`` is written in
    its place.

    Args:
        nng: Network graph object providing ``get_root_subgraph()``.
        pass_filename: Path of the CSV file to create (overwritten if present).
    """
    purpose_list = (
        ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
        ("weights", (TensorPurpose.Weights,)),
        ("feature_map", (TensorPurpose.FeatureMap,)),
    )

    direction_list = (
        ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
        ("read", (BandwidthDirection.Read,)),
        ("write", (BandwidthDirection.Write,)),
    )

    # One bandwidth column per (memory area, purpose group, direction group).
    # bandwidth_indices records which entries are summed for each column.
    bandwidth_names = []
    bandwidth_indices = []
    for mem_area in MemArea.all():
        for purpose, purpose_candidates in purpose_list:
            for direction, direction_candidates in direction_list:
                label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
                bandwidth_names.append(label)
                bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

    all_macs = MacCount.all()
    all_cycles = (
        PassCycles.Total,
        PassCycles.Dpu,
        PassCycles.ElementWise,
        PassCycles.Cpu,
        PassCycles.SramAccess,
        PassCycles.DramAccess,
        PassCycles.OnChipFlashAccess,
        PassCycles.OffChipFlashAccess,
    )

    # The csv module requires files to be opened with newline=""; without it,
    # platforms that translate "\n" on write emit "\r\r\n" row terminators.
    with open(pass_filename, "w", newline="") as f:
        writer = csv.writer(f)

        writer.writerow(
            [
                "name",
                "operators",
                "placement",
                "streaming_strategy",
                "block_config_height",
                "block_config_width",
                "block_config_input_channels",
                "block_config_output_channels",
                "n_blocks_in_pass",
            ]
            + ["cycles_" + v.identifier_name() for v in all_cycles]
            + [v.identifier_name() for v in all_macs]
            + bandwidth_names
            + ["sram_used"]
        )

        def write_subgraph(sg):
            """Write a row for every pass in `sg`, recursing into NpuOp calls."""
            for cps in sg.cascaded_passes:
                if cps.placement == PassPlacement.StartupInit:
                    continue  # skip the dummy init pass

                for ps in cps.passes:
                    if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp":
                        # just treat this as a call, unroll it
                        write_subgraph(ps.ops[0].attrs["subgraph"])
                        continue

                    stats = [ps.name, " ".join(op.type for op in ps.ops)]
                    stats += [ps.placement.name]
                    stats += [cps.strategy.name]
                    # Header has four block_config columns:
                    # height, width, input channels, output channels.
                    stats += list(ps.block_config)
                    stats += [ps.n_blocks]
                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
                    for area, purposes, directions in bandwidth_indices:
                        # Each entry is rounded up individually before summing.
                        stats.append(
                            sum(
                                round_up_to_int(ps.bandwidths[area, purpose, direction])
                                for purpose in purposes
                                for direction in directions
                            )
                        )
                    stats += [ps.sram_used]

                    writer.writerow(stats)

        write_subgraph(nng.get_root_subgraph())
206
207
def print_performance_metrics_for_strat(
    arch,
    name,
    cycles,
    macs,
    bandwidths,
    batch_size,
    memory_used,
    num_passes,
    num_cascaded_passes,
    n_operations=0,
    cpu_operations=None,
    bits_per_element=None,
    show_cpu_operations=False,
    f=sys.stdout,
):
    """Print a human-readable performance report to a file-like object.

    Only memory areas whose summed bandwidth is greater than zero appear in
    the per-memory-area sections of the report.

    Args:
        arch: Architecture description; ``accelerator_config``,
            ``system_config``, ``npu_clock`` and
            ``memory_bandwidths_per_second`` are read.
        name: Network name. When truthy a "Network summary for" heading is
            printed first.
        cycles: Cycle counts, indexed by ``PassCycles`` members.
        macs: MAC counts, indexed by ``MacCount`` members.
        bandwidths: Bandwidth figures, indexed by memory area, then tensor
            purpose, then direction.
        batch_size: Batch size used to derive the per-inference figures.
        memory_used: Container mapping memory area to bytes used; areas not
            present are skipped.
        num_passes: Number of passes before fusing.
        num_cascaded_passes: Number of passes after fusing.
        n_operations: Total operation count. The CPU fallback line is only
            printed when this is greater than zero.
        cpu_operations: Operations placed on the CPU. ``None`` (the default)
            is treated as an empty list.
        bits_per_element: Optional per-memory-area bits-per-element values,
            shown next to the flash memory usage lines.
        show_cpu_operations: When true, list every CPU operation with the
            shapes of its input and output tensors.
        f: Destination file object. Defaults to ``sys.stdout``.
    """
    # Avoid a shared mutable default; None stands in for "no CPU operations".
    if cpu_operations is None:
        cpu_operations = []

    def format_tens_list(lst):
        # Space-separated list of tensor shapes, e.g. "[1, 8, 8, 3] [3]".
        return " ".join(str(list(tens.shape)) for tens in lst)

    midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
    midpoint_fps = 1 / midpoint_inference_time

    # Keep only the memory areas that actually carry traffic.
    mem_area_labels = [
        (mem_area, mem_area.display_name()) for mem_area in MemArea.all() if np.sum(bandwidths[mem_area]) > 0
    ]

    if name:
        print("", file=f)
        print("Network summary for", name, file=f)
    print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f)
    print("System configuration %20s" % (arch.system_config,), file=f)
    print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f)
    for mem_area, label in mem_area_labels:
        print(
            "Design peak %-25s %12.2f GB/s"
            % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
            file=f,
        )

    print(file=f)
    for mem_area, label in mem_area_labels:
        if mem_area not in memory_used:
            continue

        aug_label = label + " used"

        # Bits-per-element is only reported for the two flash memory areas.
        extra = ""
        if mem_area in (MemArea.OnChipFlash, MemArea.OffChipFlash) and bits_per_element is not None:
            extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)

        print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)

    print(file=f)
    print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)

    n_cpu_operations = len(cpu_operations)
    if n_operations > 0:  # guard also avoids dividing by zero for the percentage
        print(
            "%d/%d (%4.1f %%) operations falling back to the CPU"
            % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
            file=f,
        )

    if show_cpu_operations:
        for op in cpu_operations:
            print(
                "CPU operation: %s, inputs %s, outputs %s"
                % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
                file=f,
            )

        print("", file=f)

    for mem_area, label in mem_area_labels:
        bws = bandwidths[mem_area]
        total_bw = np.sum(bws)
        weight_bws = bws[TensorPurpose.Weights]
        fm_bws = bws[TensorPurpose.FeatureMap]
        aug_label = label + " bandwidth"
        print(
            "Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
            file=f,
        )
        print(
            "Input %-25s %12.2f MB/batch"
            % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
            file=f,
        )
        print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
        print(
            "Output %-25s %12.2f MB/batch"
            % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
            file=f,
        )
        print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
        print(
            "Total %-25s per input %9.2f MB/inference (batch size %d)"
            % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
            file=f,
        )
        print(file=f)

    print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
    print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
    # Each MAC is counted as two operations; 1e12 scales to tera-ops.
    print(
        "Network Tops/s %12.2f Tops/s"
        % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
        file=f,
    )
    print(
        "Hardware Tops/s %12.2f Tops/s"
        % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
        file=f,
    )
    print(file=f)

    for kind in PassCycles.all():
        aug_label = kind.display_name() + " cycles"
        cyc = cycles[kind]
        print("%-30s %12d cycles/batch" % (aug_label, cyc,), file=f)
    print(file=f)

    print(
        "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)"
        % (midpoint_inference_time * 1000, midpoint_fps, batch_size),
        file=f,
    )
    print(file=f)
340
341
def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
    """Collect pass and operation counts from `nng` and print the performance report.

    Args:
        nng: Network graph object; its subgraphs, name, cycles, macs,
            bandwidths, batch_size, memory_used and bits_per_element are read.
        arch: Architecture description forwarded to the report printer.
        show_cpu_operations: When true, each CPU operation is listed.
        f: Destination file object. Defaults to ``sys.stdout``.

    Returns:
        Whatever ``print_performance_metrics_for_strat`` returns.
    """
    all_passes = [ps for sg in nng.subgraphs for ps in sg.passes]

    n_passes = len(all_passes)
    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
    n_operations = sum(len(ps.ops) for ps in all_passes)
    # Flatten in one pass; sum(list_of_lists, []) re-copies the accumulated
    # list on every addition and is therefore quadratic.
    cpu_operations = [op for ps in all_passes if ps.placement == PassPlacement.Cpu for op in ps.ops]

    return print_performance_metrics_for_strat(
        arch,
        nng.name,
        nng.cycles,
        nng.macs,
        nng.bandwidths,
        nng.batch_size,
        nng.memory_used,
        n_passes,
        n_cascaded_passes,
        n_operations,
        cpu_operations,
        nng.bits_per_element,
        show_cpu_operations,
        f,
    )
363
364
def write_human_friendly_metrics(nng, arch, filename):
    """Write the human-readable performance report for `nng` to `filename`.

    Args:
        nng: Network graph object to report on.
        arch: Architecture description used for the report.
        filename: Path of the text file to create (overwritten if present).
    """
    # Use a context manager so the file is flushed and closed even if
    # report generation raises.
    with open(filename, "w") as f:
        print_performance_metrics(nng, arch, f=f)