blob: 3fd29d127bc9702027d1f115f64c7cbc359a675f [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17
18# Description:
19# Writes out per-pass and summary performance statistics to CSV files.
20
Tim Hall79d07d22020-04-27 18:20:16 +010021import csv
Tim Hall79d07d22020-04-27 18:20:16 +010022import sys
23
Diego Russoea6111a2020-04-14 18:41:58 +010024import numpy as np
25
26from .tensor import MemArea, TensorPurpose
27from .nn_graph import PassPlacement
28from .npu_performance import PassCycles, MacCount, BandwidthDirection
29from .numeric_util import round_up_to_int
30
Tim Hall79d07d22020-04-27 18:20:16 +010031
def write_summary_metrics_csv(nng, summary_filename, arch):
    """Write a one-line, whole-network performance summary to a CSV file.

    The file contains exactly two rows: a header row of column labels and a
    single data row built from ``nng`` and ``arch``.

    Args:
        nng: Network graph object. The attributes ``name``, ``cycles``,
            ``macs``, ``bandwidths``, ``batch_size``, ``memory_used``,
            ``bits_per_element`` and ``subgraphs`` are read.
        summary_filename: Path of the CSV file to create or overwrite.
        arch: Architecture description. The attributes ``accelerator_config``,
            ``system_config``, ``npu_clock``, ``sram_size``,
            ``memory_bandwidths_per_second`` and ``tensor_storage_mem_area``
            are read.
    """
    # The csv module writes its own line terminators; the file must be opened
    # with newline="" so the text layer does not translate them a second time
    # (which would add an extra "\r" per row on platforms that use "\r\n").
    with open(summary_filename, "w", newline="") as f:
        writer = csv.writer(f)

        mem_areas = tuple(MemArea.all())
        cycle_kinds = tuple(PassCycles.all())

        # ---- header row -------------------------------------------------
        labels = ["experiment", "network"]
        labels += ["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
        labels += [area.identifier_name() + "_bandwidth" for area in mem_areas]
        labels += ["weights_storage_area", "feature_map_storage_area"]
        labels += [
            "inferences_per_second",
            "batch_size",
            "inference_time",
            "passes_before_fusing",
            "passes_after_fusing",
        ]
        labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]

        for mem_area in mem_areas:
            prefix = mem_area.identifier_name()
            labels += [
                prefix + "_feature_map_read_bytes",
                prefix + "_feature_map_write_bytes",
                prefix + "_weight_read_bytes",
                prefix + "_weight_write_bytes",
                prefix + "_total_bytes",
            ]

        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
        labels += ["cycles_" + kind.identifier_name() for kind in cycle_kinds]

        writer.writerow(labels)

        # ---- data row ---------------------------------------------------
        data_items = ["default", nng.name]

        # NOTE(review): this guard does not make `arch` optional. If it is
        # falsy the architecture columns are omitted (shifting every later
        # value out of line with the header) and `arch.npu_clock` is still
        # dereferenced below. Confirm whether a missing arch should be
        # supported at all.
        if arch:
            data_items += [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
            # Bytes per second scaled down by 1000 three times.
            data_items += [
                arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas
            ]
            data_items += [
                arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
                arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
            ]

        # Total cycles divided by the clock rate; its reciprocal is the
        # "inferences_per_second" column.
        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
        midpoint_fps = 1 / midpoint_inference_time

        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
        # Memory areas with no recorded usage are reported as 0.
        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]
        data_items += [
            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
        ]

        for mem_area in mem_areas:
            bws = nng.bandwidths[mem_area]
            weight_bws = bws[TensorPurpose.Weights]
            fm_bws = bws[TensorPurpose.FeatureMap]
            data_items += [
                fm_bws[BandwidthDirection.Read],
                fm_bws[BandwidthDirection.Write],
                weight_bws[BandwidthDirection.Read],
                weight_bws[BandwidthDirection.Write],
                np.sum(bws),  # total over every purpose and direction
            ]

        nn_macs = nng.macs[MacCount.NeuralNetworkMacs]
        hw_macs = nng.macs[MacCount.HardwareMacs]
        data_items += [
            nn_macs,
            hw_macs,
            # presumably each MAC counts as two operations; 1e12 scales to tera-ops
            nn_macs * 2 * midpoint_fps / 1e12,
            hw_macs * 2 * midpoint_fps / 1e12,
        ]

        data_items += [nng.cycles[kind] for kind in cycle_kinds]

        writer.writerow(data_items)
124
125
def write_pass_metrics_csv(nng, pass_filename):
    """Write per-pass performance statistics to a CSV file.

    A header row is written first, followed by one row per pass reached from
    the root subgraph of ``nng``. Cascaded passes whose placement is
    ``PassPlacement.StartupInit`` are skipped. A pass consisting of a single
    op of type ``"NpuOp"`` is not written itself; instead the subgraph stored
    in that op's ``attrs["subgraph"]`` is expanded in its place.

    Args:
        nng: Network graph object; ``get_root_subgraph()`` is called on it.
        pass_filename: Path of the CSV file to create or overwrite.
    """
    # The csv module writes its own line terminators; the file must be opened
    # with newline="" so the text layer does not translate them a second time
    # (which would add an extra "\r" per row on platforms that use "\r\n").
    with open(pass_filename, "w", newline="") as f:
        writer = csv.writer(f)

        # Each entry pairs a column-name fragment with the set of indices
        # that are summed to produce that column.
        purpose_list = (
            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
            ("weights", (TensorPurpose.Weights,)),
            ("feature_map", (TensorPurpose.FeatureMap,)),
        )
        direction_list = (
            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
            ("read", (BandwidthDirection.Read,)),
            ("write", (BandwidthDirection.Write,)),
        )

        bandwidth_names = []
        bandwidth_indices = []
        for mem_area in MemArea.all():
            for purpose, purpose_candidates in purpose_list:
                for direction, direction_candidates in direction_list:
                    bandwidth_names.append("bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction))
                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

        all_macs = MacCount.all()
        all_cycles = (
            PassCycles.Total,
            PassCycles.Dpu,
            PassCycles.ElementWise,
            PassCycles.Cpu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
        )

        writer.writerow(
            [
                "name",
                "operators",
                "placement",
                "streaming_strategy",
                "block_config_height",
                "block_config_width",
                "block_config_input_channels",
                "block_config_output_channels",
                "n_blocks_in_pass",
            ]
            + ["cycles_" + v.identifier_name() for v in all_cycles]
            + [v.identifier_name() for v in all_macs]
            + bandwidth_names
            + ["sram_used"]
        )

        def write_subgraph(sg):
            """Write one row per pass of ``sg``, recursing into NPU subgraphs."""
            for cps in sg.cascaded_passes:
                if cps.placement == PassPlacement.StartupInit:
                    continue  # skip the dummy init pass

                for ps in cps.passes:
                    if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp":
                        # just treat this as a call, unroll it
                        write_subgraph(ps.ops[0].attrs["subgraph"])
                        continue

                    stats = [ps.name, " ".join(op.type for op in ps.ops)]
                    stats.append(ps.placement.name)
                    stats.append(cps.strategy.name)
                    # assumes block_config has four entries, matching the four
                    # block_config_* header columns -- TODO confirm
                    stats += list(ps.block_config)
                    stats.append(ps.n_blocks)
                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]

                    # Every element is rounded up individually before it is
                    # added, so a "total" column is the sum of rounded parts.
                    for mem_area, purposes, directions in bandwidth_indices:
                        stats.append(
                            sum(
                                round_up_to_int(ps.bandwidths[mem_area, purpose, direction])
                                for purpose in purposes
                                for direction in directions
                            )
                        )

                    stats.append(ps.sram_used)
                    writer.writerow(stats)

        write_subgraph(nng.get_root_subgraph())
209
210
def print_performance_metrics_for_strat(
    arch,
    name,
    cycles,
    macs,
    bandwidths,
    batch_size,
    memory_used,
    num_passes,
    num_cascaded_passes,
    n_operations=0,
    cpu_operations=None,
    bits_per_element=None,
    show_cpu_operations=False,
    f=sys.stdout,
):
    """Print a human-readable performance report to ``f``.

    Only memory areas whose summed bandwidth is greater than zero are
    reported.

    Args:
        arch: Architecture description; ``accelerator_config``,
            ``system_config``, ``npu_clock`` and
            ``memory_bandwidths_per_second`` are read.
        name: Network name. When truthy, a "Network summary for" heading is
            printed first.
        cycles: Cycle counts, indexed by ``PassCycles`` members.
        macs: MAC counts, indexed by ``MacCount`` members.
        bandwidths: Bandwidth figures, indexed by memory area, then
            ``TensorPurpose``, then ``BandwidthDirection``.
        batch_size: Batch size used to derive the per-inference figure.
        memory_used: Container of bytes used, keyed by memory area; areas not
            present in it are skipped.
        num_passes: Number of passes before fusing.
        num_cascaded_passes: Number of passes after fusing.
        n_operations: Total operation count; the CPU fallback line is only
            printed when this is greater than zero.
        cpu_operations: Operations placed on the CPU. ``None`` (the default)
            is treated as an empty list.
        bits_per_element: Optional per-memory-area bits-per-element values,
            appended to the usage line of the two flash areas.
        show_cpu_operations: When true, list every CPU operation with its
            input and output tensor shapes.
        f: Text stream to print to.
    """
    # Avoid a shared mutable default: normalise None to a fresh empty list.
    if cpu_operations is None:
        cpu_operations = []

    def format_tens_list(lst):
        """Render the shape of every tensor in ``lst``, space separated."""
        return " ".join(str(list(tens.shape)) for tens in lst)

    midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
    midpoint_fps = 1 / midpoint_inference_time

    # Restrict the report to memory areas that carry any traffic.
    mem_area_labels = [
        (mem_area, mem_area.display_name()) for mem_area in MemArea.all() if np.sum(bandwidths[mem_area]) > 0
    ]

    if name:
        print("", file=f)
        print("Network summary for", name, file=f)
    print("Accelerator configuration %20s" % (arch.accelerator_config,), file=f)
    print("System configuration %20s" % (arch.system_config,), file=f)
    print("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,), file=f)
    for mem_area, label in mem_area_labels:
        print(
            "Design peak %-25s %12.2f GB/s"
            % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
            file=f,
        )

    print(file=f)
    for mem_area, label in mem_area_labels:
        if mem_area not in memory_used:
            continue

        aug_label = label + " used"

        extra = ""
        is_flash = mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash
        if is_flash and bits_per_element is not None:
            extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)

        print("Total %-25s %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)

    print(file=f)
    print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)

    n_cpu_operations = len(cpu_operations)
    if n_operations > 0:
        print(
            "%d/%d (%4.1f %%) operations falling back to the CPU"
            % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
            file=f,
        )

    if show_cpu_operations:
        for op in cpu_operations:
            print(
                "CPU operation: %s, inputs %s, outputs %s"
                % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
                file=f,
            )

        print("", file=f)

    for mem_area, label in mem_area_labels:
        bws = bandwidths[mem_area]
        total_bw = np.sum(bws)
        weight_bws = bws[TensorPurpose.Weights]
        fm_bws = bws[TensorPurpose.FeatureMap]
        aug_label = label + " bandwidth"
        print(
            "Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
            file=f,
        )
        print(
            "Input %-25s %12.2f MB/batch"
            % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
            file=f,
        )
        print("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
        print(
            "Output %-25s %12.2f MB/batch"
            % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
            file=f,
        )
        print("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
        print(
            "Total %-25s per input %9.2f MB/inference (batch size %d)"
            % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
            file=f,
        )
        print(file=f)

    print("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
    print("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
    print(
        "Network Tops/s %12.2f Tops/s"
        % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
        file=f,
    )
    print(
        "Hardware Tops/s %12.2f Tops/s"
        % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
        file=f,
    )
    print(file=f)

    for kind in PassCycles.all():
        aug_label = kind.display_name() + " cycles"
        print("%-30s %12d cycles/batch" % (aug_label, cycles[kind],), file=f)
    print(file=f)

    print(
        "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)"
        % (midpoint_inference_time * 1000, midpoint_fps, batch_size),
        file=f,
    )
    print(file=f)
343
344
def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
    """Print the performance report for a whole network graph.

    Gathers pass and operation counts from every subgraph of ``nng`` and
    forwards them, together with the statistics stored on ``nng``, to
    ``print_performance_metrics_for_strat``.

    Args:
        nng: Network graph object; ``subgraphs``, ``name``, ``cycles``,
            ``macs``, ``bandwidths``, ``batch_size``, ``memory_used`` and
            ``bits_per_element`` are read.
        arch: Architecture description, passed through unchanged.
        show_cpu_operations: When true, each CPU operation is listed.
        f: Text stream to print to.

    Returns:
        Whatever ``print_performance_metrics_for_strat`` returns.
    """
    all_passes = [ps for sg in nng.subgraphs for ps in sg.passes]

    n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
    n_operations = sum(len(ps.ops) for ps in all_passes)
    # Flatten in one linear pass; sum(lists, []) would re-copy the growing
    # accumulator for every pass.
    cpu_operations = [op for ps in all_passes if ps.placement == PassPlacement.Cpu for op in ps.ops]

    return print_performance_metrics_for_strat(
        arch,
        nng.name,
        nng.cycles,
        nng.macs,
        nng.bandwidths,
        nng.batch_size,
        nng.memory_used,
        n_passes,
        n_cascaded_passes,
        n_operations,
        cpu_operations,
        nng.bits_per_element,
        show_cpu_operations,
        f,
    )
366
367
def write_human_friendly_metrics(nng, arch, filename):
    """Write the human-readable performance report for ``nng`` to a file.

    Args:
        nng: Network graph object, passed to ``print_performance_metrics``.
        arch: Architecture description, passed to
            ``print_performance_metrics``.
        filename: Path of the text file to create or overwrite.
    """
    # The context manager guarantees the report is flushed and the handle is
    # closed, including when printing raises.
    with open(filename, "w") as f:
        print_performance_metrics(nng, arch, f=f)