Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. |
| 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 16 | # Description: |
| 17 | # Writes out per-pass and summary performance statistics to CSV files. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 18 | import csv |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 19 | import sys |
| 20 | |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 21 | import numpy as np |
| 22 | |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 23 | from .nn_graph import PassPlacement |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 24 | from .npu_performance import BandwidthDirection |
| 25 | from .npu_performance import MacCount |
| 26 | from .npu_performance import PassCycles |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 27 | from .numeric_util import round_up_to_int |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 28 | from .operation import Op |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 29 | from .tensor import MemArea |
| 30 | from .tensor import TensorPurpose |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 31 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 32 | |
def mem_areas_to_report():
    """Return the list of memory areas that the statistics writers report on."""
    reported = []
    for candidate in MemArea.all():
        # Exclude SHRAM, as the SHRAM performance numbers only cover LUT usage
        if candidate != MemArea.Shram:
            reported.append(candidate)
    return reported
| 36 | |
| 37 | |
def write_summary_metrics_csv(nng, summary_filename, arch):
    """Write a two-row CSV file (header row + one data row) summarising the network.

    Args:
        nng: network graph; its name, cycles, macs, bandwidths, memory_used,
            bits_per_element, batch_size and subgraphs are read.
        summary_filename: path of the CSV file to create (overwritten if it exists).
        arch: architecture description; supplies the accelerator/system configuration,
            NPU clock, SRAM size, per-memory-area bandwidths and tensor storage areas.
    """
    # The csv module requires newline="" on the file it writes to; without it an
    # extra "\r" is emitted per row on platforms that translate "\n" on write.
    with open(summary_filename, "w", newline="") as f:
        writer = csv.writer(f)
        mem_areas = mem_areas_to_report()

        # ---- header row ----
        labels = ["experiment", "network"]

        labels += ["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
        labels += [area.identifier_name() + "_bandwidth" for area in mem_areas]
        labels += ["weights_storage_area", "feature_map_storage_area"]

        labels += [
            "inferences_per_second",
            "batch_size",
            "inference_time",
            "passes_before_fusing",
            "passes_after_fusing",
        ]
        labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]

        for mem_area in mem_areas:
            prefix = mem_area.identifier_name()
            labels += [
                prefix + "_feature_map_read_bytes",
                prefix + "_feature_map_write_bytes",
                prefix + "_weight_read_bytes",
                prefix + "_weight_write_bytes",
                prefix + "_total_bytes",
            ]

        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]

        writer.writerow(labels)

        # ---- data row (same column order as the header) ----
        data_items = ["default", nng.name]

        # NOTE(review): only the architecture columns are guarded here, while
        # arch.npu_clock is still read unconditionally just below, so a falsy arch
        # is not really supported - confirm the intended scope of this guard.
        if arch:
            data_items += [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
            data_items += [
                arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in mem_areas
            ]
            data_items += [
                arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
                arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
            ]

        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
        if midpoint_inference_time > 0:
            midpoint_fps = 1 / midpoint_inference_time
        else:
            # No cycles recorded: report NaN rather than dividing by zero
            midpoint_fps = np.nan

        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)

        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
        # Memory areas without a recorded usage are reported as 0
        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]

        data_items += [
            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
        ]

        for mem_area in mem_areas:
            bws = nng.bandwidths[mem_area]
            total_bw = np.sum(bws)
            weight_bws = bws[TensorPurpose.Weights]
            fm_bws = bws[TensorPurpose.FeatureMap]
            data_items += [
                fm_bws[BandwidthDirection.Read],
                fm_bws[BandwidthDirection.Write],
                weight_bws[BandwidthDirection.Read],
                weight_bws[BandwidthDirection.Write],
                total_bw,
            ]

        nn_macs = nng.macs[MacCount.NeuralNetworkMacs]
        hw_macs = nng.macs[MacCount.HardwareMacs]
        data_items += [
            nn_macs,
            hw_macs,
            # 2 operations per MAC, scaled to tera-operations per second
            nn_macs * 2 * midpoint_fps / 1e12,
            hw_macs * 2 * midpoint_fps / 1e12,
        ]

        data_items += [nng.cycles[kind] for kind in PassCycles.all()]

        writer.writerow(data_items)
| 134 | |
| 135 | |
def write_pass_metrics_csv(nng, pass_filename):
    """Write a CSV file with a header row and one row of statistics per pass.

    Starting from the root subgraph of ``nng``, every pass of every cascaded pass
    is written, except that cascaded passes placed as StartupInit are skipped and
    a pass consisting of a single CustomNpuOp is replaced by the passes of the
    subgraph that op refers to.

    Args:
        nng: network graph providing get_root_subgraph().
        pass_filename: path of the CSV file to create (overwritten if it exists).
    """
    # The csv module requires newline="" on the file it writes to; without it an
    # extra "\r" is emitted per row on platforms that translate "\n" on write.
    with open(pass_filename, "w", newline="") as f:
        writer = csv.writer(f)

        # Each entry: (column label fragment, tensor purposes summed for that column)
        purpose_list = (
            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
            ("weights", (TensorPurpose.Weights,)),
            ("feature_map", (TensorPurpose.FeatureMap,)),
        )

        # Each entry: (column label fragment, bandwidth directions summed for that column)
        direction_list = (
            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
            ("read", (BandwidthDirection.Read,)),
            ("write", (BandwidthDirection.Write,)),
        )

        # Column labels and, in the same order, the index sets needed to compute them
        bandwidth_names = []
        bandwidth_indices = []
        for mem_area in mem_areas_to_report():
            for purpose, purpose_candidates in purpose_list:
                for direction, direction_candidates in direction_list:
                    label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
                    bandwidth_names.append(label)
                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))

        all_macs = MacCount.all()
        all_cycles = (
            PassCycles.Total,
            PassCycles.Dpu,
            PassCycles.ElementWise,
            PassCycles.Cpu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
        )
        writer.writerow(
            [
                "name",
                "operators",
                "placement",
                "streaming_strategy",
                "block_config_height",
                "block_config_width",
                "block_config_input_channels",
                "block_config_output_channels",
                "n_blocks_in_pass",
            ]
            + ["cycles_" + v.identifier_name() for v in all_cycles]
            + [v.identifier_name() for v in all_macs]
            + bandwidth_names
            + ["sram_used"]
        )

        def write_subgraph(sg):
            """Write one row per pass of sg, recursing into subgraphs of custom NPU ops."""
            for cps in sg.cascaded_passes:
                if cps.placement == PassPlacement.StartupInit:
                    continue  # skip the dummy init pass

                for ps in cps.passes:
                    if len(ps.ops) == 1 and ps.ops[0].type == Op.CustomNpuOp:
                        # just treat this as a call, unroll it
                        write_subgraph(ps.ops[0].attrs["subgraph"])
                        continue

                    stats = [ps.name, " ".join(op.type.name for op in ps.ops)]
                    stats += [ps.placement.name]
                    stats += [cps.strategy.name]
                    stats += list(ps.block_config)
                    stats += [ps.n_blocks]
                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
                    for mem_area, purposes, directions in bandwidth_indices:
                        # Every contribution is rounded up on its own before being summed
                        stats.append(
                            sum(
                                round_up_to_int(ps.bandwidths[mem_area, purpose, direction])
                                for purpose in purposes
                                for direction in directions
                            )
                        )
                    # Passes that carry no sram_used attribute are reported as 0
                    stats.append(getattr(ps, "sram_used", 0))

                    writer.writerow(stats)

        write_subgraph(nng.get_root_subgraph())
| 222 | |
| 223 | |
def print_performance_metrics_for_strat(
    arch,
    name,
    cycles,
    macs,
    bandwidths,
    batch_size,
    memory_used,
    num_passes,
    num_cascaded_passes,
    n_operations=0,
    cpu_operations=[],
    bits_per_element=None,
    show_cpu_operations=False,
    f=sys.stdout,
):
    """Print a human-readable performance report to the file object ``f``.

    Args:
        arch: architecture description; accelerator_config, system_config, npu_clock
            and memory_bandwidths_per_second are read.
        name: network name; when truthy a "Network summary for" heading is printed.
        cycles: cycle counts, indexed by PassCycles.
        macs: MAC counts, indexed by MacCount.
        bandwidths: bandwidth figures, indexed by memory area, then TensorPurpose,
            then BandwidthDirection.
        batch_size: batch size used to derive the per-inference bandwidth figure.
        memory_used: mapping from memory area to the amount used (printed / 1024 as KiB).
        num_passes: number of passes before fusing.
        num_cascaded_passes: number of passes after fusing.
        n_operations: total operation count; the CPU fallback line is only printed
            when this is greater than zero.
        cpu_operations: operations that fall back to the CPU (only read, never modified).
        bits_per_element: optional mapping from flash memory area to bits per element.
        show_cpu_operations: when true, list each CPU operation with its tensor shapes.
        f: file object that receives the report.
    """

    def emit(*parts):
        # All report output is routed to the caller-supplied file object
        print(*parts, file=f)

    def shapes_of(tensors):
        return " ".join(str(list(tens.shape)) for tens in tensors)

    candidate_areas = [(v, v.display_name()) for v in mem_areas_to_report()]

    midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
    # Avoid dividing by zero when no cycles were recorded
    midpoint_fps = 1 / midpoint_inference_time if midpoint_inference_time > 0 else np.nan

    # Only memory areas that actually see traffic appear in the report
    mem_area_labels = [(area, text) for area, text in candidate_areas if np.sum(bandwidths[area]) > 0]

    if name:
        emit("")
        emit("Network summary for", name)
    emit("Accelerator configuration %20s" % (arch.accelerator_config,))
    emit("System configuration %20s" % (arch.system_config,))
    emit("Accelerator clock %12d MHz" % (arch.npu_clock / 1e6,))
    for area, text in mem_area_labels:
        peak_bandwidth = arch.memory_bandwidths_per_second[area] / 1000.0 / 1000 / 1000
        emit("Design peak %-25s %12.2f GB/s" % (text + " bandwidth", peak_bandwidth,))

    emit()
    for area, text in mem_area_labels:
        if area in memory_used:
            flash_note = ""
            if area in (MemArea.OnChipFlash, MemArea.OffChipFlash) and bits_per_element is not None:
                flash_note = " (%.2f bits per element)" % (bits_per_element[area],)

            emit("Total %-25s %12.2f KiB%s" % (text + " used", memory_used[area] / 1024.0, flash_note))

    emit()
    emit("%d passes fused into %d" % (num_passes, num_cascaded_passes))

    n_cpu_operations = len(cpu_operations)
    if n_operations > 0:
        cpu_share = n_cpu_operations / n_operations * 100
        emit("%d/%d (%4.1f %%) operations falling back to the CPU" % (n_cpu_operations, n_operations, cpu_share))

    if show_cpu_operations:
        for op in cpu_operations:
            emit(
                "CPU operation: %s, inputs %s, outputs %s" % (op.type, shapes_of(op.inputs), shapes_of(op.outputs))
            )

        emit("")

    for area, text in mem_area_labels:
        area_bws = bandwidths[area]
        total_bw = np.sum(area_bws)
        weight_bws = area_bws[TensorPurpose.Weights]
        fm_bws = area_bws[TensorPurpose.FeatureMap]
        aug_label = text + " bandwidth"

        emit("Average %-25s %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,))
        emit("Input %-25s %12.2f MB/batch" % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,))
        emit("Weight %-25s %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,))
        emit(
            "Output %-25s %12.2f MB/batch" % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,)
        )
        emit("Total %-25s %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,))
        emit(
            "Total %-25s per input %9.2f MB/inference (batch size %d)"
            % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size)
        )
        emit()

    emit("Neural network macs %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],))
    emit("Hardware macs %12d MACs/batch" % (macs[MacCount.HardwareMacs],))
    # Two operations per MAC, scaled to tera-operations per second
    emit("Network Tops/s %12.2f Tops/s" % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12))
    emit("Hardware Tops/s %12.2f Tops/s" % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12))
    emit()

    for kind in PassCycles.all():
        emit("%-30s %12d cycles/batch" % (kind.display_name() + " cycles", cycles[kind],))
    emit()

    emit(
        "Batch Inference time %7.2f ms, %7.2f inferences/s (batch size %d)"
        % (midpoint_inference_time * 1000, midpoint_fps, batch_size)
    )
    emit()
| 359 | |
| 360 | |
def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
    """Collect network-wide totals from ``nng`` and print the performance report.

    Args:
        nng: network graph; its subgraphs, name, cycles, macs, bandwidths,
            batch_size, memory_used and bits_per_element are read.
        arch: architecture description forwarded to the report printer.
        show_cpu_operations: when true, each operation placed on the CPU is listed.
        f: file object that receives the report.

    Returns:
        Whatever print_performance_metrics_for_strat returns.
    """
    n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
    n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
    # Flatten the operations of all CPU-placed passes in a single pass over the graph;
    # concatenating lists with sum(..., []) copies the accumulated list at every step.
    cpu_operations = [
        op
        for sg in nng.subgraphs
        for ps in sg.passes
        if ps.placement == PassPlacement.Cpu
        for op in ps.ops
    ]
    return print_performance_metrics_for_strat(
        arch,
        nng.name,
        nng.cycles,
        nng.macs,
        nng.bandwidths,
        nng.batch_size,
        nng.memory_used,
        n_passes,
        n_cascaded_passes,
        n_operations=n_operations,
        cpu_operations=cpu_operations,
        bits_per_element=nng.bits_per_element,
        show_cpu_operations=show_cpu_operations,
        f=f,
    )
| 382 | |
| 383 | |
def write_human_friendly_metrics(nng, arch, filename):
    """Write the human-readable performance report for ``nng`` to ``filename``.

    Args:
        nng: network graph to report on.
        arch: architecture description forwarded to print_performance_metrics.
        filename: path of the text file to create (overwritten if it exists).
    """
    # The context manager closes (and thereby flushes) the file even if printing fails;
    # previously the handle was left open.
    with open(filename, "w") as f:
        print_performance_metrics(nng, arch, f=f)