Add Vela codebase

 - Added modules ethosu.vela and ethosu.mlw_codec.
 - Added README and various configuration files.

Change-Id: I3690f8c8f5966306ecddaeb2793c30ca9c6e2eee
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
new file mode 100644
index 0000000..c4b4cd9
--- /dev/null
+++ b/ethosu/vela/stats_writer.py
@@ -0,0 +1,367 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Writes out per-pass and summary performance statistics to CSV files.
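+#
+# Illustrative usage (a sketch only; assumes a compiled graph `nng` and an
+# architecture description `arch` are available from the rest of the compiler,
+# and the output file names below are placeholders):
+#
+#     write_summary_metrics_csv(nng, "network_summary.csv", arch)
+#     write_pass_metrics_csv(nng, "per_pass.csv")
+#     write_human_friendly_metrics(nng, arch, "report.txt")
+#     print_performance_metrics(nng, arch, show_cpu_operations=True)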
+
+import csv
+import sys
+
+import numpy as np
+
+from .nn_graph import MemArea, TensorPurpose, PassPlacement
+from .npu_performance import PassCycles, MacCount, BandwidthDirection
+from .numeric_util import round_up_to_int
+
+
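+# Writes a header row plus a single data row of network-level metrics: accelerator and
+# system configuration, memory usage, bandwidth, MAC counts and cycle counts.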
+def write_summary_metrics_csv(nng, summary_filename, arch):
+    with open(summary_filename, "w") as f:
+        writer = csv.writer(f)
+
+        labels = [
+            "experiment",
+            "network",
+        ]
+
+        labels += (
+            ["accelerator_configuration", "system_config", "npu_clock", "sram_size"]
+            + [area.identifier_name() + "_bandwidth" for area in MemArea.all()]
+            + ["weights_storage_area", "feature_map_storage_area"]
+        )
+
+        labels += [
+            "inferences_per_second",
+            "batch_size",
+            "inference_time",
+            "passes_before_fusing",
+            "passes_after_fusing",
+        ]
+        labels += [area.identifier_name() + "_memory_used" for area in MemArea.all()]
+        labels += ["on_chip_flash_bits_per_element", "off_chip_flash_bits_per_element"]
+
+        for mem_area in MemArea.all():
+            labels += [
+                mem_area.identifier_name() + "_feature_map_read_bytes",
+                mem_area.identifier_name() + "_feature_map_write_bytes",
+                mem_area.identifier_name() + "_weight_read_bytes",
+                mem_area.identifier_name() + "_weight_write_bytes",
+                mem_area.identifier_name() + "_total_bytes",
+            ]
+
+        labels += ["nn_macs", "hardware_macs", "nn_tops", "hardware_tops"]
+
+        labels += ["cycles_" + kind.identifier_name() for kind in PassCycles.all()]
+
+        writer.writerow(labels)
+
+        data_items = [
+            "default",
+            nng.name,
+        ]
+
+        if arch:
+            data_items += (
+                [arch.accelerator_config, arch.system_config, arch.npu_clock, arch.sram_size / 1024]
+                + [arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000 for mem_area in MemArea.all()]
+                + [
+                    arch.tensor_storage_mem_area[TensorPurpose.Weights].display_name(),
+                    arch.tensor_storage_mem_area[TensorPurpose.FeatureMap].display_name(),
+                ]
+            )
+
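+        # Estimated inference time (seconds) and inference rate, derived from the total
+        # NPU cycle count and the NPU clock frequency.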
+        midpoint_inference_time = nng.cycles[PassCycles.Total] / arch.npu_clock
+        midpoint_fps = 1 / midpoint_inference_time
+
+        n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
+        n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
+
+        data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
+        data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in MemArea.all()]
+
+        data_items += [
+            nng.bits_per_element.get(MemArea.OnChipFlash, 0.0),
+            nng.bits_per_element.get(MemArea.OffChipFlash, 0.0),
+        ]
+
+        for mem_area in MemArea.all():
+            bws = nng.bandwidths[mem_area]
+            total_bw = np.sum(bws)
+            weight_bws = bws[TensorPurpose.Weights]
+            fm_bws = bws[TensorPurpose.FeatureMap]
+            data_items += [
+                fm_bws[BandwidthDirection.Read],
+                fm_bws[BandwidthDirection.Write],
+                weight_bws[BandwidthDirection.Read],
+                weight_bws[BandwidthDirection.Write],
+                total_bw,
+            ]
+
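+        # A MAC counts as two operations (multiply and accumulate), hence the factor of 2
+        # when converting MACs/batch to Tops/s.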
+        data_items += [
+            nng.macs[MacCount.NeuralNetworkMacs],
+            nng.macs[MacCount.HardwareMacs],
+            nng.macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12,
+            nng.macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12,
+        ]
+
+        data_items += [nng.cycles[kind] for kind in PassCycles.all()]
+
+        writer.writerow(data_items)
+
+
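+# Writes one CSV row per pass: operators, placement, cascading strategy, block configuration,
+# cycle and MAC counts, per-memory-area bandwidth and SRAM usage.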
+def write_pass_metrics_csv(nng, pass_filename):
+    with open(pass_filename, "w") as f:
+        writer = csv.writer(f)
+
+        purpose_list = (
+            ("total", (TensorPurpose.Weights, TensorPurpose.FeatureMap)),
+            ("weights", (TensorPurpose.Weights,)),
+            ("feature_map", (TensorPurpose.FeatureMap,)),
+        )
+
+        direction_list = (
+            ("total", (BandwidthDirection.Read, BandwidthDirection.Write)),
+            ("read", (BandwidthDirection.Read,)),
+            ("write", (BandwidthDirection.Write,)),
+        )
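+        # Build one bandwidth column per (memory area, tensor purpose, transfer direction)
+        # combination, including the aggregated "total" entries.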
+        bandwidth_names = []
+        bandwidth_indices = []
+        for mem_area in MemArea.all():
+            for purpose, purpose_candidates in purpose_list:
+                for direction, direction_candidates in direction_list:
+                    label = "bytes_%s_%s_%s" % (mem_area.identifier_name(), purpose, direction)
+                    bandwidth_names.append(label)
+                    bandwidth_indices.append((mem_area, purpose_candidates, direction_candidates))
+
+        all_macs = MacCount.all()
+        all_cycles = (
+            PassCycles.Total,
+            PassCycles.Dpu,
+            PassCycles.ElementWise,
+            PassCycles.Cpu,
+            PassCycles.SramAccess,
+            PassCycles.DramAccess,
+            PassCycles.OnChipFlashAccess,
+            PassCycles.OffChipFlashAccess,
+        )
+        writer.writerow(
+            [
+                "name",
+                "operators",
+                "placement",
+                "streaming_strategy",
+                "block_config_height",
+                "block_config_width",
+                "block_config_input_channels",
+                "block_config_output_channels",
+                "n_blocks_in_pass",
+            ]
+            + ["cycles_" + v.identifier_name() for v in all_cycles]
+            + [v.identifier_name() for v in all_macs]
+            + bandwidth_names
+            + ["sram_used"]
+        )
+
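+        # Recursively writes one row per pass; a pass containing only an NpuOp is unrolled
+        # into the subgraph it calls.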
+        def write_subgraph(sg):
+            for cps in sg.cascaded_passes:
+                if cps.placement == PassPlacement.StartupInit:
+                    continue  # skip the dummy init pass
+
+                for ps in cps.passes:
+                    if len(ps.ops) == 1 and ps.ops[0].type == "NpuOp":
+                        # just treat this as a call, unroll it
+                        write_subgraph(ps.ops[0].attrs["subgraph"])
+                        continue
+                    stats = [ps.name, " ".join(op.type for op in ps.ops)]
+                    stats += [ps.placement.name]
+                    stats += [cps.strategy.name]
+                    stats += list(ps.block_config)
+                    stats += [ps.n_blocks]
+                    stats += [round_up_to_int(ps.cycles[v]) for v in all_cycles]
+                    stats += [round_up_to_int(ps.macs[v]) for v in all_macs]
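+                    # Sum the bandwidth over the selected purposes and directions for this
+                    # memory area column.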
+                    for indices in bandwidth_indices:
+                        res = 0
+                        i = indices[0]
+                        for j in indices[1]:
+                            for k in indices[2]:
+                                res += round_up_to_int(ps.bandwidths[i, j, k])
+                        stats.append(res)
+                    stats += [ps.sram_used]
+
+                    writer.writerow(stats)
+
+        write_subgraph(nng.get_root_subgraph())
+
+
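+# Prints a human-readable performance summary to the file object f (stdout by default).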
+def print_performance_metrics_for_strat(
+    arch,
+    name,
+    cycles,
+    macs,
+    bandwidths,
+    batch_size,
+    memory_used,
+    num_passes,
+    num_cascaded_passes,
+    n_operations=0,
+    cpu_operations=[],
+    bits_per_element=None,
+    show_cpu_operations=False,
+    f=sys.stdout,
+):
+    orig_mem_areas_labels = [(v, v.display_name()) for v in MemArea.all()]
+
+    midpoint_inference_time = cycles[PassCycles.Total] / arch.npu_clock
+    midpoint_fps = 1 / midpoint_inference_time
+
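+    # Only report memory areas that actually see any traffic.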
+    mem_area_labels = [
+        (mem_area, label) for mem_area, label in orig_mem_areas_labels if np.sum(bandwidths[mem_area]) > 0
+    ]
+
+    if name:
+        print("", file=f)
+        print("Network summary for", name, file=f)
+    print("Accelerator configuration        %20s" % (arch.accelerator_config,), file=f)
+    print("System configuration             %20s" % (arch.system_config,), file=f)
+    print("Accelerator clock                        %12d MHz" % (arch.npu_clock / 1e6,), file=f)
+    for mem_area, label in mem_area_labels:
+        print(
+            "Design peak %-25s    %12.2f GB/s"
+            % (label + " bandwidth", arch.memory_bandwidths_per_second[mem_area] / 1000.0 / 1000 / 1000,),
+            file=f,
+        )
+
+    print(file=f)
+    for mem_area, label in mem_area_labels:
+        if mem_area not in memory_used:
+            continue
+
+        aug_label = label + " used"
+
+        extra = ""
+        if (mem_area == MemArea.OnChipFlash or mem_area == MemArea.OffChipFlash) and bits_per_element is not None:
+            extra = " (%.2f bits per element)" % (bits_per_element[mem_area],)
+
+        print("Total %-25s          %12.2f KiB%s" % (aug_label, memory_used[mem_area] / 1024.0, extra), file=f)
+
+    print(file=f)
+    print("%d passes fused into %d" % (num_passes, num_cascaded_passes), file=f)
+
+    n_cpu_operations = len(cpu_operations)
+    if n_operations > 0:
+        print(
+            "%d/%d (%4.1f %%) operations falling back to the CPU"
+            % (n_cpu_operations, n_operations, n_cpu_operations / n_operations * 100),
+            file=f,
+        )
+
+    if show_cpu_operations:
+        for op in cpu_operations:
+
+            def format_tens_list(lst):
+                return " ".join(str(list(tens.shape)) for tens in lst)
+
+            print(
+                "CPU operation: %s, inputs %s, outputs %s"
+                % (op.type, format_tens_list(op.inputs), format_tens_list(op.outputs)),
+                file=f,
+            )
+
+        print("", file=f)
+
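+    # Per-memory-area bandwidth summary: average rate plus data moved per batch,
+    # split by direction and tensor purpose.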
+    for mem_area, label in mem_area_labels:
+        bws = bandwidths[mem_area]
+        total_bw = np.sum(bws)
+        weight_bws = bws[TensorPurpose.Weights]
+        fm_bws = bws[TensorPurpose.FeatureMap]
+        aug_label = label + " bandwidth"
+        print(
+            "Average %-25s        %12.2f GB/s" % (aug_label, total_bw * midpoint_fps / 1000.0 / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print(
+            "Input   %-25s        %12.2f MB/batch"
+            % (aug_label, np.sum(fm_bws[BandwidthDirection.Read]) / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print("Weight  %-25s        %12.2f MB/batch" % (aug_label, np.sum(weight_bws) / 1000.0 / 1000.0,), file=f)
+        print(
+            "Output  %-25s        %12.2f MB/batch"
+            % (aug_label, np.sum(fm_bws[BandwidthDirection.Write]) / 1000.0 / 1000.0,),
+            file=f,
+        )
+        print("Total   %-25s        %12.2f MB/batch" % (aug_label, total_bw / 1000.0 / 1000.0,), file=f)
+        print(
+            "Total   %-25s per input %9.2f MB/inference (batch size %d)"
+            % (aug_label, total_bw / 1000.0 / 1000.0 / batch_size, batch_size),
+            file=f,
+        )
+        print(file=f)
+
+    print("Neural network macs                      %12d MACs/batch" % (macs[MacCount.NeuralNetworkMacs],), file=f)
+    print("Hardware macs                            %12d MACs/batch" % (macs[MacCount.HardwareMacs],), file=f)
+    print(
+        "Network Tops/s                           %12.2f Tops/s"
+        % (macs[MacCount.NeuralNetworkMacs] * 2 * midpoint_fps / 1e12),
+        file=f,
+    )
+    print(
+        "Hardware Tops/s                          %12.2f Tops/s"
+        % (macs[MacCount.HardwareMacs] * 2 * midpoint_fps / 1e12),
+        file=f,
+    )
+    print(file=f)
+
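+    # Cycle counts broken down per pass-cycle category.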
+    for kind in PassCycles.all():
+        aug_label = kind.display_name() + " cycles"
+        cyc = cycles[kind]
+        print("%-30s           %12d cycles/batch" % (aug_label, cyc,), file=f)
+    print(file=f)
+
+    print(
+        "Batch Inference time              %7.2f ms, %7.2f inferences/s (batch size %d)"
+        % (midpoint_inference_time * 1000, midpoint_fps, batch_size),
+        file=f,
+    )
+    print(file=f)
+
+
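+# Convenience wrapper that gathers the network-level totals from nng and prints the
+# human-readable report.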
+def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
+    n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
+    n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
+    n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
+    cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
+    return print_performance_metrics_for_strat(
+        arch,
+        nng.name,
+        nng.cycles,
+        nng.macs,
+        nng.bandwidths,
+        nng.batch_size,
+        nng.memory_used,
+        n_passes,
+        n_cascaded_passes,
+        n_operations,
+        cpu_operations,
+        nng.bits_per_element,
+        show_cpu_operations,
+        f,
+    )
+
+
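+# Same report as print_performance_metrics, but written to a file.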
+def write_human_friendly_metrics(nng, arch, filename):
+    with open(filename, "w") as f:
+        print_performance_metrics(nng, arch, f=f)