Add Vela codebase

 - Added modules ethosu.vela and ethosu.mlw_codec.
 - Added README and various configuration files.

Change-Id: I3690f8c8f5966306ecddaeb2793c30ca9c6e2eee
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
new file mode 100644
index 0000000..4542c25
--- /dev/null
+++ b/ethosu/vela/npu_serialisation.py
@@ -0,0 +1,145 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Serialises and packs an NPU subgraph into tensors.
+
+from .nn_graph import PassPlacement
+from .tensor import MemArea, Tensor, TensorPurpose, TensorFormat
+from .operation import Operation
+from .data_type import DataType
+import numpy as np
+from . import driver_actions
+import struct
+
+
+def make_memory_tensor(name, mem_area, sz, want_values, arch):
+    """Return a 1-D uint8 NHWC tensor of sz elements in mem_area, with zeroed values only if want_values."""
+    mem_tens = Tensor([sz], DataType.uint8, name)
+    mem_tens.mem_area, mem_tens.purpose = mem_area, TensorPurpose.FeatureMap
+    mem_tens.set_format(TensorFormat.NHWC, arch)
+    if want_values:
+        mem_tens.values = np.zeros(mem_tens.shape, np.uint8)
+    return mem_tens
+
+
+def copy_compressed_values_to_memory_tensor(memory_tensor, src_tensor):
+    """Copy src_tensor's compressed chunks back to back into memory_tensor.values, from src_tensor.address."""
+    offset = src_tensor.address
+    for chunk in src_tensor.compressed_values:
+        memory_tensor.values[offset : offset + len(chunk)] = chunk
+        offset += len(chunk)
+
+
+def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
+    """Pack an NPU subgraph's command stream and compressed constants into memory tensors.
+
+    Non-NPU subgraphs are skipped and scratch_tens/flash_tens are handed back untouched. Otherwise sg gets
+    scratch, flash and command stream tensors attached and (sg.scratch_tensor, sg.flash_tensor) is returned.
+    """
+    if sg.placement != PassPlacement.Npu:
+        return scratch_tens, flash_tens
+
+    flash_area = arch.permanent_storage_mem_area
+    scratch_area = MemArea.Sram
+    flash_size = sg.memory_used.get(flash_area, 0)
+    scratch_size = sg.memory_used.get(scratch_area, 0)
+
+    # Prepare driver actions for this command tensor, then append the command stream words
+    da_list = []
+    driver_actions.emit_fourcc(da_list, "COP1")
+    driver_actions.emit_config(da_list, 0, 1, arch)
+    driver_actions.emit_cmd_stream_header(da_list, len(sg.register_command_stream))
+    da_list.extend(sg.register_command_stream)
+
+    # Convert to bytes, one little-endian unsigned 32-bit word per entry
+    payload_bytes = struct.pack("<{0}I".format(len(da_list)), *da_list)
+    command_stream_size_bytes = len(payload_bytes)
+
+    # Adjust the bits per element calculation to exclude metadata generated by Vela
+    nng.total_size[flash_area] = nng.total_size.get(flash_area, 0) - flash_size - command_stream_size_bytes
+    nng.total_elements[flash_area] = nng.total_elements.get(flash_area, 0) - flash_size - command_stream_size_bytes
+    nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size
+    nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size
+
+    # None is tested by identity so that no tensor-defined equality can affect the outcome
+    if flash_tens is None and scratch_tens is None:
+        # First Npu subgraph, create scratch and flash tensors
+        sg.scratch_tensor = make_memory_tensor(sg.name + "_scratch", scratch_area, scratch_size, False, arch)
+        sg.scratch_tensor.purpose = TensorPurpose.Scratch
+        sg.flash_tensor = make_memory_tensor(sg.name + "_flash", flash_area, flash_size, True, arch)
+    else:
+        sg.scratch_tensor = scratch_tens
+        sg.scratch_tensor.shape[0] += scratch_size
+        sg.flash_tensor = flash_tens
+        sg.flash_tensor.shape[0] += flash_size
+
+    for cps in sg.cascaded_passes:
+        for ps in cps.passes:
+            if ps.placement == PassPlacement.Npu and ps.weight_tensor is not None:
+                # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
+                # is pointing at the destination address of where the weights should be placed in SRAM.
+                # Copying from the DMA's input ensures the Flash weight tensor and its address are used instead.
+                weight_op = ps.weight_tensor.ops[0]
+                flash_weights = weight_op.inputs[0] if weight_op.type == "DMA" else ps.weight_tensor
+                copy_compressed_values_to_memory_tensor(sg.flash_tensor, flash_weights)
+                copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+
+    sg.command_stream_tensor = make_memory_tensor(
+        sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch
+    )
+    sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)
+
+    return sg.scratch_tensor, sg.flash_tensor
+
+
+def add_const_tens_to_startup_cascaded_pass(startup_cps, tens):
+    """Prepend a Const op producing tens to the first pass of startup_cps and list tens in both outputs."""
+    const_op = Operation("Const", tens.name + "_const")
+    const_op.outputs, tens.ops = [tens], [const_op]
+    startup_cps.passes[0].ops.insert(0, const_op)
+    for holder in (startup_cps.passes[0], startup_cps):
+        holder.outputs.insert(0, tens)
+
+
+def rewrite_npu_call_ops(nng, sg, arch):
+    """Feed each NpuOp in a CPU subgraph its callee's scratch, flash and command stream tensors.
+
+    The flash and command stream tensors also become Const outputs of the first cascaded pass.
+    """
+    if sg.placement != PassPlacement.Cpu:
+        return
+    startup_cps = sg.cascaded_passes[0]
+    for idx, cps in enumerate(sg.cascaded_passes):
+        for ps in cps.passes:
+            # Iterate a snapshot: the Const ops added below may land in this very op list (startup pass)
+            for op in list(ps.ops):
+                if op.type == "NpuOp":
+                    callee = op.attrs["subgraph"]
+                    op.attrs["custom_options"] = {"type": op.type}
+                    sz = 0
+                    for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]:
+                        op.inputs.insert(0, tens)
+                        ps.inputs.insert(0, tens)
+                        cps.inputs.insert(0, tens)
+                        if tens is not callee.scratch_tensor:
+                            add_const_tens_to_startup_cascaded_pass(startup_cps, tens)
+                        sz += tens.storage_size()
+                    for prev_cps in sg.cascaded_passes[: idx + 1]:
+                        prev_cps.sram_used += sz
+                    if callee.scratch_tensor is not None:
+                        cps.sram_used += callee.scratch_tensor.storage_size()