Add Vela codebase

 - Added modules ethosu.vela and ethosu.mlw_codec.
 - Added README and various configuration files.

Change-Id: I3690f8c8f5966306ecddaeb2793c30ca9c6e2eee
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
new file mode 100644
index 0000000..364df6f
--- /dev/null
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -0,0 +1,315 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Description:
+# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
+#
+# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using
+# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
+
+from .nn_graph import SchedulingStrategy, PassPlacement
+import numpy as np
+from .operation import NpuBlockType
+from .high_level_command_stream import Box, CommandType, Command, NpuStripe, DMA
+
+
+def need_dma(tens):
+    # A tensor needs a DMA transfer when its sole producer is a "DMA" op,
+    # i.e. its data must first be copied in from the DMA op's input tensor.
+    return len(tens.ops) == 1 and tens.ops[0].type == "DMA"
+
+
+def dma_weights_if_necessary(ps, box, weight_tensor):
+    # Yield a DMA command copying the given box of weights from the DMA op's
+    # source tensor into weight_tensor; yields nothing when the weights are
+    # already resident (no DMA producer).
+    # NOTE(review): ps is currently unused here.
+    if need_dma(weight_tensor):
+        dma_op = weight_tensor.ops[0]
+        in_tensor = dma_op.inputs[0]
+        yield DMA(in_tensor, weight_tensor, box)
+
+
+def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
+    # Generate the high-level commands (DMA and NpuStripe) that execute
+    # passes[idx] under scheduling strategy strat:
+    #   WeightStream - the OFM is produced in slices along its innermost
+    #     (depth) axis so weight chunks can be DMA'd in per slice.
+    #   IfmStream - the OFM is produced in height stripes; the previous pass
+    #     in the cascade is generated recursively and its stripes are
+    #     interleaved so each stripe's IFM rows exist before being consumed.
+    is_first = idx == 0
+    is_last = idx == len(passes) - 1
+    ps = passes[idx]
+    block_config = block_configs[idx]
+
+    ifm_tensor = ps.ifm_tensor
+    ifm2_tensor = ps.ifm2_tensor
+    ofm_tensor = ps.ofm_tensor
+    weight_tensor = ps.weight_tensor
+    scale_tensor = ps.scale_tensor
+
+    # Start with a box covering the whole OFM; the loops below shrink it to
+    # one slice/stripe at a time.
+    ofm_start = [0] * len(ofm_tensor.shape)
+    ofm_end = list(ofm_tensor.shape)
+
+    strides = None
+    skirt = None
+    if ps.primary_op is not None:
+        strides = ps.primary_op.attrs.get("strides", None)
+        skirt = ps.primary_op.attrs.get("skirt", None)
+
+    npu_block_type = ps.npu_block_type
+
+    concat_axis = 0
+    concat_offset = 0
+
+    split_offsets = [None, None]  # offset for [ifm, ifm2]
+
+    # Fusable activation functions
+    activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))
+
+    # Record fused memory/activation functions on the primary op, and narrow
+    # the OFM box to the slice written by a fused ConcatSliceWrite.
+    for op in ps.ops:
+        if op.type == "ConcatSliceWrite":
+            concat_axis = op.attrs["concat_axis"]
+            concat_start = op.attrs["concat_start"]
+            concat_end = op.attrs["concat_end"]
+
+            ofm_start[concat_axis] = concat_start
+            ofm_end[concat_axis] = concat_end
+            concat_offset = concat_start
+            ps.primary_op.attrs["fused_memory_function"] = op.type
+        elif op.type in activation_ops:
+            ps.primary_op.attrs["fused_activation_function"] = op.type
+
+    # The ops list has to be reversed here since the Pass Packing is done in reverse
+    ifm_idx = 0
+    for op in reversed(ps.ops):
+        if op.type == "SplitSliceRead":
+            split_offsets[ifm_idx] = op.attrs["split_start"]
+            ps.primary_op.attrs["fused_memory_function"] = op.type
+            ifm_idx += 1
+
+    if strat == SchedulingStrategy.WeightStream:
+        # Step over the OFM depth (innermost axis) one block at a time,
+        # DMA-ing in the matching weight slice ahead of each stripe.
+        ofm_step = block_config[-1]
+        ofm_stop = ofm_end[-1]
+        # Without DMA'd weights there is no reason to slice the depth: emit
+        # one full-depth stripe.
+        if weight_tensor is None or not need_dma(weight_tensor):
+            ofm_step = ofm_stop
+        for start in range(ofm_start[-1], ofm_stop, ofm_step):
+            end = min(start + ofm_step, ofm_stop)
+            ofm_start[-1] = start
+            ofm_end[-1] = end
+            ofm_box = Box(ofm_start, ofm_end)
+            ifm_box = None
+            ifm2_box = None
+
+            # Map the OFM slice back to the IFM region it depends on;
+            # shapeless (scalar) inputs get an empty box.
+            if ifm_tensor.shape != []:
+                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+                    strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
+                )
+            else:
+                ifm_box = Box([], [])
+            if ifm2_tensor is not None and ifm2_tensor.shape != []:
+                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+                    strides, skirt, ifm2_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[1]
+                )
+            else:
+                ifm2_box = Box([], [])
+
+            weight_box = None
+            if weight_tensor is not None:
+                weight_oc_start = start
+                weight_oc_end = end
+                # When concatenating along the depth (last) axis the OFM
+                # coordinates include the concat offset but the weights do
+                # not, so shift the output-channel range back.
+                if concat_axis - len(weight_tensor.shape) == -1:
+                    weight_oc_start -= concat_offset
+                    weight_oc_end -= concat_offset
+
+                weight_box = Box.make_weight_box(
+                    weight_tensor.shape,
+                    npu_block_type,
+                    weight_oc_start,
+                    weight_oc_end,
+                    weight_tensor.weight_transpose_depthwise,
+                )
+                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+
+            # Each depth slice is both the first and last (only) h-stripe.
+            yield NpuStripe(
+                ps,
+                block_config,
+                is_first,
+                is_last,
+                True,
+                True,
+                ifm_tensor,
+                ifm_box,
+                ofm_tensor,
+                ofm_box,
+                weight_tensor,
+                weight_box,
+                scale_tensor,
+                concat_axis,
+                concat_offset,
+                ifm2_tensor=ifm2_tensor,
+                ifm2_box=ifm2_box,
+            )
+
+    elif strat == SchedulingStrategy.IfmStream:
+        # Produce the OFM in height stripes of block_config[0] rows.
+        # shape[-3] is taken to be the height axis (NHWC-style layout) -
+        # TODO(review): confirm against the Box/NpuStripe conventions.
+        y_step = block_config[0]
+        y_start = 0
+        y_dim = 1
+        if len(ofm_tensor.shape) >= 3:
+            y_start = ofm_start[-3]
+            y_dim = ofm_end[-3]
+        if idx > 0:
+            # Not the first pass of the cascade: IFM rows are produced on
+            # demand by recursively generating the previous pass's stream.
+            ifm_y_present = 0
+            prev_pass = passes[idx - 1]
+            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
+        else:
+            # First pass of the cascade: the whole IFM already exists.
+            ifm_y_present = 1
+            if len(ifm_tensor.shape) >= 3:
+                ifm_y_present = ifm_tensor.shape[-3]
+            prev_pass_gen = []
+            prev_pass = None
+
+        if len(passes) == 1:
+            # no cascading, can just issue one big stripe
+            # but only if we've done allocation and OFM does not overlap IFM
+            if ifm_tensor.address != -1 and ofm_tensor.address != -1:
+                if (
+                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
+                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
+                ):
+                    y_step = y_dim
+
+        weight_box = None
+
+        for start in range(y_start, y_dim, y_step):
+            end = min(start + y_step, y_dim)
+            if len(ofm_tensor.shape) >= 3:
+                ofm_start[-3] = start
+                ofm_end[-3] = end
+            ofm_box = Box(ofm_start, ofm_end)
+
+            # Kernel height decides how many extra IFM rows a stripe needs.
+            k_height = 1
+            if npu_block_type == NpuBlockType.Pooling:
+                if ps.primary_op is not None:
+                    k_height = ps.primary_op.attrs["ksize"][1]
+            else:
+                if weight_tensor is not None:
+                    k_height = weight_tensor.shape[0]
+
+            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
+                strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
+            )
+
+            # If the previous pass has not yet produced enough IFM rows, pull
+            # (and yield) its stripes until this stripe's input is available.
+            ifm_y_needed = 1
+            if len(ifm_box.end_coord) >= 3:
+                ifm_y_needed = ifm_box.end_coord[-3]
+            if ifm_y_present < ifm_y_needed:
+                for prev_cmd in prev_pass_gen:
+                    yield prev_cmd
+                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
+                    if rng is not None:
+                        ifm_y_present = max(ifm_y_present, rng[1])
+                        if ifm_y_present >= ifm_y_needed:
+                            break
+
+            # The full weight set is needed for every stripe; DMA it once,
+            # before the first stripe only (weight_box stays set afterwards).
+            if weight_tensor is not None and weight_box is None:
+                weight_box = Box.make_weight_box(
+                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
+                )
+                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+
+            # Check if first/last stripe in pass
+            is_first_h_stripe = start == y_start
+            is_last_h_stripe = (start + y_step) >= y_dim
+
+            stripe = NpuStripe(
+                ps,
+                block_config,
+                is_first,
+                is_last,
+                is_first_h_stripe,
+                is_last_h_stripe,
+                ifm_tensor,
+                ifm_box,
+                ofm_tensor,
+                ofm_box,
+                weight_tensor,
+                weight_box,
+                scale_tensor,
+                concat_axis,
+                concat_offset,
+                None,
+                None,
+                pad_top,
+                pad_bottom,
+            )
+            yield stripe
+    else:
+        assert 0, "unknown scheduling strategy"
+
+
+def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
+    # Dispatch on scheduling strategy: weight streaming emits every pass in
+    # order, while IFM streaming starts from the final pass, which recursively
+    # pulls in stripes from earlier passes as its input demand requires.
+    if strat == SchedulingStrategy.WeightStream:
+        for idx in range(len(passes)):
+            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
+    elif strat == SchedulingStrategy.IfmStream:
+        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
+    else:
+        assert 0, "Unknown streaming strategy"
+
+
+def generate_high_level_command_stream_for_cascaded_pass(cps):
+    # Convenience wrapper: generate the command stream for a CascadedPass
+    # using its own strategy, passes and per-pass block configs.
+    yield from generate_high_level_command_stream_for_pass_list(
+        cps.strategy, cps.passes, [ps.block_config for ps in cps.passes]
+    )
+
+
+def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
+    # Build the complete high-level command stream for subgraph sg by
+    # concatenating the commands of every NPU-placed cascaded pass, store it
+    # on sg, and optionally print it.
+    # NOTE(review): nng and arch are currently unused in this function.
+    res = []
+    for cps in sg.cascaded_passes:
+        if cps.placement == PassPlacement.Npu:
+            res += list(generate_high_level_command_stream_for_cascaded_pass(cps))
+
+    sg.high_level_command_stream = res
+    if verbose_high_level_command_stream:
+        sg.print_high_level_command_stream()
+
+
+def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
+    highest_ofm_write = 0
+    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
+        return 0
+
+    ifm_read = passes[0].ifm_tensor.storage_size
+    min_overlap = 999999999999999999999
+    ofm_size = passes[-1].ofm_tensor.storage_size()
+    if strat == SchedulingStrategy.WeightStream:
+        return 0
+    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
+        if cmd.is_npu_pass_command():
+            if cmd.is_first:
+                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
+                if ifm_read is None:
+                    return 0
+            if cmd.is_last:
+                write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
+                if write_offset is None:
+                    return 0
+                highest_ofm_write = max(write_offset, highest_ofm_write)
+
+            if cmd.is_first or cmd.is_last:
+                overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
+                can_overwrite = ofm_size - overlap_required
+                min_overlap = min(min_overlap, can_overwrite)
+
+            if cmd.is_first:
+                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)
+
+    min_overlap = max(min_overlap, 0)
+    return min_overlap
+
+
+def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
+    # Convenience wrapper used during scheduling to query the allowed
+    # IFM/OFM overlap for a CascadedPass.
+    return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, [ps.block_config for ps in cps.passes])