MLBEDSW-4034: New Scheduler Size or Performance Optimisation

 - Merged dev/scheduler at 83639f90e8c828f70de6e29142355a940224959b
 - Unit test updates for the merge: arch.sram_size is replaced by
   arch.arena_cache_size, LUT tensors are now cloned into fast storage
   via clone_into_fast_storage() following the removal of the insert_dma
   pass, and new tests cover architecture_allocator.py and the new
   performance estimation code

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I0050529d4b42da93768c7264296434dd877fb5b4
diff --git a/ethosu/vela/test/extapi/test_extapi_generate_commands.py b/ethosu/vela/test/extapi/test_extapi_generate_commands.py
index 3c9a43d..ee13430 100644
--- a/ethosu/vela/test/extapi/test_extapi_generate_commands.py
+++ b/ethosu/vela/test/extapi/test_extapi_generate_commands.py
@@ -167,11 +167,13 @@
     check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, 15)
     check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1, 3)
     check_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_DEPTH_M1, 15)
-    check_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END, 14)
-    check_cmd0(cmds, cmd0.NPU_SET_AB_START, 14)
     check_cmd0(cmds, cmd0.NPU_SET_ACC_FORMAT, 0)
     check_cmd0(cmds, cmd0.NPU_SET_BLOCKDEP, 0)
     check_cmd0(cmds, cmd0.NPU_OP_CONV, 0)
+    ib_end = find_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END)
+    ab_start = find_cmd0(cmds, cmd0.NPU_SET_AB_START)
+    assert ib_end > 0
+    assert ib_end <= ab_start
 
 
 def create_fully_connected_op() -> NpuConv2DOperation:
@@ -296,11 +298,13 @@
     check_cmd0(cmds, cmd0.NPU_SET_IFM2_PRECISION, 0)
     check_cmd0(cmds, cmd0.NPU_SET_IFM2_BROADCAST, 5)
     check_cmd0(cmds, cmd0.NPU_SET_IFM_IB_END, 16)
-    check_cmd0(cmds, cmd0.NPU_SET_AB_START, 16)
-    check_cmd0(cmds, cmd0.NPU_SET_IFM2_IB_START, 9)
     check_cmd0(cmds, cmd0.NPU_SET_ACC_FORMAT, 0)
     check_cmd0(cmds, cmd0.NPU_SET_BLOCKDEP, 0)
     check_cmd0(cmds, cmd0.NPU_OP_ELEMENTWISE, 0)
+    ab_start = find_cmd0(cmds, cmd0.NPU_SET_AB_START)
+    assert ab_start > 0
+    ifm2_ib_start = find_cmd0(cmds, cmd0.NPU_SET_IFM2_IB_START)
+    assert 0 < ifm2_ib_start < ab_start
     # Check that block width/height were generated that fit
     blk_height = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_HEIGHT_M1)
     blk_width = find_cmd0(cmds, cmd0.NPU_SET_OFM_BLK_WIDTH_M1)
@@ -413,11 +417,11 @@
     w, h = op.ofm.shape.width, op.ofm.shape.height
     op.ofm.tiles = NpuTileBox(width_0=w, height_0=h, height_1=h, addresses=[32 * 1024, 0, 0, 0])
     # 384K for spilling should fit
-    arch.sram_size = 384 * 1024
+    arch.arena_cache_size = 384 * 1024
     mem_limits = get_mem_limits_for_regions(arch)
     generate_command_stream([op], arch, verbose=False, mem_limits=mem_limits)
     # 32K for spilling does not fit, due to the OFM address
-    arch.sram_size = 32 * 1024
+    arch.arena_cache_size = 32 * 1024
     mem_limits = get_mem_limits_for_regions(arch)
     with pytest.raises(VelaError):
         generate_command_stream([op], arch, verbose=False, mem_limits=mem_limits)
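
Note on the hunks above: the exact IFM_IB_END/AB_START bank values are no
longer hard-coded (the new allocator may legitimately choose a different
SHRAM layout), and the spilling limit now comes from arch.arena_cache_size
instead of the removed arch.sram_size. A minimal sketch of the updated
calling pattern, assuming get_mem_limits_for_regions and
generate_command_stream as imported by this test module, with op standing
for an NPU operation built as in the test:

    from ethosu.vela.architecture_features import Accelerator
    from ethosu.vela.architecture_features import create_default_arch

    arch = create_default_arch(Accelerator.Ethos_U55_128)
    arch.arena_cache_size = 384 * 1024  # replaces the removed arch.sram_size
    mem_limits = get_mem_limits_for_regions(arch)
    generate_command_stream([op], arch, verbose=False, mem_limits=mem_limits)
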
diff --git a/ethosu/vela/test/test_architecture_allocator.py b/ethosu/vela/test/test_architecture_allocator.py
new file mode 100644
index 0000000..94768fc
--- /dev/null
+++ b/ethosu/vela/test/test_architecture_allocator.py
@@ -0,0 +1,123 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Unit tests for architecture_allocator.py
+import pytest
+
+from ethosu.vela.architecture_allocator import find_block_config
+from ethosu.vela.architecture_allocator import try_block_config
+from ethosu.vela.architecture_features import Accelerator
+from ethosu.vela.architecture_features import Block
+from ethosu.vela.architecture_features import create_default_arch
+from ethosu.vela.ethos_u55_regs.ethos_u55_regs import resampling_mode
+from ethosu.vela.operation import Kernel
+from ethosu.vela.operation import NpuBlockType
+from ethosu.vela.shape4d import Shape4D
+
+test_data = [
+    {
+        "block_type": NpuBlockType.ConvolutionDepthWise,
+        "kernel": Kernel(25, 5, 2, 2, 1, 1),
+        "ofm_shape": Shape4D(2, 11, 22),
+        "ifm_shape": Shape4D(27, 25, 22),
+    },
+    {
+        "block_type": NpuBlockType.Pooling,
+        "kernel": Kernel(2, 2),
+        "ofm_shape": Shape4D(53, 49, 22),
+        "ifm_shape": Shape4D(27, 25, 22),
+        "ifm_resampling": resampling_mode.NEAREST,
+    },
+    {
+        "block_type": NpuBlockType.ConvolutionMxN,
+        "accelerator": Accelerator.Ethos_U55_32,
+        "kernel": Kernel(2, 5),
+        "ofm_shape": Shape4D(48, 1, 17),
+        "ifm_shape": Shape4D(24, 5, 18),
+        "ifm_resampling": resampling_mode.TRANSPOSE,
+    },
+    {
+        "block_type": NpuBlockType.ElementWise,
+        "ofm_shape": Shape4D(27, 2, 22),
+        "ifm_shape": Shape4D(27, 2, 1),
+        "ifm2_shape": Shape4D(27, 25, 22),
+    },
+    {
+        "block_type": NpuBlockType.ElementWise,
+        "accelerator": Accelerator.Ethos_U55_32,
+        "ofm_shape": Shape4D(48, 37, 17),
+        "ifm_shape": Shape4D(48, 37, 17),
+        "uses_scalar": True,
+        "lut_banks": 2,
+    },
+    {
+        "block_type": NpuBlockType.ElementWise,
+        "ofm_shape": Shape4D(27, 2, 22),
+        "ifm_shape": Shape4D(27, 2, 22),
+        "ifm_bits": 16,
+    },
+]
+
+
+@pytest.mark.parametrize("test_data", test_data)
+def test_allocate(test_data):
+    """Tests that find_block_config and try_block_config produce consistent SHRAM layouts"""
+    accelerator = test_data.get("accelerator", Accelerator.Ethos_U55_128)
+    arch = create_default_arch(accelerator)
+    kernel = test_data.get("kernel", Kernel(1, 1))
+    block_type = test_data["block_type"]
+    ofm_shape = test_data["ofm_shape"]
+    ifm_shape = test_data["ifm_shape"]
+    ifm2_shape = test_data.get("ifm2_shape")
+    uses_scalar = test_data.get("uses_scalar", False)
+    ifm_bits = test_data.get("ifm_bits", 8)
+    ifm_resampling = test_data.get("ifm_resampling", resampling_mode.NONE)
+    scaled = test_data.get("scaled", True)
+    lut_banks = test_data.get("lut_banks", 0)
+    config = find_block_config(
+        arch,
+        block_type,
+        ofm_shape,
+        ifm_shape,
+        ifm2_shape,
+        uses_scalar=uses_scalar,
+        ifm_bits=ifm_bits,
+        kernel=kernel,
+        lut_banks=lut_banks,
+        scaled=scaled,
+        ifm_resampling=ifm_resampling,
+    )
+    assert config is not None
+    config2 = try_block_config(
+        Block.from_shape(config.ofm_block.as_list()),
+        arch,
+        block_type,
+        ifm_shape,
+        ifm2_shape,
+        is_partkernel=config.is_partkernel,
+        uses_scalar=uses_scalar,
+        ifm_bits=ifm_bits,
+        kernel=kernel,
+        lut_banks=lut_banks,
+        scaled=scaled,
+        ifm_resampling=ifm_resampling,
+    )
+    assert config2 is not None
+    assert config.layout.ib_end == config2.layout.ib_end
+    assert config.layout.ab_start == config2.layout.ab_start
+    assert config.layout.ib_start2 == config2.layout.ib_start2
+    assert config.acc_type == config2.acc_type
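
Note on the layout checks above: the layout fields presumably correspond to
the SHRAM registers exercised in test_extapi_generate_commands.py
(layout.ib_end -> NPU_SET_IFM_IB_END, layout.ab_start -> NPU_SET_AB_START,
layout.ib_start2 -> NPU_SET_IFM2_IB_START), so any config returned by
find_block_config should satisfy the same ordering invariant asserted there.
A hedged helper sketch:

    def layout_is_sane(config):
        # Mirrors the relative checks in test_extapi_generate_commands.py:
        # the input buffer must end at or before the accumulator buffer starts.
        return 0 < config.layout.ib_end <= config.layout.ab_start
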
diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py
index 44ee0af..4ddc8b9 100644
--- a/ethosu/vela/test/test_lut.py
+++ b/ethosu/vela/test/test_lut.py
@@ -19,7 +19,6 @@
 
 import numpy as np
 
-from ethosu.vela import insert_dma
 from ethosu.vela import lut
 from ethosu.vela import mark_tensors
 from ethosu.vela import pass_packing
@@ -27,37 +26,41 @@
 from ethosu.vela.high_level_command_stream import DMA
 from ethosu.vela.nn_graph import Graph
 from ethosu.vela.operation import Op
+from ethosu.vela.rewrite_graph import rewrite_graph_pre_order
 from ethosu.vela.rewrite_graph import verify_graph_health
 from ethosu.vela.tensor import create_const_tensor
 from ethosu.vela.tensor import TensorPurpose
 from ethosu.vela.test import testutil
 
 
-def set_256_lut(op, key):
+def set_256_lut(op, key, arch):
     random.seed(key)
     values = random.choices(range(256), k=256)
     lut_tensor = create_const_tensor(
         op.name + "_lut", [1, 1, 1, 256], DataType.int8, values, np.uint8, TensorPurpose.LUT
     )
-    op.set_activation_lut(lut_tensor)
+    scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+    op.set_activation_lut(scratch_lut_tensor)
 
 
-def set_1K_lut(op, key):
+def set_1K_lut(op, key, arch):
     random.seed(key)
     values = random.choices(range(256), k=256)
     lut_tensor = create_const_tensor(
         op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, np.uint32, TensorPurpose.LUT
     )
-    op.set_activation_lut(lut_tensor)
+    scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+    op.set_activation_lut(scratch_lut_tensor)
 
 
-def set_2K_lut(op, key):
+def set_2K_lut(op, key, arch):
     random.seed(key)
     values = random.choices(range(512), k=512)
     lut_tensor = create_const_tensor(
         op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, np.uint32, TensorPurpose.LUT
     )
-    op.set_activation_lut(lut_tensor)
+    scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+    op.set_activation_lut(scratch_lut_tensor)
 
 
 def process(arch, op_list):
@@ -68,16 +71,16 @@
     assert verify_graph_health(nng)
     nng = mark_tensors.mark_tensor_purpose(nng, arch, False)
     assert verify_graph_health(nng)
-    nng = insert_dma.insert_dma_commands(nng, arch, False)
-    assert verify_graph_health(nng)
+    rewrite_graph_pre_order(nng, sg, arch, [], [])
     pass_packing.pack_into_passes(nng, arch, False)
     assert verify_graph_health(nng)
     # Create a DMA instruction for every op
     cmd_list = []
     for ps in sg.passes:
-        for intermediate in ps.intermediates:
-            if intermediate.needs_dma():
-                cmd_list.append(DMA(ps, intermediate.get_dma_src_tensor(), intermediate, None))
+        for input_tens in ps.inputs:
+            if input_tens.src_tensor:
+                cmd_list.append(DMA(ps, input_tens.src_tensor, input_tens, None))
+
     sg.high_level_command_stream = cmd_list
     return sg
 
@@ -96,28 +99,28 @@
     shape = [1, 1, 1, 1]
     # u8 LUT op, should lead to DMA
     op0 = testutil.create_elemwise_op(Op.Add, "op0", shape, shape, shape)
-    set_256_lut(op0, "lut0")
+    set_256_lut(op0, "lut0", arch)
     # u8 LUT op, should lead to DMA
     op1 = testutil.create_elemwise_op(Op.Add, "op1", shape, shape, shape)
-    set_256_lut(op1, "lut1")
+    set_256_lut(op1, "lut1", arch)
     # u8 LUT op with different LUT, should lead to DMA
     op2 = testutil.create_elemwise_op(Op.Add, "op2", shape, shape, shape)
-    set_256_lut(op2, "lut2")
+    set_256_lut(op2, "lut2", arch)
     # u8 LUT op with same LUT as in op1, should not lead to DMA
     op3 = testutil.create_elemwise_op(Op.Add, "op3", shape, shape, shape)
-    set_256_lut(op3, "lut1")
+    set_256_lut(op3, "lut1", arch)
     # u8 LUT op with same LUT as in op2, should not lead to DMA
     op4 = testutil.create_elemwise_op(Op.Add, "op4", shape, shape, shape)
-    set_256_lut(op4, "lut2")
+    set_256_lut(op4, "lut2", arch)
     # 2K LUT op, should lead to DMA, and will overwrite all previous LUTs in SHRAM
     op5_2K = testutil.create_elemwise_op(Op.Add, "op5", shape, shape, shape)
-    set_2K_lut(op5_2K, "lut5")
+    set_2K_lut(op5_2K, "lut5", arch)
     # Another 2K LUT op, should lead to DMA, and will overwrite the previous LUT in SHRAM
     op6_2K = testutil.create_elemwise_op(Op.Add, "op6", shape, shape, shape)
-    set_2K_lut(op6_2K, "lut6")
+    set_2K_lut(op6_2K, "lut6", arch)
     # u8 LUT op with same LUT as in op1, should lead to DMA
     op7 = testutil.create_elemwise_op(Op.Add, "op7", shape, shape, shape)
-    set_256_lut(op7, "lut1")
+    set_256_lut(op7, "lut1", arch)
 
     op_list = [op0, op1, op2, op3, op4, op5_2K, op6_2K, op7]
     sg = process(arch, op_list)
@@ -132,7 +135,7 @@
     orig_cmd_list = filter_lut_cmds(orig_cmd_list)
 
     for (cmd, op) in zip(cmd_list, expected_dma_ops):
-        assert cmd.in_tensor == op.activation_lut
+        assert cmd.in_tensor == op.activation_lut.src_tensor
     # Check that lut0, lut1 and lut2 in op0, op1, op2 are stored on different addresses
     assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[1].out_tensor.address
     assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[2].out_tensor.address
@@ -151,28 +154,28 @@
     shape = [1, 1, 1, 1]
     # u8 LUT op, should lead to DMA
     op0 = testutil.create_elemwise_op(Op.Add, "op0", shape, shape, shape)
-    set_256_lut(op0, "lut0")
+    set_256_lut(op0, "lut0", arch)
     # u8 LUT op, should lead to DMA
     op1 = testutil.create_elemwise_op(Op.Add, "op1", shape, shape, shape)
-    set_256_lut(op1, "lut1")
+    set_256_lut(op1, "lut1", arch)
     # 1K LUT op with different LUT, should lead to DMA
     op2_1K = testutil.create_elemwise_op(Op.Add, "op2", shape, shape, shape)
-    set_1K_lut(op2_1K, "lut2")
+    set_1K_lut(op2_1K, "lut2", arch)
     # u8 LUT op with same LUT as in op1, should not lead to DMA
     op3 = testutil.create_elemwise_op(Op.Add, "op3", shape, shape, shape)
-    set_256_lut(op3, "lut1")
+    set_256_lut(op3, "lut1", arch)
     # 1K LUT op with same LUT as in op2, should not lead to DMA
     op4_1K = testutil.create_elemwise_op(Op.Add, "op4", shape, shape, shape)
-    set_1K_lut(op4_1K, "lut2")
+    set_1K_lut(op4_1K, "lut2", arch)
     # 1K LUT op, should lead to DMA, and will overwrite lut2
     op5_2K = testutil.create_elemwise_op(Op.Add, "op5", shape, shape, shape)
-    set_1K_lut(op5_2K, "lut5")
+    set_1K_lut(op5_2K, "lut5", arch)
     # u8 LUT op, lut0 should still be present, should not lead to DMA
     op6 = testutil.create_elemwise_op(Op.Add, "op6", shape, shape, shape)
-    set_256_lut(op6, "lut0")
+    set_256_lut(op6, "lut0", arch)
     # 1K LUT op with same LUT as in op2, should lead to DMA
     op7 = testutil.create_elemwise_op(Op.Add, "op7", shape, shape, shape)
-    set_1K_lut(op7, "lut2")
+    set_1K_lut(op7, "lut2", arch)
 
     op_list = [op0, op1, op2_1K, op3, op4_1K, op5_2K, op6, op7]
     sg = process(arch, op_list)
@@ -187,7 +190,7 @@
     # Check that only the needed DMA commands are left
     expected_dma_ops = [op0, op1, op2_1K, op5_2K, op7]
     for (cmd, op) in zip(cmd_list, expected_dma_ops):
-        assert cmd.in_tensor == op.activation_lut
+        assert cmd.in_tensor == op.activation_lut.src_tensor
     # Check that lut0, lut1 and lut2 in op0, op1, op2 are stored on different addresses
     assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[1].out_tensor.address
     assert orig_cmd_list[0].out_tensor.address != orig_cmd_list[2].out_tensor.address
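
Note on the LUT changes above: with the insert_dma pass removed, the tests
model the new flow directly: each constant LUT is cloned into fast storage,
the clone keeps a reference to its source in src_tensor, and the DMA command
copies from that source. A minimal sketch of the flow, assuming DataType is
importable from ethosu.vela.data_type and that op and arch are set up as in
the tests:

    import random

    import numpy as np

    from ethosu.vela.data_type import DataType
    from ethosu.vela.tensor import create_const_tensor
    from ethosu.vela.tensor import TensorPurpose

    values = random.choices(range(256), k=256)
    lut_tensor = create_const_tensor("lut", [1, 1, 1, 256], DataType.int8, values, np.uint8, TensorPurpose.LUT)
    scratch = lut_tensor.clone_into_fast_storage(arch)  # clone placed in the arch's fast storage
    assert scratch.src_tensor == lut_tensor  # the DMA source the tests assert on
    op.set_activation_lut(scratch)
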
diff --git a/ethosu/vela/test/test_new_performance.py b/ethosu/vela/test/test_new_performance.py
new file mode 100644
index 0000000..a35905b
--- /dev/null
+++ b/ethosu/vela/test/test_new_performance.py
@@ -0,0 +1,78 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Contains unit tests for new performance estimation code
+from ethosu.vela import architecture_allocator
+from ethosu.vela import architecture_features
+from ethosu.vela import npu_performance
+from ethosu.vela import operation
+from ethosu.vela.architecture_features import resampling_mode
+from ethosu.vela.shape4d import Shape4D
+from ethosu.vela.shape4d import VolumeIterator
+from ethosu.vela.tensor import MemArea
+
+
+def test_new_performance():
+    arch = architecture_features.create_default_arch(architecture_features.Accelerator.Ethos_U55_128)
+
+    query = npu_performance.PerformanceQuery(architecture_features.NpuBlockType.ConvolutionMxN)
+    query.ifm_shape = Shape4D(1, 16, 16, 16)
+    query.ifm2_shape = Shape4D()
+    query.ifm_memory_area = MemArea.Sram
+    query.ifm_bits = 8
+    query.ofm_shape = Shape4D(1, 16, 16, 1)
+    query.ofm_memory_area = MemArea.Sram
+    query.ofm_bits = 8
+    query.const_shape = Shape4D(1, 1, 1, query.ofm_shape.depth)
+    query.const_memory_area = MemArea.OffChipFlash
+    query.kernel = operation.Kernel(1, 1, 1, 1, 1, 1, valid_padding=False)
+    query.config = architecture_allocator.find_block_config(
+        arch,
+        architecture_features.NpuBlockType.ConvolutionMxN,
+        Shape4D(1, 16, 16, 1),
+        query.ifm_shape,
+        None,
+        False,
+        8,
+        query.kernel,
+        0,
+        False,
+        resampling_mode.NONE,
+    )
+
+    print("For block Config = {}".format(query.config))
+
+    # Run pytest with the -s flag to display this output
+    for sub_shape in [Shape4D(1, 4, 8, 16), Shape4D(1, 8, 8, 16), Shape4D(1, 8, 16, 16), query.ofm_shape]:
+        print("\n-- Subshape = {}".format(sub_shape))
+        iterator = VolumeIterator(query.ofm_shape, sub_shape)
+        a = npu_performance.ElementAccess()
+        c = npu_performance.CycleCost()
+        for pos, shape in iterator:
+            print("\tpos = {} shape = {}".format(pos, shape))
+            ta, tc = npu_performance.measure_performance_cost(
+                arch, operation.Op.Conv2D, operation.Op.Relu, query, pos, shape
+            )
+            a += ta
+            c += tc
+            print("\t\taccess: {}".format(ta))
+            print("\t\tcycles: {}".format(tc))
+        print("\tAccess: {}".format(a))
+        print("\tCycles: {}".format(c))
+        assert c.op_macs == 4096
+
+    assert True  # Any successful result is okay
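
Note on the final assertion: op_macs == 4096 is the analytic MAC count for
this layer: a 1x1 kernel over a 1x16x16x1 OFM with a 16-channel IFM costs
16 x 16 x 1 output elements x 16 input channels = 4096 MACs, and every
VolumeIterator decomposition of the OFM must sum to that same total. As the
comment in the test notes, run it with pytest -s so the per-subshape access
and cycle breakdowns are printed rather than captured.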