MLBEDSW-4034: New Scheduler Size or Performance Optimisation

 - Merged dev/scheduler at 83639f90e8c828f70de6e29142355a940224959b

Signed-off-by: Tim Hall <>
Change-Id: I0050529d4b42da93768c7264296434dd877fb5b4
diff --git a/ethosu/vela/test/ b/ethosu/vela/test/
new file mode 100644
index 0000000..a35905b
--- /dev/null
+++ b/ethosu/vela/test/
@@ -0,0 +1,78 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Description:
+# Contains unit tests for new performance estimation code
+from ethosu.vela import architecture_allocator
+from ethosu.vela import architecture_features
+from ethosu.vela import npu_performance
+from ethosu.vela import operation
+from ethosu.vela.architecture_features import resampling_mode
+from ethosu.vela.shape4d import Shape4D
+from ethosu.vela.shape4d import VolumeIterator
+from ethosu.vela.tensor import MemArea
+def test_new_performance():  # Sums per-step costs of one convolution query and checks the total MAC count
+    arch = architecture_features.create_default_arch(architecture_features.Accelerator.Ethos_U55_128)
+    query = npu_performance.PerformanceQuery(architecture_features.NpuBlockType.ConvolutionMxN)
+    query.ifm_shape = Shape4D(1, 16, 16, 16)
+    query.ifm2_shape = Shape4D()  # default-constructed; presumably "no second IFM" - TODO confirm
+    query.ifm_memory_area = MemArea.Sram
+    query.ifm_bits = 8
+    query.ofm_shape = Shape4D(1, 16, 16, 1)
+    query.ofm_memory_area = MemArea.Sram
+    query.ofm_bits = 8
+    query.const_shape = Shape4D(1, 1, 1, query.ofm_shape.depth)  # depth taken from the OFM shape
+    query.const_memory_area = MemArea.OffChipFlash
+    query.kernel = operation.Kernel(1, 1, 1, 1, 1, 1, valid_padding=False)  # presumably 1x1 kernel - TODO confirm
+    query.config = architecture_allocator.find_block_config(  # positional args: check against its signature
+        arch,
+        architecture_features.NpuBlockType.ConvolutionMxN,
+        Shape4D(1, 16, 16, 1),  # same dimensions as query.ofm_shape
+        query.ifm_shape,
+        None,
+        False,
+        8,
+        query.kernel,
+        0,
+        False,
+        resampling_mode.NONE,
+    )
+    print("For block Config = {}".format(query.config))
+    # Run pytest with -s to display the printed output; the last sub-shape below is the whole OFM
+    for sub_shape in [Shape4D(1, 4, 8, 16), Shape4D(1, 8, 8, 16), Shape4D(1, 8, 16, 16), query.ofm_shape]:
+        print("\n-- Subshape = {}".format(sub_shape))
+        iterator = VolumeIterator(query.ofm_shape, sub_shape)  # presumably steps over the OFM by sub_shape - confirm
+        a = npu_performance.ElementAccess()  # fresh access accumulator for this sub_shape
+        c = npu_performance.CycleCost()  # fresh cycle accumulator for this sub_shape
+        for pos, shape in iterator:
+            print("\tpos = {} shape = {}".format(pos, shape))
+            ta, tc = npu_performance.measure_performance_cost(
+                arch, operation.Op.Conv2D, operation.Op.Relu, query, pos, shape
+            )
+            a += ta  # add this step's access result to the running total
+            c += tc  # add this step's cycle result to the running total
+            print("\t\taccess: {}".format(ta))
+            print("\t\tcycles: {}".format(tc))
+        print("\tAccess: {}".format(a))
+        print("\tCycles: {}".format(c))
+        assert c.op_macs == 4096  # the same MAC total is expected for every sub_shape
+    assert True  # No-op: reaching this point without an exception is treated as success