MLBEDSW-603: Improve cycle estimation in elementwise ops

Signed-off-by: Diqing Zhong <diqing.zhong@arm.com>
Change-Id: I9f3671041c2b1497519cf42b5f52e3cd01d9c10a
(cherry picked from commit e8c989f5236cce12d07a6644329935dbbf0ee8e6)
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 3ef4d1b..04c1c62 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -136,15 +136,14 @@
 
 class ArchitectureFeatures:
     """This class is a container for various parameters of the Ethos-U55 core
-and system configuration that can be tuned, either by command line
-parameters or by the Ethos-U55 architects. The class is often passed
-around to passes that need to do architecture-dependent actions.
+    and system configuration that can be tuned, either by command line
+    parameters or by the Ethos-U55 architects. The class is often passed
+    around to passes that need to do architecture-dependent actions.
 
-Note the difference between ArchitectureFeatures and CompilerOptions
-- ArchitectureFeatures is for changing the Ethos-U55 and system architecture
-- CompilerOptions is for changing the behaviour of the compiler
-
-"""
+    Note the difference between ArchitectureFeatures and CompilerOptions
+    - ArchitectureFeatures is for changing the Ethos-U55 and system architecture
+    - CompilerOptions is for changing the behaviour of the compiler
+    """
 
     ArchitectureConfig = namedtuple(
         "ArchitectureConfig", "macs cores ofm_ublock ifm_ublock shram_banks shram_granules elem_units"
@@ -239,6 +238,9 @@
 
         self.memory_bandwidths_per_second = self.memory_bandwidths_per_cycle * self.npu_clock
 
+        # Get output/activation performance numbers
+        self._generate_output_perf_tables(self.accelerator_config)
+
         # sizes as N x H x W x C. we need to round up to these when allocating storage
         self.storage_rounding_quantums = {
             TensorFormat.Unknown: (1, 1, 1, 1),
@@ -374,6 +376,24 @@
                     key = ArchitectureFeatures.make_block_config_key(w, h, c)
                     self.block_config_map[key] = self.generate_block_config(w, h, c)
 
+    def _generate_output_perf_tables(self, accel_config):
+        """Set output_cycles_per_elem and activation_cycles_per_elem for accel_config."""
+        # Each entry pairs the output table with the activation table of one configuration
+        perf_256 = ((0.625, 1.125, 0.5, 0.375, 0.5, 0.75, 0.125, 0.25), (1.0, 0.25, 0.0))
+        perf_by_config = {
+            Accelerator.Ethos_U55_32: ((2.0, 3.0, 3.0, 3.0, 4.0, 6.0, 1.0, 2.0), (1.0, 1.0, 0.0)),
+            Accelerator.Ethos_U55_64: ((1.0, 1.5, 1.5, 1.5, 2.0, 3.0, 0.5, 1.0), (1.0, 1.0, 0.0)),
+            Accelerator.Ethos_U55_128: ((0.75, 1.25, 0.75, 0.75, 1.0, 1.5, 0.25, 0.5), (1.0, 0.5, 0.0)),
+            Accelerator.Ethos_U55_256: perf_256,
+            Accelerator.Yoda_256: perf_256,
+        }
+        if accel_config in perf_by_config:
+            perf = perf_by_config[accel_config]
+        else:
+            assert accel_config == Accelerator.Yoda_512
+            perf = ((0.3125, 0.5625, 0.25, 0.1875, 0.25, 0.375, 0.0625, 0.125), (0.5, 0.125, 0.0))
+        self.output_cycles_per_elem, self.activation_cycles_per_elem = perf
+
     def calc_ifm_block_depth(self, ifm_depth, ifm_bits):
         assert ifm_bits in (8, 16, 32)
         assert ifm_depth > 0
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index fc148f3..e71e95b 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -25,9 +25,12 @@
 
 from . import numeric_util
 from .architecture_features import Block
+from .architecture_features import SHRAMElements
+from .data_type import DataType
 from .nn_graph import PassPlacement
 from .nn_graph import SchedulerRewrite
 from .operation import NpuBlockType
+from .operation import Op
 from .register_command_stream_generator import get_op_kernel
 from .tensor import MemArea
 from .tensor import shape_num_elements
@@ -210,6 +213,66 @@
     return total_blocks, total_area, block_setup
 
 
+def get_output_cycle_estimate(arch, ps):
+    """Estimate the cycles spent producing the output of pass ps.
+
+    One entry of arch.output_cycles_per_elem is selected from the primary op, and one entry of
+    arch.activation_cycles_per_elem from its fused activation. The larger of the two per-element
+    costs is multiplied by the element count of ps.outputs[0] and returned.
+    """
+    op = ps.primary_op
+    assert op
+    block_type = op.type.npu_block_type
+    fused_act = op.activation
+    acc_block_types = (
+        NpuBlockType.ConvolutionMxN,
+        NpuBlockType.ConvolutionDepthWise,
+        NpuBlockType.Pooling,
+        NpuBlockType.ReduceSum,
+        NpuBlockType.VectorProduct,
+    )
+
+    # Select the entry of the output table
+    if block_type == NpuBlockType.ElementWise and ps.ifm_tensor.dtype == DataType.int32:
+        # int32 element-wise: entry 0 for a unary op, entry 1 for a binary op
+        out_idx = 0 if ps.ifm2_tensor is None else 1
+    elif op.type == Op.Mul and ps.ofm_tensor.dtype == DataType.int32:
+        out_idx = 2
+    elif op.type == Op.Mul:
+        out_idx = 3
+    elif block_type in acc_block_types and ps.shared_buffer.use_accumulator_element == SHRAMElements.Acc40:
+        out_idx = 3
+    elif op.type in (Op.Add, Op.Sub):
+        ifm_scale = ps.ifm_tensor.quantization.scale_f32
+        ifm2_scale = ps.ifm2_tensor.quantization.scale_f32
+        ofm_scale = ps.ofm_tensor.quantization.scale_f32
+        if "resizebilinear" in op.attrs:
+            ofm_scale = ifm2_scale
+
+        # Simple Add/Sub (entry 4) when a scale is missing or both inputs share one scale,
+        # advanced Add/Sub (entry 5) otherwise
+        is_simple = None in (ifm_scale, ifm2_scale, ofm_scale) or ifm_scale == ifm2_scale
+        out_idx = 4 if is_simple else 5
+    elif op.type.is_maxpool_op():
+        out_idx = 6
+    else:
+        out_idx = 7
+
+    # Select the entry of the activation table
+    if fused_act in (Op.Sigmoid, Op.Tanh, Op.LUT):
+        act_idx = 0
+    elif fused_act in (Op.Relu, Op.Relu6, Op.ReluN1To1):
+        act_idx = 1
+    else:
+        act_idx = 2
+
+    # The larger per-element figure determines the estimate
+    elem_count = ps.outputs[0].elements()
+    output_cost = arch.output_cycles_per_elem[out_idx]
+    activation_cost = arch.activation_cycles_per_elem[act_idx]
+    return elem_count * max(output_cost, activation_cost)
+
+
 def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], force_outputs_to_fast_storage=False):
     if block_config is None:
         block_config = ps.block_config
@@ -385,14 +448,9 @@
             replacement_read_bws[weight_tensor] = weight_tensor.bandwidth() * non_zero_fraction
             ifm_read_multiple = 1
             weight_read_multiple = non_zero_fraction
-    else:
-        if ps.placement == PassPlacement.Npu and len(ps.outputs):
-            # Assume element-wise operation going through the element pipelines.
+        elif npu_block_type == NpuBlockType.ElementWise:
             # Work out how many elements we have and calculate performance.
-            out = ps.outputs[0]
-            elms = out.elements()
-
-            cycles[PassCycles.ElementWise] = numeric_util.round_up_divide(elms, arch.num_elem_wise_units)
+            cycles[PassCycles.ElementWise] = get_output_cycle_estimate(arch, ps)
 
     # apply the desired rewrites
     for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list: