MLBEDSW-603: Improve cycle estimation in elementwise ops Signed-off-by: Diqing Zhong <diqing.zhong@arm.com> Change-Id: I9f3671041c2b1497519cf42b5f52e3cd01d9c10a (cherry picked from commit e8c989f5236cce12d07a6644329935dbbf0ee8e6)

commit: e8887a3e6ed6638b06ecac9581deaaa89b8059c0 [log] [tgz]
author: Diqing Zhong <diqing.zhong@arm.com> Thu Sep 24 09:53:48 2020 +0200
committer: patrik.gustavsson <patrik.gustavsson@arm.com> Wed Oct 21 12:09:45 2020 +0000
tree: 9bdb11e1e1318a81d14dda1c00b4acf37089d2e8
parent: 17afa2837ad366f2da32e2bc0e2659ebb35bd1d5 [diff] [blame]
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index fc148f3..e71e95b 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py

@@ -25,9 +25,12 @@
 
 from . import numeric_util
 from .architecture_features import Block
+from .architecture_features import SHRAMElements
+from .data_type import DataType
 from .nn_graph import PassPlacement
 from .nn_graph import SchedulerRewrite
 from .operation import NpuBlockType
+from .operation import Op
 from .register_command_stream_generator import get_op_kernel
 from .tensor import MemArea
 from .tensor import shape_num_elements
@@ -210,6 +213,66 @@
     return total_blocks, total_area, block_setup
 
 
+def get_output_cycle_estimate(arch, ps):
+    primary_op = ps.primary_op
+    assert primary_op
+    npu_block_type = primary_op.type.npu_block_type
+    faf = primary_op.activation
+
+    if npu_block_type == NpuBlockType.ElementWise and ps.ifm_tensor.dtype == DataType.int32:
+        if ps.ifm2_tensor is None:
+            # Unary op
+            output_perf_index = 0
+        else:
+            # Binary op
+            output_perf_index = 1
+    elif ps.primary_op.type == Op.Mul and ps.ofm_tensor.dtype == DataType.int32:
+        output_perf_index = 2
+    elif ps.primary_op.type == Op.Mul or (
+        npu_block_type
+        in (
+            NpuBlockType.ConvolutionMxN,
+            NpuBlockType.ConvolutionDepthWise,
+            NpuBlockType.Pooling,
+            NpuBlockType.ReduceSum,
+            NpuBlockType.VectorProduct,
+        )
+        and ps.shared_buffer.use_accumulator_element == SHRAMElements.Acc40
+    ):
+        output_perf_index = 3
+    elif ps.primary_op.type in (Op.Add, Op.Sub):
+        input_scale = ps.ifm_tensor.quantization.scale_f32
+        input2_scale = ps.ifm2_tensor.quantization.scale_f32
+        output_scale = ps.ofm_tensor.quantization.scale_f32
+
+        if "resizebilinear" in primary_op.attrs:
+            output_scale = input2_scale
+
+        if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale:
+            # Simple Add/Sub
+            output_perf_index = 4
+        else:
+            # Advanced Add/Sub
+            output_perf_index = 5
+    elif ps.primary_op.type.is_maxpool_op():
+        output_perf_index = 6
+    else:
+        output_perf_index = 7
+
+    if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
+        activation_perf_index = 0
+    elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
+        activation_perf_index = 1
+    else:
+        activation_perf_index = 2
+
+    num_elems = ps.outputs[0].elements()
+    cycle_per_elem = max(
+        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
+    )
+    return num_elems * cycle_per_elem
+
+
 def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=[], force_outputs_to_fast_storage=False):
     if block_config is None:
         block_config = ps.block_config
@@ -385,14 +448,9 @@
             replacement_read_bws[weight_tensor] = weight_tensor.bandwidth() * non_zero_fraction
             ifm_read_multiple = 1
             weight_read_multiple = non_zero_fraction
-    else:
-        if ps.placement == PassPlacement.Npu and len(ps.outputs):
-            # Assume element-wise operation going through the element pipelines.
+        elif npu_block_type == NpuBlockType.ElementWise:
             # Work out how many elements we have and calculate performance.
-            out = ps.outputs[0]
-            elms = out.elements()
-
-            cycles[PassCycles.ElementWise] = numeric_util.round_up_divide(elms, arch.num_elem_wise_units)
+            cycles[PassCycles.ElementWise] = get_output_cycle_estimate(arch, ps)
 
     # apply the desired rewrites
     for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
commit	e8887a3e6ed6638b06ecac9581deaaa89b8059c0	[log] [tgz]
author	Diqing Zhong <diqing.zhong@arm.com>	Thu Sep 24 09:53:48 2020 +0200
committer	patrik.gustavsson <patrik.gustavsson@arm.com>	Wed Oct 21 12:09:45 2020 +0000
tree	9bdb11e1e1318a81d14dda1c00b4acf37089d2e8
parent	17afa2837ad366f2da32e2bc0e2659ebb35bd1d5 [diff] [blame]