MLBEDSW-603: Improve cycle estimation in elementwise ops

Signed-off-by: Diqing Zhong <diqing.zhong@arm.com>
Change-Id: I9f3671041c2b1497519cf42b5f52e3cd01d9c10a
(cherry picked from commit e8c989f5236cce12d07a6644329935dbbf0ee8e6)
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 3ef4d1b..04c1c62 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -136,15 +136,14 @@
 
 class ArchitectureFeatures:
     """This class is a container for various parameters of the Ethos-U55 core
-and system configuration that can be tuned, either by command line
-parameters or by the Ethos-U55 architects. The class is often passed
-around to passes that need to do architecture-dependent actions.
+    and system configuration that can be tuned, either by command line
+    parameters or by the Ethos-U55 architects. The class is often passed
+    around to passes that need to do architecture-dependent actions.
 
-Note the difference between ArchitectureFeatures and CompilerOptions
-- ArchitectureFeatures is for changing the Ethos-U55 and system architecture
-- CompilerOptions is for changing the behaviour of the compiler
-
-"""
+    Note the difference between ArchitectureFeatures and CompilerOptions
+    - ArchitectureFeatures is for changing the Ethos-U55 and system architecture
+    - CompilerOptions is for changing the behaviour of the compiler
+    """
 
     ArchitectureConfig = namedtuple(
         "ArchitectureConfig", "macs cores ofm_ublock ifm_ublock shram_banks shram_granules elem_units"
@@ -239,6 +238,9 @@
 
         self.memory_bandwidths_per_second = self.memory_bandwidths_per_cycle * self.npu_clock
 
+        # Get output/activation performance numbers
+        self._generate_output_perf_tables(self.accelerator_config)
+
         # sizes as N x H x W x C. we need to round up to these when allocating storage
         self.storage_rounding_quantums = {
             TensorFormat.Unknown: (1, 1, 1, 1),
@@ -374,6 +376,24 @@
                     key = ArchitectureFeatures.make_block_config_key(w, h, c)
                     self.block_config_map[key] = self.generate_block_config(w, h, c)
 
+    def _generate_output_perf_tables(self, accel_config):  # stores two per-accelerator lookup tuples on self; returns nothing
+        if accel_config == Accelerator.Ethos_U55_32:
+            self.output_cycles_per_elem = (2.0, 3.0, 3.0, 3.0, 4.0, 6.0, 1.0, 2.0)  # 8 floats; what each index selects is decided by the code reading this table (not shown here)
+            self.activation_cycles_per_elem = (1.0, 1.0, 0.0)  # 3 floats; index meaning likewise defined by the reader
+        elif accel_config == Accelerator.Ethos_U55_64:
+            self.output_cycles_per_elem = (1.0, 1.5, 1.5, 1.5, 2.0, 3.0, 0.5, 1.0)  # every entry is half of the Ethos_U55_32 row
+            self.activation_cycles_per_elem = (1.0, 1.0, 0.0)  # unchanged from Ethos_U55_32
+        elif accel_config == Accelerator.Ethos_U55_128:
+            self.output_cycles_per_elem = (0.75, 1.25, 0.75, 0.75, 1.0, 1.5, 0.25, 0.5)  # not a uniform scaling of the Ethos_U55_64 row
+            self.activation_cycles_per_elem = (1.0, 0.5, 0.0)
+        elif accel_config in (Accelerator.Ethos_U55_256, Accelerator.Yoda_256):  # both 256 configs share one set of numbers
+            self.output_cycles_per_elem = (0.625, 1.125, 0.5, 0.375, 0.5, 0.75, 0.125, 0.25)
+            self.activation_cycles_per_elem = (1.0, 0.25, 0.0)
+        else:  # any config not matched above ends up here
+            assert accel_config == Accelerator.Yoda_512  # guards against an unhandled config; note asserts are skipped under python -O
+            self.output_cycles_per_elem = (0.3125, 0.5625, 0.25, 0.1875, 0.25, 0.375, 0.0625, 0.125)  # every entry is half of the 256 row
+            self.activation_cycles_per_elem = (0.5, 0.125, 0.0)  # likewise half of the 256 row
+
     def calc_ifm_block_depth(self, ifm_depth, ifm_bits):
         assert ifm_bits in (8, 16, 32)
         assert ifm_depth > 0