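MLBEDSW-3465: Add memory settings to sys config

Read the per-memory-area burst length and read/write latency from the
system configuration instead of hard-coding them in npu_performance.py.
Move BandwidthDirection from npu_performance.py to tensor.py, where
architecture_features.py now imports it.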

Signed-off-by: Diqing Zhong <diqing.zhong@arm.com>
Change-Id: I4a5c53d0c5957595fc639b174b2b227ea043d409
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 576f793..9edc87e 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -32,6 +32,7 @@
 from .operation import NpuBlockType
 from .operation import PointXYZ
 from .supported_operators import SupportedOperators
+from .tensor import BandwidthDirection
 from .tensor import MemArea
 from .tensor import MemType
 from .tensor import TensorFormat
@@ -465,6 +466,12 @@
             self.axi1_port = MemArea.Dram
             self.memory_clock_scales[MemArea.Sram] = 1.0
             self.memory_clock_scales[MemArea.Dram] = 0.75  # 3 / 4
+            self.memory_burst_length[MemArea.Sram] = 32
+            self.memory_burst_length[MemArea.Dram] = 128
+            self.memory_latency[MemArea.Sram][BandwidthDirection.Read] = 32
+            self.memory_latency[MemArea.Sram][BandwidthDirection.Write] = 32
+            self.memory_latency[MemArea.Dram][BandwidthDirection.Read] = 500
+            self.memory_latency[MemArea.Dram][BandwidthDirection.Write] = 250
         else:
             # Default Ethos-U55 system configuration
             # Ethos-U55 High-End Embedded: SRAM (4 GB/s) and Flash (0.5 GB/s)
@@ -473,6 +480,12 @@
             self.axi1_port = MemArea.OffChipFlash
             self.memory_clock_scales[MemArea.Sram] = 1.0
             self.memory_clock_scales[MemArea.OffChipFlash] = 0.125  # 1 / 8
+            self.memory_burst_length[MemArea.Sram] = 32
+            self.memory_burst_length[MemArea.OffChipFlash] = 128
+            self.memory_latency[MemArea.Sram][BandwidthDirection.Read] = 32
+            self.memory_latency[MemArea.Sram][BandwidthDirection.Write] = 32
+            self.memory_latency[MemArea.OffChipFlash][BandwidthDirection.Read] = 64
+            self.memory_latency[MemArea.OffChipFlash][BandwidthDirection.Write] = 64
 
     def _set_default_mem_mode(self):
         # ArchitectureFeatures.DEFAULT_CONFIG values
@@ -500,6 +513,8 @@
         self.axi0_port = MemArea(1)
         self.axi1_port = MemArea(1)
         self.memory_clock_scales = np.ones(MemArea.Size)
+        self.memory_burst_length = np.ones(MemArea.Size)
+        self.memory_latency = np.zeros((MemArea.Size, BandwidthDirection.Size))
         self.const_mem_area = MemPort(1)
         self.arena_mem_area = MemPort(1)
         self.cache_mem_area = MemPort(1)
@@ -526,7 +541,25 @@
                         sys_cfg_section, mem_area.name + "_clock_scale", self.memory_clock_scales[mem_area]
                     )
                 )
-
+                self.memory_burst_length[mem_area] = int(
+                    self._read_config(
+                        sys_cfg_section, mem_area.name + "_burst_length", self.memory_burst_length[mem_area]
+                    )
+                )
+                self.memory_latency[mem_area][BandwidthDirection.Read] = int(
+                    self._read_config(
+                        sys_cfg_section,
+                        mem_area.name + "_read_latency",
+                        self.memory_latency[mem_area][BandwidthDirection.Read],
+                    )
+                )
+                self.memory_latency[mem_area][BandwidthDirection.Write] = int(
+                    self._read_config(
+                        sys_cfg_section,
+                        mem_area.name + "_write_latency",
+                        self.memory_latency[mem_area][BandwidthDirection.Write],
+                    )
+                )
         elif self.system_config == ArchitectureFeatures.DEFAULT_CONFIG:
             self._set_default_sys_config()
 
@@ -578,6 +611,8 @@
                     self.const_mem_area = MemPort.Axi0
                     self.axi0_port = MemArea.OnChipFlash
                 self.memory_clock_scales[MemArea.OnChipFlash] = self.memory_clock_scales[MemArea.Sram]
+                self.memory_burst_length[MemArea.OnChipFlash] = self.memory_burst_length[MemArea.Sram]
+                self.memory_latency[MemArea.OnChipFlash] = self.memory_latency[MemArea.Sram]
 
         # check configuration
         if self._mem_port_mapping(self.cache_mem_area) != MemArea.Sram:
@@ -623,6 +658,9 @@
             print(f"   axi1_port = {self.axi1_port.name}")
             for mem in (MemArea.Sram, MemArea.Dram, MemArea.OnChipFlash, MemArea.OffChipFlash):
                 print(f"   {mem.name}_clock_scales = {self.memory_clock_scales[mem]}")
+                print(f"   {mem.name}_burst_length = {self.memory_burst_length[mem]}")
+                print(f"   {mem.name}_read_latency = {self.memory_latency[mem][BandwidthDirection.Read]}")
+                print(f"   {mem.name}_write_latency = {self.memory_latency[mem][BandwidthDirection.Write]}")
 
             print(f"Memory Mode ({self.memory_mode}):")
             print(f"   const_mem_area = {self.const_mem_area.name}")
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 8ada1e2..9d83f6f 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -33,6 +33,7 @@
 from .operation import NpuBlockType
 from .operation import Op
 from .shared_buffer_allocation import is_acc_40bits_used
+from .tensor import BandwidthDirection
 from .tensor import MemArea
 from .tensor import shape_num_elements
 from .tensor import Tensor
@@ -90,22 +91,6 @@
         )
 
 
-class BandwidthDirection(IntEnum):
-    Read = 0
-    Write = auto()
-    Size = auto()
-
-    def display_name(self):
-        return self.name
-
-    def identifier_name(self):
-        return self.name.lower()
-
-    @staticmethod
-    def all():
-        return (BandwidthDirection.Read, BandwidthDirection.Write)
-
-
 def make_bandwidth_array():
     return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))
 
@@ -133,8 +118,6 @@
 
 
 def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, dpu_cycles=0):
-    latencies_rd = {MemArea.Sram: 32, MemArea.Dram: 500, MemArea.OnChipFlash: 64, MemArea.OffChipFlash: 64}
-    latencies_wr = {MemArea.Sram: 32, MemArea.Dram: 250, MemArea.OnChipFlash: 64, MemArea.OffChipFlash: 64}
     ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
     ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
     cycles_ifm_blk = (
@@ -146,11 +129,11 @@
         / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
     )
     return (
-        latencies_rd[ifm_tensor.mem_area]
+        arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read]
         + cycles_ifm_blk
         + dpu_cycles
         + output_cycles
-        + latencies_wr[ofm_tensor.mem_area]
+        + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write]
         + cycles_ofm_blk
     ) / 4
 
@@ -351,13 +334,12 @@
     )
 
     if scale_tensor:
-        if scale_tensor.mem_area is MemArea.Sram:
-            latency = 32
-        elif scale_tensor.mem_area is MemArea.Dram:
-            latency = 500
-        else:
-            latency = 64
-        cycles_bias_blk = 10 * min(ofm_block.depth, ofm_tens_shape[3]) * latency / 256
+        cycles_bias_blk = (
+            10
+            * min(ofm_block.depth, ofm_tens_shape[3])
+            * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read]
+            / 256
+        )
         cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)
 
     cycles_cmd = get_minimal_cmd_cycles(
@@ -380,7 +362,6 @@
 
     # Estimate memory transfer efficiency by calculating the burst length
     # this is related to data format, block shape, and tensor shape, etc.
-    max_burst_len = 32 if mem_area == MemArea.Sram else 128
     burst_len = 0
     elem_size = tensor.dtype.size_in_bytes()
     is_ifm = direction == BandwidthDirection.Read
@@ -408,10 +389,10 @@
             else:
                 burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)
 
-    burst_len = min(max_burst_len, burst_len)
+    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
     bw = tens.bandwidth() if replace_bw is None else replace_bw
 
-    return bw * (max_burst_len / burst_len)
+    return bw * (arch.memory_burst_length[mem_area] / burst_len)
 
 
 def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index de97710..257cb5f 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -19,6 +19,7 @@
 import enum
 import uuid
 from collections import defaultdict
+from enum import auto
 from functools import lru_cache
 from typing import Dict
 from typing import List
@@ -62,6 +63,22 @@
         return self.name
 
 
+class BandwidthDirection(enum.IntEnum):  # read/write axis for bandwidth and latency arrays
+    Read = 0
+    Write = auto()
+    Size = auto()  # count of the directions above; used as an array dimension
+
+    def display_name(self):  # member name exactly as declared, e.g. "Read"
+        return self.name
+
+    def identifier_name(self):  # lower-cased member name, e.g. "read"
+        return self.name.lower()
+
+    @staticmethod
+    def all():  # the real directions only; the Size sentinel is excluded
+        return (BandwidthDirection.Read, BandwidthDirection.Write)
+
+
 class MemArea(enum.IntFlag):
     Unknown = 0
     Sram = 1