MLBEDSW-2551 Add support for more mem-cfgs

Added support for one more memory configuration, in which
fast_storage_mem_area differs from feature_map_storage_mem_area.
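
When the two areas differ, a second scratch tensor, scratch_fast, is
serialised alongside the existing scratch tensor and added as an input
to the Npu subgraph so that TensorFlow Lite Micro allocates it and its
address can be overridden. The tflite writer reserves an additional
buffer index for it, and a warning is printed if the scratch_fast
allocation exceeds the available Sram. The scheduler now also leaves
IFM/OFM overlap disabled when feature maps are not stored in Sram.

A minimal system config exercising the new path, assuming the usual
vela.ini key/value format (the section name below is hypothetical):

    [SysConfig.Example_Separate_Fast_Storage]
    permanent_storage_mem_area=Dram
    feature_map_storage_mem_area=Dram
    fast_storage_mem_area=Sram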

Change-Id: Iac19992386e3e9b80bd519acb1b0a399c47d736f
Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 1dce435..6460c52 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -274,8 +274,9 @@
         self.cycles_weight = 40
         self.max_sram_used_weight = 1000
 
-        if self.is_yoda_system:
-            self.max_sram_used_weight = 1000
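+        # with separate fast storage and feature map areas, set the Sram weight-usage limit to zero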
+        if self.is_yoda_system and (self.fast_storage_mem_area != self.feature_map_storage_mem_area):
+            self.max_sram_used_weight = 0
 
         # Shared Buffer Block allocations
         self.shram_bank_size = 1024  # bytes
@@ -587,10 +588,6 @@
             self.fast_storage_mem_area = MemArea[self.__sys_config("fast_storage_mem_area", "Sram")]
             self.feature_map_storage_mem_area = MemArea[self.__sys_config("feature_map_storage_mem_area", "Sram")]
 
-            if self.fast_storage_mem_area != self.feature_map_storage_mem_area:
-                raise Exception(
-                    "Invalid memory configuration fast_storage_mem_area must be same as feature_map_storage_mem_area"
-                )
             self.permanent_storage_mem_area = MemArea[self.__sys_config("permanent_storage_mem_area", "OffChipFlash")]
             if is_yoda_system:
                 if self.permanent_storage_mem_area is not MemArea.Dram:
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index e495f1c..b5a6c42 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -126,6 +126,7 @@
 
     # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
     scratch_tens = None
+    scratch_fast_tens = None
     flash_tens = None
 
     # Calculate live ranges for all constant Npu tensors, in permanent storage
@@ -199,12 +200,17 @@
         register_command_stream_generator.generate_register_command_stream(
             nng, sg, arch, options.verbose_register_command_stream
         )
-        scratch_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
-            nng, sg, arch, scratch_tens, flash_tens
+        scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
+            nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens
         )
 
     npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)
 
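+    # warn when the scratch_fast tensor does not fit in the configured Sram size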
+    if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area):
+        if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size:
+            print("Warning: Sram limit has been exceeded by the scratch fast tensor")
+
     # Allocate all Cpu constant tensors, this is done last because the Npu-ops
     # have to be serialized into flash and scratch tensors first
     tensor_allocation.allocate_tensors(
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index bd13a3e..2d1c6b1 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -55,12 +55,13 @@
     memory_tensor.values[start_addr:end_addr] = src_tensor.quant_values
 
 
-def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
+def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens):
     if sg.placement != PassPlacement.Npu:
-        return scratch_tens, flash_tens
+        return scratch_tens, scratch_fast_tens, flash_tens
 
     flash_area = arch.permanent_storage_mem_area
     scratch_area = arch.feature_map_storage_mem_area
+    scratch_fast_area = arch.fast_storage_mem_area
 
     flash_size = sg.memory_used.get(flash_area, 0)
     scratch_size = sg.memory_used.get(scratch_area, 0)
@@ -85,6 +86,11 @@
     nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size
     nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size
 
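+    # ensure the fast storage area is present in the memory usage totals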
+    if scratch_area != scratch_fast_area:
+        nng.total_size[scratch_fast_area] = nng.total_size.get(scratch_fast_area, 0)
+        nng.total_elements[scratch_fast_area] = nng.total_elements.get(scratch_fast_area, 0)
+
     if flash_tens == scratch_tens is None:
         # First Npu subgraph, create scratch and flash tensors
         sg.scratch_tensor = make_memory_tensor(
@@ -94,12 +100,23 @@
         sg.flash_tensor = make_memory_tensor(
             sg.name + "_flash", flash_area, MemType.Permanent_CPU, flash_size, True, arch
         )
+        # The scratch_fast tensor is created with size 0. This forces a minimal
+        # allocation in the tensor arena, which reserves a slot in the basep
+        # registers so that the scratch_fast tensor address can be overridden.
+        sg.scratch_fast_tensor = make_memory_tensor(
+            sg.name + "_scratch_fast", scratch_fast_area, MemType.Scratch, 0, False, arch
+        )
+        sg.scratch_fast_tensor.purpose = TensorPurpose.Scratch
     else:
         sg.scratch_tensor = scratch_tens
         sg.scratch_tensor.shape[0] += scratch_size
         sg.flash_tensor = flash_tens
         sg.flash_tensor.shape[0] += flash_size
 
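+        # the scratch_fast tensor is never grown; it only reserves a basep slot (see above)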
+        sg.scratch_fast_tensor = scratch_fast_tens
+        sg.scratch_fast_tensor.shape[0] = 0
+
     for cps in sg.cascaded_passes:
         for ps in cps.passes:
             if ps.placement == PassPlacement.Npu:
@@ -126,7 +143,7 @@
     )
     sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)
 
-    return sg.scratch_tensor, sg.flash_tensor
+    return sg.scratch_tensor, sg.scratch_fast_tensor, sg.flash_tensor
 
 
 def add_const_tens_to_startup_cascaded_pass(startup_cps, tens):
@@ -152,11 +169,17 @@
                     op.attrs["custom_type"] = op.type
 
                     sz = 0
-                    for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]:
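+                    # each tensor is inserted at index 0, so they end up in reverse of the list order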
+                    for tens in [
+                        callee.scratch_fast_tensor,
+                        callee.scratch_tensor,
+                        callee.flash_tensor,
+                        callee.command_stream_tensor,
+                    ]:
                         op.inputs.insert(0, tens)
                         ps.inputs.insert(0, tens)
                         cps.inputs.insert(0, tens)
-                        if tens != callee.scratch_tensor:
+                        if tens != callee.scratch_tensor and tens != callee.scratch_fast_tensor:
                             add_const_tens_to_startup_cascaded_pass(startup_cps, tens)
                         sz += tens.storage_size()
 
@@ -166,3 +189,7 @@
                     if callee.scratch_tensor is not None:
                         if callee.scratch_tensor.mem_area == MemArea.Sram:
                             cps.sram_used += callee.scratch_tensor.storage_size()
+
+                    if callee.scratch_fast_tensor is not None:
+                        if callee.scratch_fast_tensor.mem_area == MemArea.Sram:
+                            cps.sram_used += callee.scratch_fast_tensor.storage_size()
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index be104b8..36bb3c2 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -232,7 +232,8 @@
 
         if self.arch.feature_map_storage_mem_area != MemArea.Sram:
             self.use_ifm_ofm_overlap = False  # force off IFM/OFM overlap if IFMs and OFMs are not in the SRAM
-        self.use_ifm_ofm_overlap = options.use_ifm_ofm_overlap
+        else:
+            self.use_ifm_ofm_overlap = options.use_ifm_ofm_overlap
 
         self.verbose_schedule = options.verbose_schedule
         self.verbose_pareto_frontier_schedules = options.verbose_pareto_frontier_schedules
diff --git a/ethosu/vela/tflite_writer.py b/ethosu/vela/tflite_writer.py
index 4aa23b5..cf40b5b 100644
--- a/ethosu/vela/tflite_writer.py
+++ b/ethosu/vela/tflite_writer.py
@@ -142,7 +142,8 @@
 
         buffer_map = {}
 
-        buf_idx = 1
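+        # start at 2; one more buffer index is now reserved up front, for the scratch_fast tensor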
+        buf_idx = 2
 
         for tens in tensors:
             # Set buffer ids depending on allocation
@@ -314,7 +315,13 @@
 
         all_tensors = [tens for nm, idx, tens in sorted((tens.name, idx, tens) for idx, tens in enumerate(tensor_set))]
 
-        scratch_tensors = [tens for tens in all_tensors if tens.purpose == TensorPurpose.Scratch]
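+        # identify scratch tensors by name, since scratch and scratch_fast now share TensorPurpose.Scratch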
+        scratch_tensors = [tens for tens in all_tensors if tens.name.endswith("scratch")]
+
+        scratch_fast_tensor = None
+        for tens in all_tensors:
+            if tens.name.endswith("scratch_fast"):
+                scratch_fast_tensor = tens
 
         if len(scratch_tensors) == 0:
             scratch_tensor = None
@@ -331,11 +338,16 @@
         assert all(inp in sg.original_inputs for inp in sg.input_tensors)
         inputs = [self.tensor_map[tens] for tens in sg.original_inputs]
 
-        # Add the Scratch Tensor as input to the NPU subgraph to get it allocated by TensorFlow Lite Micro
+        # Add the Scratch Tensors as input to the NPU subgraph to get them allocated by TensorFlow Lite Micro
         scratch_tensor_idx = self.tensor_map.get(scratch_tensor, None)
+        scratch_fast_tensor_idx = self.tensor_map.get(scratch_fast_tensor, None)
+
         if scratch_tensor_idx is not None and scratch_tensor_idx not in inputs:
             inputs.append(scratch_tensor_idx)
 
+        if scratch_fast_tensor_idx is not None and scratch_fast_tensor_idx not in inputs:
+            inputs.append(scratch_fast_tensor_idx)
+
         inputs_offset = self.write_int_vector(inputs)
         outputs_offset = self.write_int_vector([self.tensor_map[tens] for tens in sg.output_tensors])