MLBEDSW-2551 Add support for more mem-cfgs

Added support for one more memory configuration: the fast storage
memory area is now serialised into its own scratch_fast tensor,
alongside the existing scratch and flash tensors. The scratch_fast
tensor is created with size 0 so that a minimal allocation in the
tensor arena reserves a slot in the base pointer registers, allowing
the scratch_fast address to be overridden at run time.
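
Illustrative sketch (the caller shown here is an assumption, not part
of this patch): code that serialises consecutive NPU subgraphs now
threads the extra scratch_fast tensor through the updated signature,
roughly as follows.

    # Hedged sketch of a caller, not taken from this patch; nng (the
    # network graph) and arch (the architecture config) are assumed to
    # already be in scope.
    from ethosu.vela.npu_serialisation import serialise_npu_subgraph_into_tensors

    scratch_tens = scratch_fast_tens = flash_tens = None
    for sg in nng.subgraphs:
        # Tensors created for the first NPU subgraph are passed back in
        # so that later subgraphs reuse them.
        scratch_tens, scratch_fast_tens, flash_tens = serialise_npu_subgraph_into_tensors(
            nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens
        )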

Change-Id: Iac19992386e3e9b80bd519acb1b0a399c47d736f
Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index bd13a3e..2d1c6b1 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -55,12 +55,13 @@
     memory_tensor.values[start_addr:end_addr] = src_tensor.quant_values
 
 
-def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
+def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens):
     if sg.placement != PassPlacement.Npu:
-        return scratch_tens, flash_tens
+        return scratch_tens, scratch_fast_tens, flash_tens
 
     flash_area = arch.permanent_storage_mem_area
     scratch_area = arch.feature_map_storage_mem_area
+    scratch_fast_area = arch.fast_storage_mem_area
 
     flash_size = sg.memory_used.get(flash_area, 0)
     scratch_size = sg.memory_used.get(scratch_area, 0)
@@ -85,6 +86,10 @@
     nng.total_size[scratch_area] = nng.total_size.get(scratch_area, 0) - scratch_size
     nng.total_elements[scratch_area] = nng.total_elements.get(scratch_area, 0) - scratch_size
 
+    if scratch_area != scratch_fast_area:
+        nng.total_size[scratch_fast_area] = nng.total_size.get(scratch_fast_area, 0)
+        nng.total_elements[scratch_fast_area] = nng.total_elements.get(scratch_fast_area, 0)
+
     if flash_tens == scratch_tens is None:
         # First Npu subgraph, create scratch and flash tensors
         sg.scratch_tensor = make_memory_tensor(
@@ -94,12 +99,22 @@
         sg.flash_tensor = make_memory_tensor(
             sg.name + "_flash", flash_area, MemType.Permanent_CPU, flash_size, True, arch
         )
+        # Scratch fast tensor size set to 0. This forces a minimal allocation in the tensor arena
+        # which causes a slot in the basep registers to be reserved, so that the scratch fast tensor
+        # address can be overridden.
+        sg.scratch_fast_tensor = make_memory_tensor(
+            sg.name + "_scratch_fast", scratch_fast_area, MemType.Scratch, 0, False, arch
+        )
+        sg.scratch_fast_tensor.purpose = TensorPurpose.Scratch
     else:
         sg.scratch_tensor = scratch_tens
         sg.scratch_tensor.shape[0] += scratch_size
         sg.flash_tensor = flash_tens
         sg.flash_tensor.shape[0] += flash_size
 
+        sg.scratch_fast_tensor = scratch_fast_tens
+        sg.scratch_fast_tensor.shape[0] = 0
+
     for cps in sg.cascaded_passes:
         for ps in cps.passes:
             if ps.placement == PassPlacement.Npu:
@@ -126,7 +141,7 @@
     )
     sg.command_stream_tensor.values = np.frombuffer(payload_bytes, dtype=np.uint8)
 
-    return sg.scratch_tensor, sg.flash_tensor
+    return sg.scratch_tensor, sg.scratch_fast_tensor, sg.flash_tensor
 
 
 def add_const_tens_to_startup_cascaded_pass(startup_cps, tens):
@@ -152,11 +167,16 @@
                     op.attrs["custom_type"] = op.type
 
                     sz = 0
-                    for tens in [callee.scratch_tensor, callee.flash_tensor, callee.command_stream_tensor]:
+                    for tens in [
+                        callee.scratch_fast_tensor,
+                        callee.scratch_tensor,
+                        callee.flash_tensor,
+                        callee.command_stream_tensor,
+                    ]:
                         op.inputs.insert(0, tens)
                         ps.inputs.insert(0, tens)
                         cps.inputs.insert(0, tens)
-                        if tens != callee.scratch_tensor:
+                        if tens != callee.scratch_tensor and tens != callee.scratch_fast_tensor:
                             add_const_tens_to_startup_cascaded_pass(startup_cps, tens)
                         sz += tens.storage_size()
 
@@ -166,3 +186,7 @@
                     if callee.scratch_tensor is not None:
                         if callee.scratch_tensor.mem_area == MemArea.Sram:
                             cps.sram_used += callee.scratch_tensor.storage_size()
+
+                    if callee.scratch_fast_tensor is not None:
+                        if callee.scratch_fast_tensor.mem_area == MemArea.Sram:
+                            cps.sram_used += callee.scratch_fast_tensor.storage_size()