Add elementwise vector scalars support

Write the constant scalars into flash. If the flash storage is Dram or
OffChipFlash, DMA the scalars from flash to SRAM.

Signed-off-by: Charles Xu <charles.xu@arm.com>
Change-Id: I42300a05dfe968d623b8aec8549644549e0f54b5
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index b8ac20f..0cb40ed 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -46,6 +46,10 @@
         memory_tensor.values[start_addr:end_addr] = compressed_values
         start_addr = end_addr
 
+def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor):
+    start_addr = src_tensor.address
+    end_addr = start_addr + src_tensor.quant_values.size
+    memory_tensor.values[start_addr:end_addr] = src_tensor.quant_values
 
 def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
     if sg.placement != PassPlacement.Npu:
@@ -90,16 +94,22 @@
 
     for cps in sg.cascaded_passes:
         for ps in cps.passes:
-            if ps.placement == PassPlacement.Npu and ps.weight_tensor is not None:
-                # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
-                # is pointing at the destination address of where the weights should be placed in SRAM.
-                # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
-                if ps.weight_tensor.ops[0].type == "DMA":
-                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
-                else:
-                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
+            if ps.placement == PassPlacement.Npu:
+                if ps.weight_tensor != None:
+                    # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
+                    # is pointing at the destination address of where the weights should be placed in SRAM.
+                    # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
+                    if ps.weight_tensor.ops[0].type == "DMA":
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
+                    else:
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
 
-                copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+
+                if ps.ifm_tensor != None and ps.ifm_tensor.mem_area != MemArea.Sram:
+                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm_tensor)
+                if ps.ifm2_tensor != None and ps.ifm2_tensor.mem_area != MemArea.Sram:
+                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm2_tensor)
 
     sg.command_stream_tensor = make_memory_tensor(
         sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch