Add elementwise vector scalars support

Write the constant scalars into flash. If the scalars reside in Dram or
OffChipFlash rather than in the fast storage area, DMA them from flash
to SRAM.
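
For reference, the new fixup_elementwise_with_scalars pass pads the
lower-rank input of a binary elementwise op up to the rank of the other
input. A minimal sketch of the intended shape handling, assuming
numeric_util.full_shape(dim, shape, fill) left-pads shape with fill up
to dim dimensions (illustration only, not part of the change):

    ifm_shape = [1, 8, 8, 16]   # 4D feature map
    ifm2_shape = [16]           # lower-rank constant input
    diff = len(ifm_shape) - len(ifm2_shape)
    if diff > 0:
        # e.g. full_shape(4, [16], 1) -> [1, 1, 1, 16]
        ifm2_shape = [1] * diff + ifm2_shape
    assert ifm2_shape == [1, 1, 1, 16]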

Signed-off-by: Charles Xu <charles.xu@arm.com>
Change-Id: I42300a05dfe968d623b8aec8549644549e0f54b5
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index 913b9a6..351716e 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -25,6 +25,7 @@
 from .operation import NpuBlockType
 from .operation import Operation
 from .tensor import Tensor
+from .numeric_util import full_shape
 
 passthrough_nodes = set(("Identity",))
 
@@ -313,6 +314,7 @@
 depthwise_op = set(("DepthwiseConv2dNative", "DepthwiseConv2dBiasAct",))
 pool_op = set(("AvgPool", "MaxPool", "QuantizedAvgPool", "QuantizedMaxPool", "AvgPoolAct", "MaxPoolAct", "ResizeBilinear",))
 elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum", "LeakyRelu", "Abs"))
+binary_elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum"))
 activation_ops = set(("Relu", "Relu6", "ReluN1To1", "Sigmoid", "Tanh"))
 memory_only_ops = set(("Reshape",))
 
@@ -399,6 +401,16 @@
             op.type = "Identity"
     return op
 
+def fixup_elementwise_with_scalars(op, arch):
+    if op.type in binary_elementwise_op:
+        ifm_tensor, ifm2_tensor, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
+        if ifm2_tensor.shape != [] and ifm_tensor.shape != []:
+            diff = len(ifm_tensor.shape) - len(ifm2_tensor.shape)
+            if diff > 0:
+                ifm2_tensor.shape = full_shape(len(ifm_tensor.shape), ifm2_tensor.shape, 1)
+            elif diff < 0:
+                ifm_tensor.shape = full_shape(len(ifm2_tensor.shape), ifm_tensor.shape, 1)
+    return op
 
 # Set input/output tensor equivalence to the same id for memory operations
 def set_tensor_equivalence(op, arch):
@@ -492,6 +504,7 @@
         fixup_act_reorder,
         add_padding_fields,
         mark_npu_block_type,
+        fixup_elementwise_with_scalars,
         # convert_mul_max_to_abs_or_lrelu # TODO: enable optimisation once quantisation issues are resolved
     ]
 
diff --git a/ethosu/vela/high_level_command_stream_generator.py b/ethosu/vela/high_level_command_stream_generator.py
index ef21e06..0cc70a7 100644
--- a/ethosu/vela/high_level_command_stream_generator.py
+++ b/ethosu/vela/high_level_command_stream_generator.py
@@ -24,17 +24,18 @@
 from .nn_graph import PassPlacement
 from .nn_graph import SchedulingStrategy
 from .operation import NpuBlockType
+from .tensor import TensorPurpose
 
 
 def need_dma(tens):
     return len(tens.ops) == 1 and tens.ops[0].type == "DMA"
 
 
-def dma_weights_if_necessary(ps, box, weight_tensor):
-    if need_dma(weight_tensor):
-        dma_op = weight_tensor.ops[0]
+def dma_if_necessary(ps, box, tensor):
+    if need_dma(tensor):
+        dma_op = tensor.ops[0]
         in_tensor = dma_op.inputs[0]
-        yield DMA(in_tensor, weight_tensor, box)
+        yield DMA(in_tensor, tensor, box)
 
 
 def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
@@ -115,6 +116,13 @@
             else:
                 ifm2_box = Box([], [])
 
+            for intermediate in ps.intermediates:
+                if intermediate is not None and intermediate.shape != [] and intermediate.purpose == TensorPurpose.FeatureMap:
+                    intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+                        strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
+                    )
+                    yield from dma_if_necessary(ps, intermediate_box, intermediate)
+
             weight_box = None
             if weight_tensor is not None:
                 weight_oc_start = start
@@ -130,7 +138,7 @@
                     weight_oc_end,
                     weight_tensor.weight_transpose_depthwise,
                 )
-                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+                yield from dma_if_necessary(ps, weight_box, weight_tensor)
 
             yield NpuStripe(
                 ps,
@@ -201,6 +209,13 @@
                 strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
             )
 
+            for intermediate in ps.intermediates:
+                if intermediate is not None and intermediate.shape != [] and intermediate.purpose == TensorPurpose.FeatureMap:
+                    intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
+                        strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
+                    )
+                    yield from dma_if_necessary(ps, intermediate_box, intermediate)
+
             ifm_y_needed = 1
             if len(ifm_box.end_coord) >= 3:
                 ifm_y_needed = ifm_box.end_coord[-3]
@@ -217,7 +232,7 @@
                 weight_box = Box.make_weight_box(
                     weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                 )
-                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)
+                yield from dma_if_necessary(ps, weight_box, weight_tensor)
 
             # Check if first/last stripe in pass
             is_first_h_stripe = start == y_start
diff --git a/ethosu/vela/insert_dma.py b/ethosu/vela/insert_dma.py
index 703ab9d..b1b8985 100644
--- a/ethosu/vela/insert_dma.py
+++ b/ethosu/vela/insert_dma.py
@@ -21,6 +21,7 @@
 from .tensor import MemArea
 from .tensor import TensorPurpose
 
+binary_elementwise_op = set(("AddAct", "MulAct", "SubAct", "Maximum", "Minimum"))
 
 def insert_dma_cmd(op, arch):
     if op.type == "DMA":
@@ -28,7 +29,9 @@
     for idx, tens in enumerate(op.inputs):
 
         if tens.mem_area in (MemArea.Dram, MemArea.OffChipFlash) and tens.mem_area != arch.fast_storage_mem_area:
-            if tens.purpose == TensorPurpose.Weights:
+            if (tens.purpose == TensorPurpose.Weights or
+                (tens.purpose == TensorPurpose.FeatureMap and
+                 op.type in binary_elementwise_op)):
                 only_vector_product_consumers = True
                 for oper in tens.consumers():
                     if oper is None or oper.attrs.get("npu_block_type") != NpuBlockType.VectorProduct:
diff --git a/ethosu/vela/npu_serialisation.py b/ethosu/vela/npu_serialisation.py
index b8ac20f..0cb40ed 100644
--- a/ethosu/vela/npu_serialisation.py
+++ b/ethosu/vela/npu_serialisation.py
@@ -46,6 +46,10 @@
         memory_tensor.values[start_addr:end_addr] = compressed_values
         start_addr = end_addr
 
+def copy_ifm_values_to_memory_tensor(memory_tensor, src_tensor):
+    start_addr = src_tensor.address
+    end_addr = start_addr + src_tensor.quant_values.size
+    memory_tensor.values[start_addr:end_addr] = src_tensor.quant_values
 
 def serialise_npu_subgraph_into_tensors(nng, sg, arch, scratch_tens, flash_tens):
     if sg.placement != PassPlacement.Npu:
@@ -90,16 +94,22 @@
 
     for cps in sg.cascaded_passes:
         for ps in cps.passes:
-            if ps.placement == PassPlacement.Npu and ps.weight_tensor is not None:
-                # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
-                # is pointing at the destination address of where the weights should be placed in SRAM.
-                # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
-                if ps.weight_tensor.ops[0].type == "DMA":
-                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
-                else:
-                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
+            if ps.placement == PassPlacement.Npu:
+                if ps.weight_tensor is not None:
+                    # For DMA ops, ps.weight_tensor is referring to the SRAM weight tensor and therefore the address
+                    # is pointing at the destination address of where the weights should be placed in SRAM.
+                    # This ensures that the Flash weight tensor is used instead and thus gets the correct address.
+                    if ps.weight_tensor.ops[0].type == "DMA":
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor.ops[0].inputs[0])
+                    else:
+                        copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.weight_tensor)
 
-                copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+                    copy_compressed_values_to_memory_tensor(sg.flash_tensor, ps.scale_tensor)
+
+                if ps.ifm_tensor is not None and ps.ifm_tensor.mem_area != MemArea.Sram:
+                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm_tensor)
+                if ps.ifm2_tensor is not None and ps.ifm2_tensor.mem_area != MemArea.Sram:
+                    copy_ifm_values_to_memory_tensor(sg.flash_tensor, ps.ifm2_tensor)
 
     sg.command_stream_tensor = make_memory_tensor(
         sg.name + "_command_stream", flash_area, command_stream_size_bytes, True, arch
diff --git a/ethosu/vela/supported_operators.py b/ethosu/vela/supported_operators.py
index e527145..574b3a4 100644
--- a/ethosu/vela/supported_operators.py
+++ b/ethosu/vela/supported_operators.py
@@ -229,13 +229,6 @@
         if op.type in self.binary_elem_wise_main_ops: # if op type is unary, ifm2_tensor is None
             if len(ifm2_tensor.shape) > 2 and ifm2_tensor.shape[0] != 1:
                 return False
-
-        # check scalar size
-        if hasattr(ifm_tensor.values, "__len__") and len(ifm_tensor.values) > 1:
-            return False
-        if op.type in self.binary_elem_wise_main_ops: # same as above
-            if hasattr(ifm2_tensor.values, "__len__") and len(ifm2_tensor.values) > 1:
-                return False
         return True
 
     def check_memory_only_restrictions(self, op):