MLBEDSW-6260: Add support for using DMA to copy feature maps

- Reshape ops can be bypassed and there is no need for the NPU to process them.
There are use cases where the IFM must be preserved, so a memcpy is needed.
This is implemented by an AvgPool.
- In order to reduce the cost of the AvgPool, the IFM can be copied by DMA.
This is faster, and it can also be turned into a real NOP in cases where
the IFM and the OFM can use the same memory space.
- Added a new memcpy op. Only the NHWC format is supported, since DMA cannot
change the format on the fly.
- Allow the OFM to reuse the IFM for the memcpy op
- Make sure the DMA copy size is 16 byte aligned

Change-Id: I3605a48d47646ff60d2bb3644dd3a23f872235a7
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
diff --git a/ethosu/vela/live_range.py b/ethosu/vela/live_range.py
index 05e481e..995a0cc 100644
--- a/ethosu/vela/live_range.py
+++ b/ethosu/vela/live_range.py
@@ -165,16 +165,11 @@
 
 
 def _get_ifm_to_fuse(sched_op, target_mem_area=None, target_mem_type_set=None):
-    def _tensor_should_be_ignored(tens):
-        if tens.ifm_write_protected:
-            return True
-        return tensor_should_be_ignored(tens, target_mem_area, target_mem_type_set)
-
-    # Check if possible to merge ifm/ofm live ranges of elementwise op
     ifm_tens = None
     if sched_op.op_type.is_elementwise_op():
+        # Check if possible to merge ifm/ofm live ranges of elementwise op
         elem_op = sched_op.parent_op
-        if not _tensor_should_be_ignored(elem_op.ofm):
+        if not tensor_should_be_ignored(elem_op.ofm, target_mem_area, target_mem_type_set):
             # Check if overwriting the inputs can be allowed
             OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
             outp = OpShapeTens(elem_op.ofm_shapes[0], elem_op.ofm)
@@ -183,7 +178,6 @@
                 inps.append(OpShapeTens(elem_op.ifm_shapes[0], elem_op.ifm))
             if elem_op.ifm2 is not None:
                 inps.append(OpShapeTens(elem_op.ifm_shapes[1], elem_op.ifm2))
-
             # find an input tensor that can be overwritten by the output
             for inp in inps:
                 if (
@@ -192,7 +186,8 @@
                     # check input tensor is valid
                     and inp.tens is not None
                     and inp.tens.shape != []
-                    and not _tensor_should_be_ignored(inp.tens)
+                    and not inp.tens.ifm_write_protected
+                    and not tensor_should_be_ignored(inp.tens, target_mem_area, target_mem_type_set)
                     # check input and output tensors are compatible
                     and inp.tens.format == outp.tens.format
                     and inp.tens.dtype == outp.tens.dtype
@@ -203,6 +198,17 @@
                 ):
                     ifm_tens = inp.tens
                     break
+    elif sched_op.op_type == Op.Memcpy:
+        # Check if possible to merge ifm/ofm live ranges of dma op
+        dma_op = sched_op.parent_op
+        ifm = dma_op.ifm
+        ofm = dma_op.ofm
+        if not (
+            tensor_should_be_ignored(ifm, target_mem_area, target_mem_type_set)
+            or tensor_should_be_ignored(ofm, target_mem_area, target_mem_type_set)
+        ):
+            # Currently DMA only used when bypassing memory only ops so ok to reuse ifm
+            ifm_tens = ifm
 
     return ifm_tens