MLBEDSW-6260: Add support for using DMA to copy feature maps

- Reshape ops can be bypassed and there is no need to process them by the NPU. There are use cases when the IFM must be preserved, so a memcpy is needed. This is implemented by an AvgPool.
- In order to reduce the cost of the AvgPool, the IFM can be copied by DMA. This is faster, and it can also be turned into a real NOP in cases where the IFM and the OFM can use the same memory space.
- Added new memcpy op. Only NHWC format is supported since DMA cannot change the format on the fly.
- Allow the OFM to reuse the IFM for the memcpy op.
- Make sure the DMA copy size is 16-byte aligned.

Change-Id: I3605a48d47646ff60d2bb3644dd3a23f872235a7
Signed-off-by: Johan Alfven <johan.alfven@arm.com>

commit: 90724965751e882c58de74a044cc7adab307bc55 [log] [tgz]
author: Johan Alfven <johan.alfven@arm.com> Thu Feb 02 09:07:48 2023 +0100
committer: Johan Alfven <johan.alfven@arm.com> Tue Mar 14 11:00:58 2023 +0100
tree: 425ccea87487b66ca298a801b298fbf8567f86d9
parent: bb9885190f5f7ea959f171b38ee1dd44d3e1e75e [diff] [blame]
diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py
index 5a9f957..e43a919 100644
--- a/ethosu/vela/pass_packing.py
+++ b/ethosu/vela/pass_packing.py

@@ -39,6 +39,7 @@
     StartupInit = 64
     MemoryOnly = 128
     PostFusingLimited = 256
+    Memcpy = 512
 
 
 mac_main_ops = set(
@@ -95,6 +96,7 @@
         Op.ExpandDims,
     )
 )
+memcpy_ops = set((Op.Memcpy,))
 
 
 test_sequence = [
@@ -160,6 +162,16 @@
     ),
     (
         # ops_set
+        memcpy_ops,
+        # incompatible_pack_flags
+        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Mac | PassFlags.Main | PassFlags.PostFusingLimited,
+        # flags_to_set
+        PassFlags.Npu | PassFlags.Memcpy | PassFlags.Main,
+        # flags_to_clear
+        PassFlags.Empty,
+    ),
+    (
+        # ops_set
         cpu_ops,
         # incompatible_pack_flags
         PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
@@ -248,7 +260,11 @@
 
                         if flags_to_set & PassFlags.Npu:
                             if flags_to_set & (
-                                PassFlags.Mac | PassFlags.ElementWise | PassFlags.Post | PassFlags.PostFusingLimited
+                                PassFlags.Mac
+                                | PassFlags.ElementWise
+                                | PassFlags.Post
+                                | PassFlags.PostFusingLimited
+                                | PassFlags.Memcpy
                             ):
                                 assert len(curr_op.inputs) >= 1
                                 ifm_tensor = curr_op.ifm
commit	90724965751e882c58de74a044cc7adab307bc55	[log] [tgz]
author	Johan Alfven <johan.alfven@arm.com>	Thu Feb 02 09:07:48 2023 +0100
committer	Johan Alfven <johan.alfven@arm.com>	Tue Mar 14 11:00:58 2023 +0100
tree	425ccea87487b66ca298a801b298fbf8567f86d9
parent	bb9885190f5f7ea959f171b38ee1dd44d3e1e75e [diff] [blame]