MLBEDSW-6384: Updated weight buffering cycle calculation - The NPU cycles are not calculated correctly when only one weight buffer is used, since weights cannot be fetched in parallel. - Added new calculation in the single buffer case. Signed-off-by: Johan Alfven <johan.alfven@arm.com> Change-Id: I8568912d11d137a298225ab77b8b3272613c76f6

commit: 0f98de67b2f929a1297326721eb421f0a44ef216 [log] [tgz]
author: Johan Alfvén <johan.alfven@arm.com> Sun May 15 14:54:51 2022 +0200
committer: tim.hall <tim.hall@arm.com> Thu May 19 14:44:01 2022 +0000
tree: 3f76d7776aa391c42ed5900e5aff96c23d519c34
parent: 1e363b10a6d4ce0fc062e34df0182b847b08850d [diff]
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index b7607e6..0e2e3ca 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py

@@ -50,6 +50,7 @@
 from .tensor import BandwidthDirection
 from .tensor import MemArea
 from .tensor import TensorPurpose
+from .tensor import TensorSubPurpose
 from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype
 from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type
 from .weight_compressor import WeightKey
@@ -674,10 +675,18 @@
         )
 
         # Add cycles for Weight + Scale Transfer
-        cycles_a[PassCycles.Npu] = max(
-            cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
-            cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
-        )
+        if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer:
+            # Double buffer - weights can be fetched in parallel
+            cycles_a[PassCycles.Npu] = max(
+                cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
+                cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
+            )
+        else:
+            # Standard buffer - weights can not be fetched in parallel so weight transfer
+            # must be included in the result
+            cycles_a[PassCycles.Npu] = (
+                cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles)
+            )
 
         # Add cycles for LUT Transfer
         cycles_a[PassCycles.Npu] += lut_transfer_cycles
commit	0f98de67b2f929a1297326721eb421f0a44ef216	[log] [tgz]
author	Johan Alfvén <johan.alfven@arm.com>	Sun May 15 14:54:51 2022 +0200
committer	tim.hall <tim.hall@arm.com>	Thu May 19 14:44:01 2022 +0000
tree	3f76d7776aa391c42ed5900e5aff96c23d519c34
parent	1e363b10a6d4ce0fc062e34df0182b847b08850d [diff]