MLBEDSW-5880 Fixed Vela verbose weight flag

* Original weights and encoded NPU weights now report the correct size
  instead of zero when running Vela with the --verbose-weights flag
  (the code to update these attributes was missing)

* Removed print references to the unencoded NPU weight size

Change-Id: I6d3e41c04cc46d24eeb54cab89818a35e5df27be
Signed-off-by: Ayaan Masood <Ayaan.Masood@arm.com>
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 21b420b..08967f4 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -22,6 +22,8 @@
 import copy
 from enum import auto
 from enum import IntEnum
+from typing import Set
+from uuid import UUID
 
 import numpy as np
 
@@ -31,11 +33,13 @@
 from .architecture_features import NpuBlockType
 from .architecture_features import SHRAMElements
 from .architecture_features import TensorFormat
+from .nn_graph import Graph
 from .numeric_util import round_up
 from .operation import Kernel
 from .operation import Op
 from .scheduler import Schedule
 from .scheduler import SchedulerOperation
+from .scheduler import SchedulerOpInfo
 from .shape4d import Shape4D
 from .tensor import BandwidthDirection
 from .tensor import MemArea
@@ -725,16 +729,39 @@
     return bws, macs, cycles_a
 
 
-def calc_new_performance_for_network(nng, arch):
+def calc_new_performance_for_network(nng: Graph, arch):
     total_bws = make_bandwidth_array()
     total_macs = 0
     total_cycles = np.zeros(PassCycles.Size)
+    total_weight_size = 0
+    total_encoded_weight_size = 0
+
+    # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights
+    original_weight_uuids: Set[UUID] = set()
+    encoded_npu_weight_uuids: Set[UUID] = set()
 
     for sg in nng.subgraphs:
         prev_op = None
         for sched_op in sg.sched_ops:
-            op_info = sg.schedule.cost_map[sched_op]
+            op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]
             bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config)
+
+            # Tensors for calculating weight sizes
+            original_weight = sched_op.parent_op.weights
+            encoded_npu_weight = op_info.npu_weights_tensor
+
+            # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights
+            if original_weight and (original_weight.equivalence_id not in original_weight_uuids):
+
+                original_weight_uuids.add(original_weight.equivalence_id)
+                total_weight_size += original_weight.values.itemsize * original_weight.values.size
+
+            # Record the equivalence_id (not the tensor itself) so the membership test below can match on later ops
+            if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids):
+
+                encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id)
+                total_encoded_weight_size += len(encoded_npu_weight.buffer)
+
             total_bws += bws
             total_macs += macs
             total_cycles += cycles
@@ -743,3 +770,5 @@
     nng.bandwidths = total_bws
     nng.macs = total_macs
     nng.cycles = total_cycles
+    nng.total_original_weights = total_weight_size
+    nng.total_npu_encoded_weights = total_encoded_weight_size
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 8f2426c..6b08459 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -339,7 +339,7 @@
         self.nng = nng
         self.sg = sg
         self.arch = arch
-        self.sched_ops: List(SchedulerOperation) = []
+        self.sched_ops: List[SchedulerOperation] = []
         self.max_schedule = None
         self.scheduler_options = options
 
@@ -459,7 +459,6 @@
     def create_initial_schedule(self) -> Schedule:
         """Creates an initial schedule with no cascading or buffering of any kind"""
         schedule = Schedule(self.sg, "MAX")
-
         for op in self.sched_ops:
             cost = op.create_scheduler_info(self.nng, op.ofm.shape)
             cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 86f531a..d8a274b 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -110,7 +110,6 @@
         data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
         data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]
         data_items += [nng.total_original_weights]
-        data_items += [nng.total_npu_weights]
         data_items += [nng.total_npu_encoded_weights]
 
         for mem_area in mem_areas:
@@ -325,7 +324,6 @@
 
     if weights_data:
         print(f"Original Weights Size                    {weights_data['original'] / 1024.0:12.2f} KiB", file=f)
-        print(f"NPU Weights Size                         {weights_data['npu'] / 1024.0:12.2f} KiB", file=f)
         print(f"NPU Encoded Weights Size                 {weights_data['npu_encoded'] / 1024.0:12.2f} KiB", file=f)
         print(file=f)
 
@@ -372,11 +370,7 @@
                     npu_operations.append(op)
 
     weights_data = (
-        {
-            "original": nng.total_original_weights,
-            "npu": nng.total_npu_weights,
-            "npu_encoded": nng.total_npu_encoded_weights,
-        }
+        {"original": nng.total_original_weights, "npu_encoded": nng.total_npu_encoded_weights}
         if verbose_weights
         else None
     )