[MLBEDSW-4254] Improve weight information in summary

Improved the weight information shown in the summary when the
--verbose-weights option is used.

Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com>
Change-Id: Iac142f2a813bf1c05aa9da3f8a384466e2914d06
diff --git a/OPTIONS.md b/OPTIONS.md
index 86b05f6..e820711 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -431,6 +431,14 @@
 vela network.tflite --verbose-operators
 ```
 
+### Verbose Weights
+
+Prints the size of the original, NPU and NPU encoded network weights.  
+
+```bash
+vela network.tflite --verbose-weights
+```
+
 ## Configuration File
 
 This is used to describe various properties of the Ethos-U embedded system.  The
diff --git a/ethosu/vela/api.py b/ethosu/vela/api.py
index f972133..e91c0bd 100644
--- a/ethosu/vela/api.py
+++ b/ethosu/vela/api.py
@@ -416,15 +416,16 @@
     :param ofm_block_depth: the depth of blocks for processing
     :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
     :param block_traversal: indicates how these weights are traversed on sub-kernel basis
-    :return: a bytearray of compressed weights
+    :return: a bytearray of encoded weights
     """
     from .architecture_features import Accelerator
     from . import weight_compressor
 
     acc = Accelerator.from_npu_accelerator(accelerator)
-    return weight_compressor.encode_weights(
+    encoded_weights, _ = weight_compressor.encode_weights(
         acc, weights_volume, dilation_xy, ifm_bitdepth, ofm_block_depth, is_depthwise, block_traversal
     )
+    return encoded_weights
 
 
 def npu_encode_bias(bias: numpy.int64, scale: int, shift: int):
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index a3c0100..26d350e 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -61,6 +61,7 @@
         verbose_high_level_command_stream=False,
         verbose_register_command_stream=False,
         verbose_operators=False,
+        verbose_weights=False,
         show_cpu_operations=False,
         tensor_allocator=TensorAllocator.Greedy,
         timing=False,
@@ -77,6 +78,7 @@
         self.verbose_high_level_command_stream = verbose_high_level_command_stream
         self.verbose_register_command_stream = verbose_register_command_stream
         self.verbose_operators = verbose_operators
+        self.verbose_weights = verbose_weights
         self.show_cpu_operations = show_cpu_operations
         self.tensor_allocator = tensor_allocator
         self.timing = timing
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index 2d4b0c8..677a385 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -517,9 +517,9 @@
         self.subgraphs = []
         self.metadata = []
         self.memory_used = {}
-        self.weights_compression_ratio = 0
         self.total_original_weights = 0
-        self.total_compressed_weights = 0
+        self.total_npu_weights = 0
+        self.total_npu_encoded_weights = 0
         self.weight_cache = None  # See CompressedWeightCache
 
     def get_root_subgraph(self):
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 597fd15..18b8092 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -58,7 +58,9 @@
             "passes_after_fusing",
         ]
         labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
-        labels += ["weights_compression_ratio"]
+        labels += ["total_original_weights"]
+        labels += ["total_npu_weights"]
+        labels += ["total_npu_encoded_weights"]
 
         for mem_area in mem_areas:
             labels += [
@@ -107,7 +109,9 @@
 
         data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
         data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]
-        data_items += [nng.weights_compression_ratio]
+        data_items += [nng.total_original_weights]
+        data_items += [nng.total_npu_weights]
+        data_items += [nng.total_npu_encoded_weights]
 
         for mem_area in mem_areas:
             bws = nng.bandwidths[mem_area]
@@ -228,8 +232,8 @@
     num_cascaded_passes,
     n_operations=0,
     cpu_operations=None,
-    weights_compression_ratio=None,
     show_cpu_operations=False,
+    weights_data=None,
     f=sys.stdout,
 ):
 
@@ -327,10 +331,11 @@
         )
         print(file=f)
 
-    if weights_compression_ratio:
-        print(
-            f"Weights Compression Ratio                {weights_compression_ratio:12.2f}", file=f,
-        )
+    if weights_data:
+        print(f"Original Weights Size                    {weights_data['original'] / 1024.0:12.2f} KiB", file=f)
+        print(f"NPU Weights Size                         {weights_data['npu'] / 1024.0:12.2f} KiB", file=f)
+        print(f"NPU Encoded Weights Size                 {weights_data['npu_encoded'] / 1024.0:12.2f} KiB", file=f)
+        print(file=f)
 
     print(
         f"Neural network macs                      {int(macs):12d} MACs/batch", file=f,
@@ -354,12 +359,21 @@
     print(file=f)
 
 
-def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
+def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weights=False, f=sys.stdout):
     n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
     n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
     n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
     cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
     min_mem_usage = max(sg.min_mem_usage for sg in nng.subgraphs)
+    weights_data = (
+        {
+            "original": nng.total_original_weights,
+            "npu": nng.total_npu_weights,
+            "npu_encoded": nng.total_npu_encoded_weights,
+        }
+        if verbose_weights
+        else None
+    )
     return print_performance_metrics_for_strat(
         arch,
         nng.name,
@@ -373,12 +387,7 @@
         n_cascaded_passes,
         n_operations,
         cpu_operations,
-        nng.weights_compression_ratio,
         show_cpu_operations,
+        weights_data,
         f,
     )
-
-
-def write_human_friendly_metrics(nng, arch, filename):
-    f = open(filename, "w")
-    print_performance_metrics(nng, arch, f=f)
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index 0ad30e5..7ffc6f3 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -217,9 +217,5 @@
 
     if sg == nng.get_root_subgraph():
         nng.memory_used = sg.memory_used
-        try:
-            nng.weights_compression_ratio = nng.total_compressed_weights / nng.total_original_weights
-        except ZeroDivisionError:
-            nng.weights_compression_ratio = 0.0
 
     return True
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index c955186..aa74ecf 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -77,7 +77,12 @@
     summary_csv_file = "{0}_summary_{1}.csv".format(output_basename, arch.system_config)
     stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch)
 
-    stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch)
+    stats_writer.print_performance_metrics(
+        nng,
+        show_cpu_operations=compiler_options.show_cpu_operations,
+        verbose_weights=compiler_options.verbose_weights,
+        arch=arch,
+    )
 
     output_filename = output_basename + "_vela.tflite"
     if input_name.endswith(".tflite"):
@@ -284,6 +289,7 @@
             "--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
         )
         parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
+        parser.add_argument("--verbose-weights", action="store_true", help="Verbose weights information")
         parser.add_argument(
             "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
         )
@@ -456,6 +462,7 @@
             verbose_high_level_command_stream=args.verbose_high_level_command_stream,
             verbose_register_command_stream=args.verbose_register_command_stream,
             verbose_operators=args.verbose_operators,
+            verbose_weights=args.verbose_weights,
             show_cpu_operations=args.show_cpu_operations,
             tensor_allocator=args.tensor_allocator,
             timing=args.timing,
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index bb7cd67..7ce237c 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -68,7 +68,7 @@
     :param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
     :param block_traversal: indicates how these weights are traversed on sub-kernel basis
 
-    :return: a bytearray of compressed weights
+    :return: a tuple with a bytearray of encoded weights and the size of the unencoded weights
     """
     # Check arg types
     assert isinstance(accelerator, Accelerator)
@@ -104,7 +104,7 @@
         dilation=dilation_xy,
     )
     encoded_stream = encode(raw_stream)
-    return encoded_stream
+    return encoded_stream, len(raw_stream)
 
 
 def encode_bias(bias: np.int64, scale: int, shift: int):
@@ -161,15 +161,23 @@
     def __init__(self):
         self.cache = {}  # maps from WeightCompressionConfig to a tensor clone containing compressed weights
 
-    def get_tensor_with_same_compression(self, wcc):
-        return self.cache.get(wcc)
+    def has_tensor_with_same_compression(self, wcc):
+        return self.cache.get(wcc) is not None
 
-    def add(self, tens):
+    def get_tensor_with_same_compression(self, wcc):
+        cache_obj = self.cache.get(wcc)
+        return cache_obj[0] if cache_obj else None
+
+    def get_unencoded_size_with_same_compression(self, wcc):
+        cache_obj = self.cache.get(wcc)
+        return cache_obj[1] if cache_obj else None
+
+    def add(self, tens, unencoded_size):
         # Adds the compressed weights from the tensor to the cache
         wcc = tens.weight_compression_config
         # Clone the tensor to make sure that nothing related to the weight compression is modified
         tens_clone = tens.clone("_weights{}_{}".format(wcc.ofm_block_depth, wcc.ofm_depth_step))
-        self.cache[wcc] = tens_clone
+        self.cache[wcc] = (tens_clone, unencoded_size)
 
 
 def encode(weight_stream):
@@ -300,7 +308,7 @@
         # Cache hit, copy weights from the cache
         tens.copy_compressed_weight_info(tens_cached)
         set_storage_shape(tens)
-        return
+        return nng.weight_cache.get_unencoded_size_with_same_compression(wcc)
     # No cache hit, perform the compression
     assert tens.quantization is not None
     assert tens.quantization.scale_f32 is not None
@@ -321,6 +329,7 @@
     encoded_streams_substream_offsets = []
     offset = 0
     max_single_buffer_len = 0
+    unencoded_size = 0
 
     ifm_bitdepth = tens.consumer_list[0].inputs[0].dtype.size_in_bits()
     ifm_depth = weights.shape[-2]
@@ -371,7 +380,7 @@
             block_depth = (ofm_block_depth + arch.ncores - 1 - core) // arch.ncores
             encoded_substream = []
             if block_depth != 0:
-                encoded_substream = encode_weights(
+                encoded_substream, raw_stream_size = encode_weights(
                     accelerator=arch.accelerator_config,
                     weights_volume=core_weights,
                     dilation_xy=dilation,
@@ -380,6 +389,7 @@
                     is_depthwise=is_depthwise,
                     block_traversal=block_traversal,
                 )
+                unencoded_size += raw_stream_size
             encoded_stream.extend(encoded_substream)
             substream_offsets.append(len(encoded_stream))
 
@@ -408,7 +418,8 @@
     tens.compressed_values_substream_offsets = encoded_streams_substream_offsets
     tens.brick_size = brick_size
     set_storage_shape(tens)
-    nng.weight_cache.add(tens)
+    nng.weight_cache.add(tens, unencoded_size)
+    return unencoded_size
 
 
 def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=False):
@@ -525,11 +536,11 @@
                     ofm_depth_step = ps.block_config[-1]
                 else:
                     ofm_depth_step = tens.shape[-1]
-                compress_weights(
+                nng.total_npu_weights += compress_weights(
                     arch, nng, tens, op.type.npu_block_type, ps.block_config[-1], ofm_depth_step, op.get_dilation_h_w()
                 )
-                nng.total_compressed_weights += tens.weight_compressed_offsets[-1]
-                nng.total_original_weights += tens.elements() * tens.element_size()
+                nng.total_npu_encoded_weights += tens.weight_compressed_offsets[-1]
+                nng.total_original_weights += int(tens.elements() * tens.element_size())
 
                 # Update source tensor
                 if needs_dma: