MLBED-2822 Add CLI option for weight size estimation

Added --weight-estimation-scaling, which enables
additional scaling of the weight compression scale estimate.

Signed-off-by: Patrik Gustavsson <patrik.gustavsson@arm.com>
Change-Id: Idcda41257f44901d3a3f345341e07fb1ae8585a9
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 5453f2c..8b968a3 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -184,6 +184,7 @@
         global_memory_clock_scale,
         max_blockdep,
         softmax_support,
+        weight_estimation_scaling,
     ):
         accelerator_config = accelerator_config.lower()
         self.vela_config = vela_config
@@ -215,6 +216,7 @@
             )
 
         self.max_blockdep = max_blockdep
+        self.weight_estimation_scaling = weight_estimation_scaling
 
         dpu_min_height = accel_config.ofm_ublock.height
         dpu_min_width = accel_config.ofm_ublock.width
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 5e9e38f..94900ad 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -212,7 +212,9 @@
     if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area):
         if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size:
             raise VelaError(
-                "Sram limit {} bytes, has been exceeded by the scratch fast tensor {} bytes".format(
+                "Sram limit {} bytes, has been exceeded by the scratch fast tensor {} bytes. "
+                "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
+                "See OPTIONS.md for more information.".format(
                     arch.sram_size, root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)
                 )
             )
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index f3b3a79..9a8215d 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -608,7 +608,10 @@
         base_sram_used = 0
         for tens in ps.intermediates:
             if tens.mem_area == self.mem_area:
-                base_sram_used += tens.storage_size()
+                if tens.purpose == TensorPurpose.Weights:
+                    base_sram_used = tens.storage_size(self.arch.weight_estimation_scaling)
+                else:
+                    base_sram_used += tens.storage_size()
 
         all_block_configs = self.get_block_configs(ps)
         for block_config in all_block_configs:
@@ -718,7 +721,7 @@
                     )
                 ]
                 sram_used += ifm_tensor.storage_size_for_sub_purpose(
-                    TensorSubPurpose.RollingBufferY, rolling_buffer_y, None
+                    self.arch, TensorSubPurpose.RollingBufferY, rolling_buffer_y, None
                 )
 
                 all_candidates.extend(self.append_sram_rewrite_list(sram_used, rewrite_list, [strat_opt]))
@@ -779,7 +782,9 @@
             for tens in ps.intermediates:
                 if tens.mem_area == self.mem_area:
                     if tens.purpose == TensorPurpose.Weights:
-                        sram_used += tens.storage_size_for_sub_purpose(TensorSubPurpose.DoubleBuffer, block_config[3])
+                        sram_used += tens.storage_size_for_sub_purpose(
+                            self.arch, TensorSubPurpose.DoubleBuffer, block_config[3]
+                        )
                         rewrite_list.append(
                             (
                                 SchedulerRewrite.ChangeTensorSubPurpose,
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index d4f6a40..3ad9b25 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -439,20 +439,25 @@
     def has_fully_defined_shape(self):
         return shape_fully_defined(self.shape)
 
-    def storage_size(self):
-        raw_size = self.storage_elements() * self.element_size()
+    def storage_size(self, scale=1.0):
+        raw_size = self.storage_elements() * self.element_size() * scale
         if raw_size == 0:
             raw_size = 1  # force it to take up space
         rounded_size = numeric_util.round_up(numeric_util.round_up_to_int(raw_size), self.alignment)
         return rounded_size
 
-    def storage_size_for_sub_purpose(self, sub_purpose, param_a=None, param_b=None):
+    def storage_size_for_sub_purpose(self, arch, sub_purpose, param_a=None, param_b=None):
         alt_shape = self.storage_shape_for_sub_purpose(sub_purpose, param_a, param_b)
         elems = shape_num_elements(alt_shape)
         if elems is None:
             return 0
         if sub_purpose == TensorSubPurpose.DoubleBuffer:
-            raw_size = elems * self.element_size() * self.compression_scale_for_worst_weight_stream
+            raw_size = (
+                elems
+                * self.element_size()
+                * self.compression_scale_for_worst_weight_stream
+                * arch.weight_estimation_scaling
+            )
         else:
             # Rolling buffers are used for intermediate data in ifm streaming
             # These will all use the NHCWB16 format, and need to be aligned to 16 in the C-dimension
diff --git a/ethosu/vela/test/testutil.py b/ethosu/vela/test/testutil.py
index 116afa4..68866fc 100644
--- a/ethosu/vela/test/testutil.py
+++ b/ethosu/vela/test/testutil.py
@@ -38,6 +38,7 @@
         global_memory_clock_scale=1.0,
         max_blockdep=0,
         softmax_support=True,
+        weight_estimation_scaling=1.0,
     )
 
 
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 97cc873..1908092 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -260,7 +260,12 @@
         choices=[True, False],
         help="Control if Softmax should be transformed into a set of npu operations (default: %(default)s)",
     )
-
+    parser.add_argument(
+        "--weight-estimation-scaling",
+        type=float,
+        default=1.0,
+        help=("Performs an additional scaling of weight compression scale estimate (default: %(default)s)"),
+    )
     args = parser.parse_args(args=args)
 
     # Read configuration file
@@ -291,6 +296,7 @@
         global_memory_clock_scale=args.global_memory_clock_scale,
         max_blockdep=args.max_block_dependency,
         softmax_support=args.softmax_support,
+        weight_estimation_scaling=args.weight_estimation_scaling,
     )
 
     compiler_options = compiler_driver.CompilerOptions(