MLBEDSW-7352: Refactoring move_constant_data

Refactored move_constant_data in the scheduler. The use case currently
only works for LUT tensors, so the logic has been simplified. In order
to make it work for other tensors, one would also have to take memory
usage into consideration when building cascades, and
use_fast_storage_for_feature_maps would also be affected.

Change-Id: Ic8de53b65a2c17d34515002d7f184d0ab1830222
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
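
For context, a minimal sketch of the new flow, using hypothetical
simplified stand-in classes (not the real Vela Tensor / Operation /
SchedulerOperation types): a LUT input is cloned into SHRAM as soon as
the scheduler operation is created, instead of in a separate
move_constant_data pass.

    from dataclasses import dataclass, field
    from enum import Enum, auto
    from typing import List, Optional


    class TensorPurpose(Enum):
        FeatureMap = auto()
        LUT = auto()


    class MemArea(Enum):
        Dram = auto()
        Shram = auto()


    @dataclass
    class Tensor:
        name: str
        purpose: TensorPurpose
        mem_area: MemArea = MemArea.Dram
        consumer_list: List["Operation"] = field(default_factory=list)
        src_tensor: Optional["Tensor"] = None

        def clone_into_shram(self) -> "Tensor":
            # Clone the constant into SHRAM; the actual copy is emitted
            # later as a DMA command by the high-level command stream
            # generator.
            res = Tensor(self.name + "_shram", self.purpose, MemArea.Shram)
            res.src_tensor = self
            return res


    @dataclass
    class Operation:
        name: str
        inputs: List[Tensor]


    def place_luts_in_shram(op: Operation) -> None:
        # LUT inputs are rewritten to SHRAM clones when the scheduler
        # representation is built, replacing the old move_constant_data
        # pass.
        for idx, tens in enumerate(op.inputs):
            if tens.purpose == TensorPurpose.LUT:
                new_tens = tens.clone_into_shram()
                new_tens.consumer_list.append(op)
                op.inputs[idx] = new_tens


    if __name__ == "__main__":
        lut = Tensor("tanh_lut", TensorPurpose.LUT)
        ifm = Tensor("ifm", TensorPurpose.FeatureMap)
        op = Operation("tanh", [ifm, lut])
        place_luts_in_shram(op)
        print([(t.name, t.mem_area.name) for t in op.inputs])
        # -> [('ifm', 'Dram'), ('tanh_lut_shram', 'Shram')]
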
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index a50f262..16531c2 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -60,7 +60,6 @@
 from .live_range import ofm_can_reuse_ifm
 from .numeric_util import round_down
 from .numeric_util import round_up
-from .operation import NpuBlockType
 from .operation import Op
 from .shape4d import Shape4D
 from .tensor import MemArea
@@ -213,6 +212,14 @@
             ps.ofm_tensor.format,
         )
 
+        # The LUT must be placed in the SHRAM area. The copy is done by a
+        # DMA command generated by the high-level command stream generator.
+        for idx, tens in enumerate(self.parent_op.inputs):
+            if tens.purpose == TensorPurpose.LUT:
+                new_tens = tens.clone_into_shram(self.arch)
+                new_tens.consumer_list.append(self.parent_op)
+                self.parent_op.inputs[idx] = new_tens
+
         # Input volume width and height required to produce the smallest possible stripe
         self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()
 
@@ -1379,52 +1386,6 @@
             )
         assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit"
 
-    def move_constant_data(self):
-        """Determine if data can be moved from permanent storage to another memory area. A move will generate a DMA
-        command in the high-level command stream"""
-        for sched_op in self.sched_ops:
-            parent_op = sched_op.parent_op
-            is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
-            max_ifm_shram_avail = (
-                (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
-                * self.arch.shram_bank_size
-                // 2
-            )
-
-            for idx, tens in enumerate(parent_op.inputs):
-                if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
-                    # Tensor is in permanent storage
-                    # Only when permanent storage differs from feature map storage, there is a point moving the data
-                    if (
-                        tens.mem_area in self.arch.permanent_storage_mem_area
-                        and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
-                    ) or tens.purpose == TensorPurpose.LUT:
-                        if tens.purpose == TensorPurpose.LUT or (
-                            # For elementwise broadcast
-                            tens.purpose == TensorPurpose.FeatureMap
-                            and sched_op.op_type.is_binary_elementwise_op()
-                            and tens.shape != []
-                            and sched_op.ifm.shape != sched_op.ofm.shape
-                            and parent_op.write_shape is None
-                            and tens.storage_size() > max_ifm_shram_avail
-                        ):
-                            only_vector_product_consumers = all(
-                                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
-                                for oper in tens.consumers()
-                            )
-
-                            if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
-                                new_tens = tens.clone_into_fast_storage(self.arch)
-                                if tens.purpose == TensorPurpose.LUT:
-                                    new_tens.mem_area = MemArea.Shram
-
-                                new_tens.consumer_list.append(parent_op)
-                                parent_op.inputs[idx] = new_tens
-                                # If the index is out of range, IFM and IFM2 are the same tensor
-                                # and pass inputs don't have duplicates
-                                if idx < len(sched_op.parent_ps.inputs):
-                                    sched_op.parent_ps.inputs[idx] = new_tens
-
     def print_schedule(self, schedule: Schedule):
         print(f"Schedule: '{schedule.name}'")
         for sched_op in self.sched_ops:
@@ -1634,7 +1595,6 @@
 
             scheduler.create_scheduler_representation(arch)
             sg.sched_ops = scheduler.sched_ops
-            scheduler.move_constant_data()
 
             # Create the Max schedule template
             max_schedule_template = scheduler.create_initial_schedule()
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 008cd05..86306ca 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -501,10 +501,9 @@
 
         return res
 
-    def clone_into_fast_storage(self, arch) -> "Tensor":
-        res = self.clone(suffix="_fast_storage")
-        res.mem_area = arch.fast_storage_mem_area
-        res.mem_type = MemType.Scratch_fast
+    def clone_into_shram(self, arch) -> "Tensor":
+        res = self.clone(suffix="_shram")
+        res.mem_area = MemArea.Shram
         res.src_tensor = self
         return res
 
diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py
index 58e72bb..e52b489 100644
--- a/ethosu/vela/test/test_lut.py
+++ b/ethosu/vela/test/test_lut.py
@@ -36,7 +36,7 @@
     random.seed(key)
     values = random.choices(range(256), k=256)
     lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.uint8, values, TensorPurpose.LUT)
-    scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+    scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
     op.set_activation_lut(scratch_lut_tensor)
 
 
@@ -44,7 +44,7 @@
     random.seed(key)
     values = random.choices(range(256), k=256)
     lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, TensorPurpose.LUT)
-    scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+    scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
     op.set_activation_lut(scratch_lut_tensor)
 
 
@@ -52,7 +52,7 @@
     random.seed(key)
     values = random.choices(range(512), k=512)
     lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, TensorPurpose.LUT)
-    scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+    scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
     op.set_activation_lut(scratch_lut_tensor)