MLBEDSW-7352: Refactoring move_constant_data

Refactored move_constant_data in the scheduler. The use case currently
only works for LUT tensors, so the logic has been simplified. In order
to make it work for other tensors, one would also have to take memory
usage into consideration when building cascades, and
use_fast_storage_for_feature_maps would also be affected.

Change-Id: Ic8de53b65a2c17d34515002d7f184d0ab1830222
Signed-off-by: Johan Alfven <johan.alfven@arm.com>
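
For context, a minimal sketch of the new flow, using hypothetical
simplified stand-in classes (not the real Vela Tensor / Operation /
SchedulerOperation types): a LUT input is cloned into SHRAM as soon as
the scheduler operation is created, instead of in a separate
move_constant_data pass.

    from dataclasses import dataclass, field
    from enum import Enum, auto
    from typing import List, Optional


    class TensorPurpose(Enum):
        FeatureMap = auto()
        LUT = auto()


    class MemArea(Enum):
        Dram = auto()
        Shram = auto()


    @dataclass
    class Tensor:
        name: str
        purpose: TensorPurpose
        mem_area: MemArea = MemArea.Dram
        consumer_list: List["Operation"] = field(default_factory=list)
        src_tensor: Optional["Tensor"] = None

        def clone_into_shram(self) -> "Tensor":
            # Clone the constant into SHRAM; the actual copy is emitted
            # later as a DMA command by the high-level command stream
            # generator.
            res = Tensor(self.name + "_shram", self.purpose, MemArea.Shram)
            res.src_tensor = self
            return res


    @dataclass
    class Operation:
        name: str
        inputs: List[Tensor]


    def place_luts_in_shram(op: Operation) -> None:
        # LUT inputs are rewritten to SHRAM clones when the scheduler
        # representation is built, replacing the old move_constant_data
        # pass.
        for idx, tens in enumerate(op.inputs):
            if tens.purpose == TensorPurpose.LUT:
                new_tens = tens.clone_into_shram()
                new_tens.consumer_list.append(op)
                op.inputs[idx] = new_tens


    if __name__ == "__main__":
        lut = Tensor("tanh_lut", TensorPurpose.LUT)
        ifm = Tensor("ifm", TensorPurpose.FeatureMap)
        op = Operation("tanh", [ifm, lut])
        place_luts_in_shram(op)
        print([(t.name, t.mem_area.name) for t in op.inputs])
        # -> [('ifm', 'Dram'), ('tanh_lut_shram', 'Shram')]
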
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index a50f262..16531c2 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -60,7 +60,6 @@
 from .live_range import ofm_can_reuse_ifm
 from .numeric_util import round_down
 from .numeric_util import round_up
-from .operation import NpuBlockType
 from .operation import Op
 from .shape4d import Shape4D
 from .tensor import MemArea
@@ -213,6 +212,14 @@
             ps.ofm_tensor.format,
         )
 
+        # The LUT must be placed in the SHRAM area. The copy is done by a
+        # DMA command generated by the high-level command stream generator.
+        for idx, tens in enumerate(self.parent_op.inputs):
+            if tens.purpose == TensorPurpose.LUT:
+                new_tens = tens.clone_into_shram(self.arch)
+                new_tens.consumer_list.append(self.parent_op)
+                self.parent_op.inputs[idx] = new_tens
+
         # Input volume width and height required to produce the smallest possible stripe
         self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()
 
@@ -1379,52 +1386,6 @@
             )
         assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit"
 
-    def move_constant_data(self):
-        """Determine if data can be moved from permanent storage to another memory area. A move will generate a DMA
-        command in the high-level command stream"""
-        for sched_op in self.sched_ops:
-            parent_op = sched_op.parent_op
-            is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
-            max_ifm_shram_avail = (
-                (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
-                * self.arch.shram_bank_size
-                // 2
-            )
-
-            for idx, tens in enumerate(parent_op.inputs):
-                if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
-                    # Tensor is in permanent storage
-                    # Only when permanent storage differs from feature map storage, there is a point moving the data
-                    if (
-                        tens.mem_area in self.arch.permanent_storage_mem_area
-                        and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
-                    ) or tens.purpose == TensorPurpose.LUT:
-                        if tens.purpose == TensorPurpose.LUT or (
-                            # For elementwise broadcast
-                            tens.purpose == TensorPurpose.FeatureMap
-                            and sched_op.op_type.is_binary_elementwise_op()
-                            and tens.shape != []
-                            and sched_op.ifm.shape != sched_op.ofm.shape
-                            and parent_op.write_shape is None
-                            and tens.storage_size() > max_ifm_shram_avail
-                        ):
-                            only_vector_product_consumers = all(
-                                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
-                                for oper in tens.consumers()
-                            )
-
-                            if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
-                                new_tens = tens.clone_into_fast_storage(self.arch)
-                                if tens.purpose == TensorPurpose.LUT:
-                                    new_tens.mem_area = MemArea.Shram
-
-                                new_tens.consumer_list.append(parent_op)
-                                parent_op.inputs[idx] = new_tens
-                                # If the index is out of range, IFM and IFM2 are the same tensor
-                                # and pass inputs don't have duplicates
-                                if idx < len(sched_op.parent_ps.inputs):
-                                    sched_op.parent_ps.inputs[idx] = new_tens
-
     def print_schedule(self, schedule: Schedule):
         print(f"Schedule: '{schedule.name}'")
         for sched_op in self.sched_ops:
@@ -1634,7 +1595,6 @@
 
             scheduler.create_scheduler_representation(arch)
             sg.sched_ops = scheduler.sched_ops
-            scheduler.move_constant_data()
 
             # Create the Max schedule template
             max_schedule_template = scheduler.create_initial_schedule()
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index 008cd05..86306ca 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -501,10 +501,9 @@
 
         return res
 
-    def clone_into_fast_storage(self, arch) -> "Tensor":
-        res = self.clone(suffix="_fast_storage")
-        res.mem_area = arch.fast_storage_mem_area
-        res.mem_type = MemType.Scratch_fast
+    def clone_into_shram(self, arch) -> "Tensor":
+        res = self.clone(suffix="_shram")
+        res.mem_area = MemArea.Shram
         res.src_tensor = self
         return res
 
diff --git a/ethosu/vela/test/test_lut.py b/ethosu/vela/test/test_lut.py
index 58e72bb..e52b489 100644
--- a/ethosu/vela/test/test_lut.py
+++ b/ethosu/vela/test/test_lut.py
@@ -36,7 +36,7 @@
     random.seed(key)
     values = random.choices(range(256), k=256)
     lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.uint8, values, TensorPurpose.LUT)
-    scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+    scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
     op.set_activation_lut(scratch_lut_tensor)
 
 
@@ -44,7 +44,7 @@
     random.seed(key)
     values = random.choices(range(256), k=256)
     lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 256], DataType.int32, values, TensorPurpose.LUT)
-    scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+    scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
     op.set_activation_lut(scratch_lut_tensor)
 
 
@@ -52,7 +52,7 @@
     random.seed(key)
     values = random.choices(range(512), k=512)
     lut_tensor = create_const_tensor(op.name + "_lut", [1, 1, 1, 512], DataType.int32, values, TensorPurpose.LUT)
-    scratch_lut_tensor = lut_tensor.clone_into_fast_storage(arch)
+    scratch_lut_tensor = lut_tensor.clone_into_shram(arch)
     op.set_activation_lut(scratch_lut_tensor)