MLBEDSW-2688: LUT DMA may require kernel wait

LUT-related updates specific to 16K SHRAM:
- prevent LUT DMA transfer from overwriting accumulator SHRAM of an ongoing operation
- do not use the last 2K of SHRAM as accumulator during LUT operations

Change-Id: I17066e0410c6f07b125ed245002d7b19269a7a8a
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py
index 053377c..fdcbe94 100644
--- a/ethosu/vela/shared_buffer_allocation.py
+++ b/ethosu/vela/shared_buffer_allocation.py
@@ -25,6 +25,8 @@
 from .errors import VelaError
 from .ethos_u55_regs.ethos_u55_regs import resampling_mode
 from .operation import NpuBlockType
+from .range_set import MemoryRangeSet
+from .tensor import MemArea
 
 
 class SharedBufferAllocation:
@@ -40,6 +42,7 @@
         dilation = (1, 1, 1, 1)
         self.kernel = Kernel(1, 1)
         is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
+        self.uses_lut = False
 
         if ps.primary_op:
             strides = ps.primary_op.attrs.get("strides", strides)
@@ -55,6 +58,7 @@
                 k_w = ps.primary_op.attrs.get("filter_width", 1)
 
             self.kernel = Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])
+            self.uses_lut = ps.primary_op.activation_lut is not None
 
         self.is_equal_depth_op = is_elementwise or ps.npu_block_type in (
             NpuBlockType.ConvolutionDepthWise,
@@ -102,7 +106,7 @@
 
         # Accumulator area is measured from the end of the buffer
         self.bank_locations[SharedBufferArea.Accumulators] = (
-            self.arch.shram_total_banks - self.banks_required[SharedBufferArea.Accumulators]
+            self.arch.available_shram_banks(self.uses_lut) - self.banks_required[SharedBufferArea.Accumulators]
         )
         ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
         return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]
@@ -156,6 +160,13 @@
 
         return True
 
+    def get_shram_memory_access_range(self):
+        # Returns the SHRAM memory access range used by this shared buffer, excluding access to LUT:
+        # a MemoryRangeSet built from (MemArea.Shram, 0, available_shram_banks(uses_lut) * shram_bank_size)
+        return MemoryRangeSet(
+            MemArea.Shram, 0, self.arch.available_shram_banks(self.uses_lut) * self.arch.shram_bank_size
+        )
+
 
 def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
     alloc = SharedBufferAllocation(arch, ps)