MLBEDSW-2688: LUT DMA may require kernel wait
LUT-related updates specific to 16K SHRAM:
- prevent LUT DMA transfer from overwriting accumulator SHRAM of an ongoing operation
- do not use the last 2K of SHRAM as accumulator during LUT operations
Change-Id: I17066e0410c6f07b125ed245002d7b19269a7a8a
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/ethosu/vela/shared_buffer_allocation.py b/ethosu/vela/shared_buffer_allocation.py
index 053377c..fdcbe94 100644
--- a/ethosu/vela/shared_buffer_allocation.py
+++ b/ethosu/vela/shared_buffer_allocation.py
@@ -25,6 +25,8 @@
from .errors import VelaError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .operation import NpuBlockType
+from .range_set import MemoryRangeSet
+from .tensor import MemArea
class SharedBufferAllocation:
@@ -40,6 +42,7 @@
dilation = (1, 1, 1, 1)
self.kernel = Kernel(1, 1)
is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
+ self.uses_lut = False
if ps.primary_op:
strides = ps.primary_op.attrs.get("strides", strides)
@@ -55,6 +58,7 @@
k_w = ps.primary_op.attrs.get("filter_width", 1)
self.kernel = Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])
+ self.uses_lut = ps.primary_op.activation_lut is not None
self.is_equal_depth_op = is_elementwise or ps.npu_block_type in (
NpuBlockType.ConvolutionDepthWise,
@@ -102,7 +106,7 @@
# Accumulator area is measured from the end of the buffer
self.bank_locations[SharedBufferArea.Accumulators] = (
- self.arch.shram_total_banks - self.banks_required[SharedBufferArea.Accumulators]
+ self.arch.available_shram_banks(self.uses_lut) - self.banks_required[SharedBufferArea.Accumulators]
)
ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]
@@ -156,6 +160,13 @@
return True
+ def get_shram_memory_access_range(self):
+ # Returns the SHRAM memory access range used by this shared buffer: starts at 0 and
+ # covers available_shram_banks(uses_lut) * shram_bank_size, excluding access to LUT
+ return MemoryRangeSet(
+ MemArea.Shram, 0, self.arch.available_shram_banks(self.uses_lut) * self.arch.shram_bank_size
+ )
+
def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
alloc = SharedBufferAllocation(arch, ps)