MLBEDSW-7397: Wrong mem_area used in scheduler

Performance estimation now reads mem_area from the parent tensor
(connection.parent_tens) instead of from the scheduler op's own
tensors, because the scheduler sets mem_area only on the parent tensor.

Signed-off-by: wilisa01 <william.isaksson@arm.com>
Change-Id: I11f73686bfbd6958a8920c5e264a5f95cc3f23d1
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index 2325a9c..eb9f66c 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -194,7 +194,7 @@
 
 
 def _estimate_memory_transfer_efficiency(
-    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
+    arch, is_read, mem_area, format, element_bits, block_size, shape4D, to_transfer
 ):
     burst_len = 8
 
@@ -620,14 +620,14 @@
     query = PerformanceQuery(op.op_type.npu_block_type)
     query.ifm_shape = op.ifm.shape
     query.ifm_format = op.ifm.format
-    query.ifm_memory_area = op.ifm.mem_area
+    query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area  # Mem Area is set directly on parent_tens
     query.ifm_bits = op.ifm.dtype.size_in_bits()
     query.ifm2_shape = op.ifm2 and op.ifm2.shape
     query.ifm2_format = op.ifm2 and op.ifm2.format
-    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
+    query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area
     query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
     query.ofm_shape = op.ofm.shape
-    query.ofm_memory_area = op.ofm.mem_area
+    query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area
     query.ofm_bits = op.ofm.dtype.size_in_bits()
     query.ofm_format = op.ofm.format
     query.kernel = op.kernel
@@ -715,31 +715,38 @@
         cycles_a[PassCycles.Npu] += max(dma_transfer_cycles - slack_cycles, 0)
 
     # OFM write
-    ofm = op.parent_op.ofm
+    ofm = op.ofm.connection.parent_tens
     bw = access.ofm_write * ofm.element_size()
     bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
-    scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
-        arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
+    scaled_bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
+        arch,
+        False,
+        query.ofm_memory_area,
+        query.ofm_format,
+        query.ofm_bits,
+        query.config.ofm_block,
+        query.ofm_shape,
+        bw,
     )
 
     # IFM read
-    ifm = op.parent_op.ifm2 if op.reversed_operands else op.parent_op.ifm
+    ifm = op.ifm.connection.parent_tens
     bw = access.ifm_read[0] * ifm.element_size()
-    bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
-    scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
-        arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
+    bws[query.ifm_memory_area][ifm.purpose][BandwidthDirection.Read] += bw
+    scaled_bws[query.ifm_memory_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
+        arch, True, query.ifm_memory_area, query.ifm_format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
     )
 
     if query.ifm2_shape:
-        ifm2 = op.parent_op.ifm if op.reversed_operands else op.parent_op.ifm2
+        ifm2 = op.ifm2.connection.parent_tens
         bw = access.ifm_read[1] * ifm2.element_size()
         bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
         scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
             arch,
             True,
             query.ifm2_memory_area,
-            ifm2.format,
-            op.ifm2.dtype.size_in_bits(),
+            query.ifm2_format,
+            query.ifm2_bits,
             query.config.ifm_block,
             query.ifm2_shape,
             bw,
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index 4befad4..8188b5b 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -566,15 +566,15 @@
     def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
         query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
         query.ifm_shape = op.ifm.shape
-        query.ifm_memory_area = op.ifm.mem_area
+        query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area
         query.ifm_bits = op.ifm.dtype.size_in_bits()
         query.ifm_format = op.ifm.format
         query.ifm2_shape = op.ifm2 and op.ifm2.shape
-        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
+        query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area
         query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
         query.ifm2_format = op.ifm2 and op.ifm2.format
         query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
-        query.ofm_memory_area = op.ofm.mem_area
+        query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area
         query.ofm_bits = op.ofm.dtype.size_in_bits()
         query.ofm_format = op.ofm.format
         if op.parent_op.bias:
@@ -589,15 +589,15 @@
     def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
         query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
         query.ifm_shape = op.ifm.shape
-        query.ifm_memory_area = op.ifm.mem_area
+        query.ifm_memory_area = op.ifm.connection.parent_tens.mem_area
         query.ifm_bits = op.ifm.dtype.size_in_bits()
         query.ifm_format = op.ifm.format
         query.ifm2_shape = op.ifm2 and op.ifm2.shape
-        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
+        query.ifm2_memory_area = op.ifm2 and op.ifm2.connection.parent_tens.mem_area
         query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
         query.ifm2_format = op.ifm2 and op.ifm2.format
         query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
-        query.ofm_memory_area = op.ofm.mem_area
+        query.ofm_memory_area = op.ofm.connection.parent_tens.mem_area
         query.ofm_bits = op.ofm.dtype.size_in_bits()
         query.ofm_format = op.ofm.format
         if op.parent_op.bias: