MLBEDSW-4034: New Scheduler Size or Performance Optimisation

 - Merged dev/scheduler at 83639f90e8c828f70de6e29142355a940224959b

Signed-off-by: Tim Hall <tim.hall@arm.com>
Change-Id: I0050529d4b42da93768c7264296434dd877fb5b4
diff --git a/ethosu/vela/npu_performance.py b/ethosu/vela/npu_performance.py
index c83f8f5..b1dae4e 100644
--- a/ethosu/vela/npu_performance.py
+++ b/ethosu/vela/npu_performance.py
@@ -19,45 +19,28 @@
 #
 # Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
 # estimate.
+import copy
 from enum import auto
 from enum import IntEnum
 
 import numpy as np
 
 from . import numeric_util
+from .architecture_allocator import ArchitectureBlockConfig
 from .architecture_features import Accelerator
-from .architecture_features import Block
-from .data_type import DataType
-from .nn_graph import PassPlacement
-from .nn_graph import SchedulerRewrite
-from .operation import NpuBlockType
+from .architecture_features import NpuBlockType
+from .architecture_features import SHRAMElements
+from .architecture_features import TensorFormat
+from .numeric_util import round_up
+from .operation import Kernel
 from .operation import Op
-from .shared_buffer_allocation import is_acc_40bits_used
+from .scheduler import Schedule
+from .scheduler import SchedulerOperation
+from .shape4d import Shape4D
 from .tensor import BandwidthDirection
 from .tensor import MemArea
-from .tensor import shape_num_elements
-from .tensor import Tensor
-from .tensor import TensorBlockTraversal
-from .tensor import TensorFormat
 from .tensor import TensorPurpose
-
-
-def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
-    ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
-    kernel = ps2.primary_op.kernel
-
-    if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
-        op = ps2.primary_op
-        ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0].depth, op.ifm.dtype.size_in_bits())
-    else:
-        ifm_block_depth = block_config_ps2[-1]
-
-    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)
-
-    # The performed height calculation is for worst case
-    height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0])
-    width = ifm_block.width
-    return [height, width]
+from .weight_compressor import WeightKey
 
 
 class PassCycles(IntEnum):
@@ -91,82 +74,173 @@
         )
 
 
-def make_bandwidth_array():
-    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))
+class PerformanceQuery:
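+    # Describes a single operation for the performance estimators below: tensor shapes,
+    # formats, memory areas and bit widths, plus the kernel and selected block config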
+    def __init__(self, npu_block_type=0):
+        self.npu_block_type = npu_block_type
+        self.ifm_shape = Shape4D(0)
+        self.ifm_format = TensorFormat.NHWC
+        self.ifm_memory_area = MemArea.Unknown
+        self.ifm2_memory_area = MemArea.Unknown
+        self.ifm_bits = 0
+        self.ifm2_bits = 0
+        self.ifm2_shape = None
+        self.ifm2_format = TensorFormat.NHWC
+        self.ofm_shape = Shape4D(0)
+        self.ofm_format = TensorFormat.NHWC
+        self.ofm_memory_area = MemArea.Unknown
+        self.ofm_bits = 0
+        self.const_shape = Shape4D(0)
+        self.const_memory_area = MemArea.Unknown
+        self.kernel = Kernel(1, 1)
+        self.config = ArchitectureBlockConfig()
 
 
-def make_cycles_array():
-    return np.zeros(PassCycles.Size)
+class CycleCost:
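+    # MAC count and NPU cycle estimate for one operation; can be scaled (*) and accumulated (+=)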
+    def __init__(self):
+        self.op_macs = 0
+        self.op_cycles = 0
+
+    def __mul__(self, scale):
+        out = CycleCost()
+        out.op_macs = self.op_macs * scale
+        out.op_cycles = self.op_cycles * scale
+        return out
+
+    def __iadd__(self, rhs):
+        self.op_macs += rhs.op_macs
+        self.op_cycles += rhs.op_cycles
+        return self
+
+    def __str__(self):
+        return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)
 
 
-def make_metrics_arrays():
-    return (make_bandwidth_array(), 0, make_cycles_array())
+class ElementAccess:
+    def __init__(self):
+        # Element access counts only; consumers need to scale
+        # these values by the correct bit widths to calculate
+        # memory bandwidth
+        self.ifm_read = [0, 0]  # ifm1, ifm2
+        self.ofm_write = 0
+        self.weights_refetch = 0
+        self.const_read = [0, 0]  # weights, scales
+
+    def __mul__(self, scale):
+        out = ElementAccess()
+        out.ifm_read[0] = self.ifm_read[0] * scale
+        out.ifm_read[1] = self.ifm_read[1] * scale
+        out.ofm_write = self.ofm_write * scale
+        out.weights_refetch = self.weights_refetch * scale
+        out.const_read[0] = self.const_read[0] * scale
+        out.const_read[1] = self.const_read[1] * scale
+        return out
+
+    def __iadd__(self, rhs):
+        self.ifm_read[0] += rhs.ifm_read[0]
+        self.ifm_read[1] += rhs.ifm_read[1]
+        self.ofm_write += rhs.ofm_write
+        self.weights_refetch += rhs.weights_refetch
+        self.const_read[0] += rhs.const_read[0]
+        self.const_read[1] += rhs.const_read[1]
+        return self
+
+    def __str__(self):
+        return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read)
 
 
-def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
-    ifm_blk_depth = ofm_blk_depth
+def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
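+    # Byte strides per axis: [+N, +Y, +X, +Z] for NHWC, [+N, +Y, +C, +X, +Z] for NHCWB16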
+    if format == TensorFormat.NHWC:
+        strides = [0, 0, 0, 0]
+        strides[3] = element_bits / 8  # +Z
+        strides[2] = (element_bits * shape.depth) // 8  # +X
+        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
+        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
+    elif format == TensorFormat.NHCWB16:
+        strides = [0, 0, 0, 0, 0]
+        strides[4] = element_bits / 8  # +Z
+        strides[3] = (element_bits * 16) / 8  # +X
+        strides[2] = (element_bits * 16 * shape.width) / 8  # +C
+        strides[1] = (element_bits * shape.width * shape.depth) / 8  # +Y
+        strides[0] = (element_bits * shape.width * shape.depth * shape.height) / 8  # +N
 
-    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
-        if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
-            ifm_blk_depth = 16
-        elif ifm_elemwidth == 8:
-            ifm_blk_depth = 32
-        else:
-            ifm_blk_depth = 8
-
-    return min(ifm_depth, ifm_blk_depth)
+    return strides
 
 
-def get_minimal_cmd_cycles(
-    arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, ifm_shape4D, ofm_shape4D, dpu_cycles=0
+def _estimate_memory_transfer_efficiency(
+    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
 ):
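+    # Estimate transfer efficiency from the achievable burst length, which depends on the
+    # tensor format, strides and block shape; the amount to transfer is scaled up by the
+    # ratio of the architecture burst length to the achieved burst length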
-    ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
-    ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
-    cycles_ifm_blk = (
-        estimate_memory_transfer_efficiency(
-            arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk, shape4D=ifm_shape4D
-        )
-        / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
-    )
-    cycles_ofm_blk = (
-        estimate_memory_transfer_efficiency(
-            arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk, shape4D=ofm_shape4D
-        )
-        / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
-    )
-    return (
-        arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read]
-        + cycles_ifm_blk
-        + dpu_cycles
-        + output_cycles
-        + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write]
-        + cycles_ofm_blk
-    ) / 4
+    burst_len = 8
 
+    strides = _strides_for_shape(shape4D, format, element_bits)
 
-def estimate_output_cycles(
-    arch,
-    npu_block_type,
-    primary_op,
-    num_elems,
-    ifm_tensor,
-    ofm_tensor,
-    use_acc_40bits=False,
-    ifm2_tensor=None,
-    block_config: Block = None,
-):
-    faf = None if primary_op.activation is None else primary_op.activation.op_type
-    if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32:
-        if ifm2_tensor is None:
-            # Unary op
-            output_perf_index = 0
+    if format == TensorFormat.NHCWB16:
+        if strides[2] == block_size.depth:  # TODO: is this check correct for non-8-bit types?
+            burst_len = element_bits * block_size.depth * block_size.width
+        elif is_read:
+            burst_len = 16 * element_bits * block_size.width
         else:
-            # Binary op
-            output_perf_index = 1
-    elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32:
+            burst_len = 16 * element_bits * block_size.width * arch.ncores
+    elif format == TensorFormat.NHWC:
+        if is_read:
+            if strides[3] == block_size.depth:
+                burst_len = element_bits * block_size.depth * block_size.width
+            else:
+                burst_len = element_bits * block_size.depth
+        else:
+            if block_size.depth <= 16 and strides[3] == block_size.depth:
+                burst_len = element_bits * block_size.depth * block_size.width
+            else:
+                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)
+
+    burst_len = burst_len // 8  # bits->bytes
+    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
+    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)
+
+
+def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
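+    # Minimum cycles to read one IFM block and write one OFM block, including memory latency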
+    # Input block HW transfer (only for elements present)
+    ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
+    cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
+    cycles_ifm_blk = cycles_ifm_blk + (
+        _estimate_memory_transfer_efficiency(
+            arch,
+            True,
+            query.ifm_memory_area,
+            query.ifm_format,
+            query.ifm_bits,
+            query.config.ifm_block,
+            query.ifm_shape,
+            ifm_bytes,
+        )
+        / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]
+    )
+    # Output block HW transfer (only for elements present)
+    ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
+    cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
+    cycles_ofm_blk = cycles_ofm_blk + (
+        _estimate_memory_transfer_efficiency(
+            arch,
+            False,
+            query.ofm_memory_area,
+            query.ofm_format,
+            query.ofm_bits,
+            query.config.ofm_block,
+            query.ofm_shape,
+            ofm_bytes,
+        )
+        / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]
+    )
+    return cycles_ifm_blk, cycles_ofm_blk
+
+
+def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
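+    # Cycles per OFM element for the output stage, selected from the per-accelerator output
+    # and activation cycle tables; for elementwise ops the per-block command overhead is included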
+    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
+        # Unary op if there is no IFM2, otherwise binary op
+        output_perf_index = 0 if query.ifm2_shape is None else 1
+    elif op_type == Op.Mul and query.ofm_bits == 32:
         output_perf_index = 2
-    elif primary_op.type == Op.Mul or (
-        npu_block_type
+    elif op_type == Op.Mul or (
+        query.npu_block_type
         in (
             NpuBlockType.ConvolutionMxN,
             NpuBlockType.ConvolutionDepthWise,
@@ -174,31 +248,24 @@
             NpuBlockType.ReduceSum,
             NpuBlockType.VectorProduct,
         )
-        and use_acc_40bits
+        and query.config.acc_type == SHRAMElements.Acc40
     ):
         output_perf_index = 3
-    elif primary_op.type in (Op.Add, Op.Sub):
-        input_scale = ifm_tensor.quantization.scale_f32
-        input2_scale = ifm2_tensor.quantization.scale_f32
-        output_scale = ofm_tensor.quantization.scale_f32
-
-        if "resizebilinear" in primary_op.attrs:
-            output_scale = input2_scale
-
-        if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale:
+    elif op_type in (Op.Add, Op.Sub):
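+        # NOTE: the simple Add/Sub case below is disabled (constant False condition); the
+        # quantisation scale comparison used by the previous implementation is not part of
+        # PerformanceQuery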
+        if False:
             # Simple Add/Sub
             output_perf_index = 4
         else:
-            # Advanced Add/Sub
+            # Advanced Add/Sub  TODO: add simple vs advanced selection as an operator variant
             output_perf_index = 5
-    elif primary_op.type.is_maxpool_op():
+    elif op_type.is_maxpool_op():
         output_perf_index = 6
     else:
         output_perf_index = 7
 
-    if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
+    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
         activation_perf_index = 0
-    elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
+    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
         activation_perf_index = 1
     else:
         activation_perf_index = 2
@@ -207,69 +274,48 @@
         arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
     )
 
-    if primary_op.type.is_elementwise_op() and block_config is not None:
-        num_elems_blk = block_config.width * block_config.height * block_config.depth
-        cycle_cmd = get_minimal_cmd_cycles(
-            arch,
-            ifm_tensor,
-            ofm_tensor,
-            block_config,
-            block_config,
-            num_elems_blk * cycle_per_elem,
-            primary_op.ifm_shapes[0],
-            primary_op.ofm_shapes[0],
-        )
+    if op_type.is_elementwise_op():
+        num_elems_blk = query.config.ofm_block.elements()
+        ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
+        cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
+        cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4  # per DPU
         cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)
 
-    return num_elems * cycle_per_elem
+    return cycle_per_elem
 
 
-def estimate_conv_pooling_cycles(
-    arch,
-    npu_block_type,
-    primary_op,
-    ifm_block: Block,
-    ofm_block: Block,
-    block_traversal,
-    kernel_dims,
-    ifm_tensor,
-    ofm_tensor,
-    scale_tensor=None,
-):
-    ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
-    ifm_tens_shape = primary_op.ifm_shapes[0]
-    ofm_tens_shape = primary_op.ofm_shapes[0]
+def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
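+    # Estimate total NPU cycles for convolution, depthwise, vector product, pooling and
+    # reduce sum by combining per-block DPU cycles with output and command overheads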
+    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
+    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
 
     if (
         arch.config.ofm_ublock.height == 2
-        and npu_block_type
+        and query.npu_block_type
         in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
-        and ofm_tens_shape.height == 1
+        and query.ofm_shape.height == 1
         # Optimisation only applies for even width tensors
-        and ofm_tens_shape.width % 2 == 0
-        and kernel_dims[0] == 1
+        and query.ofm_shape.width % 2 == 0
+        and query.kernel.height == 1
     ):
-        ofm_ublock.width = 4
-        ofm_ublock.height = 1
-        ofm_block.height = 1
+        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
+        ofm_block = ofm_block.with_height(1)
+    else:
+        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())
 
     num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
-    num_ublk_y = ofm_block.height // ofm_ublock.height
+    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
     num_ublk_xy = num_ublk_x * num_ublk_y
-    num_ublk_z = ofm_block.depth // ofm_ublock.depth
-    num_ofm_blk = 0
-    total_cycles = 0
-    num_elems_blk = ofm_block.width * ofm_block.height * ofm_block.depth
-    use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)
+    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
+    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40
 
-    sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
-    n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
-    n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
+    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
+    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
+    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
     sub_kernel_x = [
-        min((kernel_dims[1] - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
+        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
     ]
     sub_kernel_y = [
-        min((kernel_dims[0] - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
+        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
     ]
     sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)
 
@@ -277,27 +323,27 @@
     cycles_wb = 32 * ofm_ublock.depth // 8
 
     for num_kernel_elems in sub_kernel_size:
-        if npu_block_type == NpuBlockType.Pooling:
+        if query.npu_block_type == NpuBlockType.Pooling:
             num_kernel_steps = 1
             cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
-            if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
+            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                 cycles *= 2
-        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
+        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
             cycles = 4 * num_ublk_xy
-            if ifm_tensor.dtype.size_in_bits() == 16:
+            if query.ifm_bits == 16:
                 cycles *= 2
             num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
             cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
         elif (
-            (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
-            or npu_block_type == NpuBlockType.VectorProduct
-            or npu_block_type == NpuBlockType.ReduceSum
+            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
+            or query.npu_block_type == NpuBlockType.VectorProduct
+            or query.npu_block_type == NpuBlockType.ReduceSum
         ):
             num_kernel_steps = num_kernel_elems
             cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
         else:
-            assert block_traversal == TensorBlockTraversal.PartKernelFirst
-            divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4
+            assert query.config.is_partkernel
+            divider = 2 if query.ifm_bits == 16 else 4
             num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
             cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                 num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
@@ -314,345 +360,199 @@
             if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                 delay_cycles += delay * num_ublk_z
         else:
-            delay = (
-                3
-                if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128)
-                else 2
-            )
+            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
+                delay = 3
+            else:
+                delay = 2
+
             if num_ublk_x == 1 and num_ublk_y == 1:
                 if num_ublk_z == 1:
                     delay_cycles = delay * num_kernel_steps
                 elif num_kernel_steps > 1:
                     delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
 
-        if npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal == TensorBlockTraversal.PartKernelFirst:
+        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
             delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)
 
         cycles_dpu_blk += cycles
         cycles_dpu_blk += delay_cycles
 
-    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
-        cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape.depth, ifm_block.depth)
+    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
+        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)
 
     cycles_dpu_blk /= arch.ncores
 
-    num_ofm_blk = (
-        numeric_util.round_up_divide(ofm_tens_shape.height, ofm_block.height)
-        * numeric_util.round_up_divide(ofm_tens_shape.width, ofm_block.width)
-        * numeric_util.round_up_divide(ofm_tens_shape.depth, ofm_block.depth)
-    )
+    # Estimate output cycles
+    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
+    cycles_output_blk = _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
 
-    cycles_output_blk = estimate_output_cycles(
-        arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, use_acc_40bits
-    )
-
-    if scale_tensor:
+    # Scale and bias tensor
+    if query.const_shape.depth > 0:
         cycles_bias_blk = (
-            10
-            * min(ofm_block.depth, ofm_tens_shape.depth)
-            * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read]
-            / 256
+            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
         )
         cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)
 
-    cycles_cmd = get_minimal_cmd_cycles(
-        arch,
-        ifm_tensor,
-        ofm_tensor,
-        ifm_block,
-        ofm_block,
-        cycles_dpu_blk,
-        ifm_tens_shape,
-        ofm_tens_shape,
-        cycles_output_blk,
-    )
+    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
+    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
+    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU
+
     cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
     cycles_output_blk = max(cycles_output_blk, cycles_cmd)
 
     if cycles_dpu_blk > cycles_output_blk:
-        total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk
+        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
     else:
-        total_cycles = cycles_output_blk * num_ofm_blk + cycles_dpu_blk
+        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk
 
     return total_cycles
 
 
-def estimate_memory_transfer_efficiency(
-    arch, mem_area, direction, tensor, block_size: Block, replace_bw=None, shape4D=None
-):
-    if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
-        return tensor.bandwidth() if replace_bw is None else replace_bw
+def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
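+    # A memory-to-memory transfer is limited by the slower of the source and destination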
+    from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
+    to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
+    return max(from_cycles, to_cycles)
 
-    # Estimate memory transfer efficiency by calculating the burst length
-    # this is related to data format, block shape, and tensor shape, etc.
-    burst_len = 0
-    elem_size = tensor.dtype.size_in_bytes()
-    is_ifm = direction == BandwidthDirection.Read
-    tens = tensor.clone()
 
-    if not tensor.needs_linear_format:
-        tens.set_format(TensorFormat.NHCWB16, arch)
-    strides = tens.get_strides(shape4D=shape4D)
+def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
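+    # Measure MAC count and NPU cycles for the operation described by the query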
+    cycles = CycleCost()
 
-    if tens.format == TensorFormat.NHCWB16:
-        if strides[1] == block_size.depth:
-            burst_len = elem_size * block_size.depth * block_size.width
-        elif is_ifm:
-            burst_len = 16 * elem_size * block_size.width
+    # Convolution/Vector product cycle calculation
+    if query.npu_block_type in (
+        NpuBlockType.ConvolutionMxN,
+        NpuBlockType.ConvolutionDepthWise,
+        NpuBlockType.VectorProduct,
+        NpuBlockType.Pooling,
+        NpuBlockType.ReduceSum,
+    ):
+        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
+        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
+            cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
         else:
-            burst_len = 16 * elem_size * block_size.width * arch.ncores
+            cycles.op_macs = (
+                int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
+            )
+
+        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
+    # Elementwise cycle calculation
+    elif query.npu_block_type == NpuBlockType.ElementWise:
+        cycles.op_macs = 0
+        cycles.op_cycles = int(_estimate_output_cycles_per_element(arch, op_type, faf_type, query)) * int(
+            query.ofm_shape.elements()
+        )
     else:
-        assert tens.format == TensorFormat.NHWC
-        if is_ifm:
-            if strides[3] == block_size.depth:
-                burst_len = elem_size * block_size.depth * block_size.width
-            else:
-                burst_len = elem_size * block_size.depth
-        else:
-            if block_size.depth <= 16 and strides[3] == block_size.depth:
-                burst_len = elem_size * block_size.depth * block_size.width
-            else:
-                burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)
+        assert False
 
-    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
-    bw = tens.bandwidth() if replace_bw is None else replace_bw
-
-    return bw * (arch.memory_burst_length[mem_area] / burst_len)
+    return cycles
 
 
-def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
-    if block_config is None:
-        block_config = ps.block_config
-    bws = make_bandwidth_array()
-    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
-    macs = 0
-    cycles = make_cycles_array()
-    ifm_read_multiple = 1
-    weight_read_multiple = 0
+def measure_element_access(arch, query: PerformanceQuery):
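+    # Measure raw element access counts (not bytes) for IFM reads, OFM writes and constant reads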
+    access = ElementAccess()
 
-    if ps.placement in (PassPlacement.MemoryOnly, PassPlacement.StartupInit):
-        return bws, macs, cycles, ifm_read_multiple, weight_read_multiple  # nothing real happening in this pass
+    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
+    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
+    ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))
 
-    explicit_padding = (0, 0, 0, 0)
-    primary_op = ps.primary_op
-    replacement_read_bws = {}
-    ofm_block = Block(block_config[1], block_config[0], block_config[3])
-    ifm_block = Block(block_config[1], block_config[0], block_config[3])
+    # Number of ofm blocks in the overall output shape
+    ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
+    ofm_block_depth = ofm_block.depth
+    if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
+        ofm_blocks = ofm_blocks.with_depth(1)
+        ofm_block_depth = query.ifm_shape.depth
 
-    if ps.placement == PassPlacement.Npu and primary_op:
-        explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
-        assert primary_op.type.npu_block_type == ps.npu_block_type
-        npu_block_type = primary_op.type.npu_block_type
+    # Convolution & pooling
+    if query.npu_block_type in (
+        NpuBlockType.ConvolutionMxN,
+        NpuBlockType.ConvolutionDepthWise,
+        NpuBlockType.VectorProduct,
+        NpuBlockType.Pooling,
+        NpuBlockType.ReduceSum,
+    ):
+        # Number of sub kernels
+        sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
+        subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
+        subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
 
-        ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
-        ifm_tensor_shape = ps.primary_op.ifm_shapes[0]
-        ofm_tensor_shape = ps.primary_op.ofm_shapes[0]
-        ofm_block.width = min(ofm_block.width, ofm_tensor_shape.width)
-        ofm_block.height = min(ofm_block.height, ofm_tensor_shape.height)
-        ofm_block.depth = min(ofm_block.depth, ofm_tensor_shape.depth)
+        ofm_block_count = ofm_blocks.elements()
 
-        if npu_block_type == NpuBlockType.ReduceSum:
-            block_traversal = TensorBlockTraversal.DepthFirst
-        elif npu_block_type in (
-            NpuBlockType.ConvolutionMxN,
-            NpuBlockType.ConvolutionDepthWise,
-            NpuBlockType.VectorProduct,
-        ):
-            block_traversal = weight_tensor.block_traversal
-        else:
-            block_traversal = TensorBlockTraversal.Default
-        ifm_block_depth = get_ifm_block_depth(
-            npu_block_type, ifm_tensor_shape.depth, ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
-        )
-        ifm_block = arch.get_ifm_block_size(
-            ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode
-        )
-        ifm_block.width = min(ifm_block.width, ifm_tensor_shape.width)
-        ifm_block.height = min(ifm_block.height, ifm_tensor_shape.height)
-
-        if npu_block_type in (
-            NpuBlockType.ConvolutionMxN,
-            NpuBlockType.ConvolutionDepthWise,
-            NpuBlockType.VectorProduct,
-            NpuBlockType.Pooling,
-            NpuBlockType.ReduceSum,
-        ):
-            # extent the ifm to full dimension
-
-            batch_size = ifm_tensor_shape.batch
-
-            # add in padding, height += top and bottom, width  += left and right
-            ifm_tensor_shape = ifm_tensor_shape.add(
-                0, explicit_padding[0] + explicit_padding[2], explicit_padding[1] + explicit_padding[3], 0
-            )
-
-            if npu_block_type != NpuBlockType.Pooling:
-                if npu_block_type == NpuBlockType.ReduceSum:
-                    weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]
-                    weight_tensor_bandwidth_shape = [0] * 4
-                    weight_tensor_element_size = 0
-                    weight_tensor_bandwidth_compression_scale = 0.0
-                else:
-                    # For Vector product, weight format of IO is extended to HWIO, with H=W=1
-                    weight_tensor_shape = numeric_util.full_shape(4, weight_tensor.shape, 1)
-                    weight_tensor_bandwidth_shape = numeric_util.full_shape(4, weight_tensor.bandwidth_shape, 1)
-                    weight_tensor_element_size = weight_tensor.element_size()
-                    weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale
-
-                nn_ops = (
-                    int(ofm_tensor_shape.batch)
-                    * int(ofm_tensor_shape.height)
-                    * int(ofm_tensor_shape.width)
-                    * int(weight_tensor_shape[0])
-                    * int(weight_tensor_shape[1])
-                    * int(weight_tensor_shape[2])
-                    * int(weight_tensor_shape[3])
-                )
-            else:
-                weight_tensor_shape = [
-                    *primary_op.get_kernel_size(),
-                    1,
-                    ifm_tensor_shape.depth,
-                ]
-                weight_tensor_bandwidth_shape = weight_tensor_shape
-                weight_tensor_element_size = 0
-                weight_tensor_bandwidth_compression_scale = 0.0
-                nn_ops = 0  # pooling doesn't count as NN ops
-
-            kernel_dims = weight_tensor_shape[:2]
-
-            sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
-            # count the sub kernels; the IFM block needs to be refetched for each of them
-            n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
-            n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
-            n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x
-
-            n_full_depth_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)
-            if npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
-                n_full_depth_stages = 1  # force to no reread
-
-            ifm_read_multiple = n_sub_kernels * n_full_depth_stages
-            replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple
-
-            weight_read_multiple = numeric_util.round_up_divide(
-                ofm_tensor_shape.height, ofm_block.height
-            ) * numeric_util.round_up_divide(ofm_tensor_shape.width, ofm_block.width)
-            replacement_read_bws[weight_tensor] = (
-                batch_size
-                * shape_num_elements(weight_tensor_bandwidth_shape)
-                * weight_tensor_element_size
-                * weight_tensor_bandwidth_compression_scale
-                * weight_read_multiple
-            )
-
-            macs += nn_ops
-            cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
-                arch,
-                npu_block_type,
-                primary_op,
-                ifm_block,
-                ofm_block,
-                block_traversal,
-                kernel_dims,
-                ifm_tensor,
-                ofm_tensor,
-                ps.scale_tensor,
-            )
-        elif npu_block_type == NpuBlockType.ElementWise:
-            # Work out how many elements we have and calculate performance.
-            cycles[PassCycles.Npu] = estimate_output_cycles(
-                arch,
-                npu_block_type,
-                primary_op,
-                ofm_tensor.elements(),
-                ps.ifm_tensor,
-                ps.ofm_tensor,
-                None,
-                ps.ifm2_tensor,
-                ofm_block,
-            )
-
-        prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)
-        if prev_npu_pass is None:
-            # cycles for DMA ops in first pass
-            dma_ops = (op for op in ps.ops if op.type == Op.DMA)
-            for dma_op in dma_ops:
-                mem_area = dma_op.attrs["source"]
-                for tens in dma_op.inputs:
-                    cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area]
-
-    if rewrite_list is not None:
-        # apply the desired rewrites
-        for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
-            if ps != ps_to_rewrite:
-                continue
-            if rewrite_op == SchedulerRewrite.Nop:
-                pass  # these are fine, no bandwidth changes
-            elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
-                bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens]
-                if tens.purpose == TensorPurpose.FeatureMap:
-                    scaled_bw = estimate_memory_transfer_efficiency(
-                        arch,
-                        arch.fast_storage_mem_area,
-                        BandwidthDirection.Read,
-                        tens,
-                        ifm_block,
-                        replacement_read_bws[tens],
-                    )
-                else:
-                    scaled_bw = replacement_read_bws[tens]
-                scaled_bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += scaled_bw
-                replacement_read_bws[tens] = 0
-
-    for tens in ps.outputs:
-        if force_outputs_to_fast_storage:
-            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
-            scaled_bws[arch.fast_storage_mem_area][tens.purpose][
-                BandwidthDirection.Write
-            ] += estimate_memory_transfer_efficiency(
-                arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0],
-            )
-        else:
-            bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
-            scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_transfer_efficiency(
-                arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0]
-            )
-
-    for tens in ps.intermediates:
-        bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
-        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
-
-        if tens in replacement_read_bws:
-            bw = replacement_read_bws[tens]
-        else:
-            bw = tens.bandwidth()
-
-        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
-        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
-
-    for tens in ps.inputs:
-        if tens in replacement_read_bws:
-            bw = replacement_read_bws[tens]
-        else:
-            bw = tens.bandwidth()
-
-        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
-
-        op_shape = None
-        if ps.placement == PassPlacement.Npu and primary_op:
-            if tens == ps.ifm_tensor:
-                op_shape = ps.ifm_shapes[0]
-            elif tens == ps.ifm2_tensor:
-                op_shape = ps.ifm_shapes[1]
-
-        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_transfer_efficiency(
-            arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw, op_shape
+        ifm_fetch = (
+            Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
+            * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
         )
 
-    # quick build access counts for only current pass, even though these aren't the final numbers
-    update_summary_cycles(arch, scaled_bws, cycles)
+        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
+            kernel_read = query.kernel.elements_wh() * 1  # force to no reread
+        else:
+            kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth
 
-    return bws, macs, cycles, ifm_read_multiple, weight_read_multiple
+        weight_fetch = kernel_read * ofm_block_depth * ofm_block_count
+
+        access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count
+
+        if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
+            access.const_read[0] = weight_fetch
+            access.const_read[1] = query.ofm_shape.depth  # Scales & biases
+            access.weights_refetch = ofm_blocks.elements_wh()
+    # Elementwise
+    elif query.npu_block_type == NpuBlockType.ElementWise:
+        if query.ifm_shape.elements() == 1:
+            if query.ifm_bits > 8:
+                # ifm is a non-8-bit scalar
+                access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
+            if query.ifm2_shape:
+                access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
+        else:
+            access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
+            if query.ifm2_shape:
+                if query.ifm2_shape.elements() > 1:
+                    access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
+                elif query.ifm2_bits > 8:
+                    # ifm2 is a non-8-bit scalar
+                    access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
+    # Unknown
+    else:
+        assert False
+
+    ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
+    access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
+    return access
+
+
+def measure_performance_cost(
+    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
+):
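+    # Measure element access and cycle cost, optionally restricted to a sub-shape of the OFM
+    # starting at the given offset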
+    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
+    assert query.ofm_shape.elements() != 0
+
+    # Default to start if no offset provided
+    if offset is None:
+        offset = Shape4D(0, 0, 0, 0)
+
+    # Default to entire area if no sub-shape provided
+    if sub_shape is None:
+        sub_shape = query.ofm_shape
+    else:
+        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)
+
+    sub_query = copy.deepcopy(query)
+    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)
+
+    cycles = measure_cycle_cost(arch, op_type, faf_type, sub_query)
+    access = measure_element_access(arch, sub_query)
+
+    return access, cycles
+
+
+def make_bandwidth_array():
+    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))
+
+
+def make_cycles_array():
+    return np.zeros(PassCycles.Size)
 
 
 def update_summary_cycles(arch, bws, cycles):
@@ -669,42 +569,169 @@
     return cycles
 
 
-def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
-    return bws, macs, cycles
+def estimate_full_op_performance(
+    arch, schedule: Schedule, op: SchedulerOperation, prev_op: SchedulerOperation, block_config
+):
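+    # Full estimate for one scheduled operation: NPU compute cycles plus weight, LUT and
+    # feature map transfer bandwidth, taking slack cycles under the previous operation into account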
+    cycles_a = make_cycles_array()
+    bws = make_bandwidth_array()
+    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
+    macs = 0
+
+    query = PerformanceQuery(op.op_type.npu_block_type)
+    query.ifm_shape = op.ifm.shape
+    query.ifm_format = op.ifm.format
+    query.ifm_memory_area = op.ifm.mem_area
+    query.ifm_bits = op.ifm.dtype.size_in_bits()
+    query.ifm2_shape = op.ifm2 and op.ifm2.shape
+    query.ifm2_format = op.ifm2 and op.ifm2.format
+    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
+    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
+    query.ofm_shape = op.ofm.shape
+    query.ofm_memory_area = op.ofm.mem_area
+    query.ofm_bits = op.ofm.dtype.size_in_bits()
+    query.ofm_format = op.ofm.format
+    query.kernel = op.kernel
+    query.config = block_config
+
+    cost = schedule.cost_map[op]
+    prev_cost = schedule.cost_map[prev_op] if prev_op else None
+    if op.parent_op.bias:
+        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
+        if cost.buffered_weight_tensor:
+            query.const_memory_area = cost.buffered_weight_tensor.mem_area
+        else:
+            query.const_memory_area = cost.npu_weights_tensor.mem_area
+
+    cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query)
+    cycles_a[PassCycles.Npu] = cycles.op_cycles
+    macs = cycles.op_macs
+
+    access = measure_element_access(arch, query)
+
+    # How many NPU cycles are available under the previously executing
+    # operator for performing buffered DMA transfers
+    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
+
+    # LUT Transfer
+    parent_op = op.parent_op
+    lut_transfer_cycles = 0
+    if parent_op.activation_lut:
+        lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
+        src_tensor = lut_tensor.src_tensor
+        if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
+            bw = src_tensor.storage_size()
+            lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)
+
+            bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
+            # LUT read from SHRAM TODO remove?
+            scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][
+                BandwidthDirection.Read
+            ] += _estimate_memory_transfer_efficiency(
+                arch,
+                True,
+                lut_tensor.mem_area,
+                lut_tensor.format,
+                lut_tensor.element_size(),
+                query.config.ifm_block,
+                Shape4D(lut_tensor.shape),
+                bw,
+            )
+
+    if cost.npu_weights_tensor and cost.buffered_weight_tensor:
+        # DMA Weight Transfer
+        sz = 0
+        # Get the size of the first DMA
+        for core in range(0, arch.ncores):
+            key = WeightKey(core, 0)
+            if key in cost.npu_weights_tensor.encoded_ranges:
+                weight_range = cost.npu_weights_tensor.encoded_ranges[key]
+                sz += round_up(weight_range.total_bytes, 16)
+
+        total_sz = len(cost.npu_weights_tensor.buffer)
+        bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
+        bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz
+
+        ws_first_transfer_cycles = measure_mem2mem_cycles(
+            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz
+        )
+
+        # Add cycles for Weight + Scale Transfer
+        cycles_a[PassCycles.Npu] = max(
+            cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
+            cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
+        )
+
+        # Add cycles for LUT Transfer
+        cycles_a[PassCycles.Npu] += lut_transfer_cycles
+    else:
+        # Add cycles for LUT Transfer
+        cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0)
+
+    # OFM write
+    ofm = op.parent_op.ofm
+    bw = access.ofm_write * ofm.element_size()
+    bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
+    scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
+        arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
+    )
+
+    # IFM read
+    ifm = op.parent_op.ifm
+    bw = access.ifm_read[0] * ifm.element_size()
+    bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
+    scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
+        arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
+    )
+    if query.ifm2_shape:
+        ifm2 = op.parent_op.ifm2
+        bw = access.ifm_read[1] * ifm2.element_size()
+        bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
+        scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
+            arch,
+            True,
+            query.ifm2_memory_area,
+            ifm2.format,
+            op.ifm2.dtype.size_in_bits(),
+            query.config.ifm_block,
+            query.ifm2_shape,
+            bw,
+        )
+
+    # Weight read
+    if access.const_read[0] > 0:
+        # alignment not accounted for in bandwidth_compression_scale_approx
+        encoded_size_approx = (
+            cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size()
+        )
+        orig_weight_size = parent_op.weights.elements()
+        bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
+        bw = access.const_read[0] * bandwidth_compression_scale_approx
+        bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw
+
+    if access.const_read[1] > 0:
+        # Scales & biases
+        bw = access.const_read[1] * op.parent_op.bias.element_size()
+        bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw
+
+    update_summary_cycles(arch, scaled_bws, cycles_a)
+
+    return bws, macs, cycles_a
 
 
-def performance_for_cascaded_pass(arch, cps):
-    total_bws = make_bandwidth_array()
-    total_macs = 0
-    total_cycles = make_cycles_array()
-
-    for ps in cps.passes:
-        bws, macs, cycles, _, _ = performance_metrics_for_pass(arch, ps)
-        ps.bandwidths = bws
-        ps.macs = macs
-        ps.cycles = cycles
-        total_bws += bws
-        total_macs += macs
-        total_cycles += cycles
-
-    bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles)
-    cps.bandwidths = bws
-    cps.macs = macs
-    cps.cycles = cycles
-    return bws, macs, cycles
-
-
-def calc_performance_for_network(nng, arch):
+def calc_new_performance_for_network(nng, arch):
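+    # Accumulate per-operation estimates over all scheduled operations in every subgraph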
     total_bws = make_bandwidth_array()
     total_macs = 0
     total_cycles = np.zeros(PassCycles.Size)
 
     for sg in nng.subgraphs:
-        for cps in sg.cascaded_passes:
-            bws, macs, cycles = performance_for_cascaded_pass(arch, cps)
+        prev_op = None
+        for sched_op in sg.sched_ops:
+            op_info = sg.schedule.cost_map[sched_op]
+            bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config)
             total_bws += bws
             total_macs += macs
             total_cycles += cycles
+            prev_op = sched_op
 
     nng.bandwidths = total_bws
     nng.macs = total_macs