| # SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com> |
| # |
| # SPDX-License-Identifier: Apache-2.0 |
| # |
| # Licensed under the Apache License, Version 2.0 (the License); you may |
| # not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # |
| # Description: |
# NPU performance estimation functions for a Pass and CascadedPass. Uses a model that takes the
# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
| # |
| # Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance |
| # estimate. |
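#
# Example: under this model, an operator that needs 10k NPU compute cycles but
# whose DRAM traffic alone would take 25k cycles is estimated at 25k cycles.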
| import copy |
| import csv |
| from enum import auto |
| from enum import IntEnum |
| from typing import Optional |
| from typing import Set |
| from uuid import UUID |
| |
| import numpy as np |
| |
| from . import numeric_util |
| from .architecture_allocator import ArchitectureBlockConfig |
| from .architecture_features import Accelerator |
| from .architecture_features import ArchitectureFeatures |
| from .architecture_features import NpuBlockType |
| from .architecture_features import SHRAMElements |
| from .architecture_features import TensorFormat |
| from .debug_database import DebugDatabase |
| from .nn_graph import Graph |
| from .nn_graph import NetworkType |
| from .nn_graph import PassPlacement |
| from .numeric_util import round_up |
| from .numeric_util import round_up_to_int |
| from .operation import Kernel |
| from .operation import Op |
| from .scheduler import Schedule |
| from .scheduler import SchedulerOperation |
| from .scheduler import SchedulerOpInfo |
| from .shape4d import Shape4D |
| from .tensor import BandwidthDirection |
| from .tensor import MemArea |
| from .tensor import TensorPurpose |
| from .tensor import TensorSubPurpose |
| from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype |
| from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type |
| from .weight_compressor import WeightKey |
| |
| |
| class PassCycles(IntEnum): |
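    """Index enum for the per-pass cycle count array.

    Total is derived as the max of the other components (see
    update_summary_cycles) and Size is a sentinel holding the array length.
    """
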
| Npu = 0 |
| SramAccess = auto() |
| DramAccess = auto() |
| OnChipFlashAccess = auto() |
| OffChipFlashAccess = auto() |
| Total = auto() |
| Size = auto() |
| |
| def display_name(self): |
| return ( |
| "NPU", |
| "SRAM Access", |
| "DRAM Access", |
| "On-chip Flash Access", |
| "Off-chip Flash Access", |
| "Total", |
| "Size", |
| )[self.value] |
| |
| def identifier_name(self): |
| return ( |
| "npu", |
| "sram_access", |
| "dram_access", |
| "on_chip_flash_access", |
| "off_chip_flash_access", |
| "total", |
| "size", |
| )[self.value] |
| |
| @staticmethod |
| def all(): |
| return ( |
| PassCycles.Npu, |
| PassCycles.SramAccess, |
| PassCycles.DramAccess, |
| PassCycles.OnChipFlashAccess, |
| PassCycles.OffChipFlashAccess, |
| PassCycles.Total, |
| ) |
| |
| |
| class PerformanceQuery: |
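    """Collects everything needed to estimate the performance of a single NPU operation."""
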
    def __init__(self, npu_block_type=NpuBlockType.Default):
| self.npu_block_type = npu_block_type |
| self.ifm_shape = Shape4D(0) |
| self.ifm_format = TensorFormat.NHWC |
| self.ifm_memory_area = MemArea.Unknown |
| self.ifm2_memory_area = MemArea.Unknown |
| self.ifm_bits = 0 |
| self.ifm2_bits = 0 |
| self.ifm2_shape = None |
| self.ifm2_format = TensorFormat.NHWC |
| self.ofm_shape = Shape4D(0) |
| self.ofm_format = TensorFormat.NHWC |
| self.ofm_memory_area = MemArea.Unknown |
| self.ofm_bits = 0 |
| self.const_shape = Shape4D(0) |
| self.const_memory_area = MemArea.Unknown |
| self.kernel = Kernel(1, 1) |
| self.config = ArchitectureBlockConfig() |
| |
| |
| class CycleCost: |
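    """MAC and cycle count for an operation; supports scaling (*) and accumulation (+=)."""
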
| def __init__(self): |
| self.op_macs = 0 |
| self.op_cycles = 0 |
| |
| def __mul__(self, scale): |
| out = CycleCost() |
| out.op_macs = self.op_macs * scale |
| out.op_cycles = self.op_cycles * scale |
| return out |
| |
| def __iadd__(self, rhs): |
| self.op_macs += rhs.op_macs |
| self.op_cycles += rhs.op_cycles |
| return self |
| |
| def __str__(self): |
| return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles) |
| |
| |
| class ElementAccess: |
| def __init__(self): |
        # List of ONLY element access counts; consumers
        # need to scale these values by the correct bitwidths
        # to calculate memory bandwidth
| self.ifm_read = [0, 0] # ifm1, ifm2 |
| self.ofm_write = 0 |
| self.weights_refetch = 0 |
| self.const_read = [0, 0] # weights, scales |
| |
| def __mul__(self, scale): |
| out = ElementAccess() |
| out.ifm_read[0] = self.ifm_read[0] * scale |
| out.ifm_read[1] = self.ifm_read[1] * scale |
| out.ofm_write = self.ofm_write * scale |
| out.weights_refetch = self.weights_refetch * scale |
| out.const_read[0] = self.const_read[0] * scale |
| out.const_read[1] = self.const_read[1] * scale |
| return out |
| |
| def __iadd__(self, rhs): |
| self.ifm_read[0] += rhs.ifm_read[0] |
| self.ifm_read[1] += rhs.ifm_read[1] |
| self.ofm_write += rhs.ofm_write |
| self.weights_refetch += rhs.weights_refetch |
| self.const_read[0] += rhs.const_read[0] |
| self.const_read[1] += rhs.const_read[1] |
| return self |
| |
| def __str__(self): |
| return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read) |
| |
| |
| def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits): |
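    """Return the per-dimension strides, in bytes, of a tensor of the given shape and format.

    NHWC yields four strides ordered [+N, +Y, +X, +Z]; NHCWB16 yields the five
    strides [+N, +Y, +C, +X, +Z] of its 16-channel brick layout.
    """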
| if format == TensorFormat.NHWC: |
        strides = [0, 0, 0, 0]
        strides[3] = element_bits // 8  # +Z
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits // 8  # +Z
        strides[3] = (element_bits * 16) // 8  # +X
        strides[2] = (element_bits * 16 * shape.width) // 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) // 8  # +Y
        strides[0] = (element_bits * shape.width * shape.depth * shape.height) // 8  # +N
| |
| return strides |
| |
| |
| def _estimate_memory_transfer_efficiency( |
| arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer |
| ): |
| burst_len = 8 |
| |
| strides = _strides_for_shape(shape4D, format, element_bits) |
| |
| if format == TensorFormat.NHCWB16: |
        if strides[2] == block_size.depth:  # TODO is this check correct for non 8-bit?
| burst_len = element_bits * block_size.depth * block_size.width |
| elif is_read: |
| burst_len = 16 * element_bits * block_size.width |
| else: |
| burst_len = 16 * element_bits * block_size.width * arch.ncores |
| elif format == TensorFormat.NHWC: |
| if is_read: |
| if strides[3] == block_size.depth: |
| burst_len = element_bits * block_size.depth * block_size.width |
| else: |
| burst_len = element_bits * block_size.depth |
| else: |
| if block_size.depth <= 16 and strides[3] == block_size.depth: |
| burst_len = element_bits * block_size.depth * block_size.width |
| else: |
| burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits) |
| |
| burst_len = burst_len // 8 # bits->bytes |
| burst_len = min(arch.memory_burst_length[mem_area], burst_len) |
| return to_transfer * (arch.memory_burst_length[mem_area] / burst_len) |
| |
| |
| def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery): |
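    """Return (ifm_cycles, ofm_cycles): the minimum cycles needed to read one
    IFM block and write one OFM block, including memory latency and the
    burst-efficiency scaling of the transfer."""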
| # Input block HW transfer (only for elements present) |
| ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements() |
| cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read] |
| cycles_ifm_blk = cycles_ifm_blk + ( |
| _estimate_memory_transfer_efficiency( |
| arch, |
| True, |
| query.ifm_memory_area, |
| query.ifm_format, |
| query.ifm_bits, |
| query.config.ifm_block, |
| query.ifm_shape, |
| ifm_bytes, |
| ) |
| / arch.memory_bandwidths_per_cycle[query.ifm_memory_area] |
| ) |
| # Output block HW transfer (only for elements present) |
| ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements() |
| cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write] |
| cycles_ofm_blk = cycles_ofm_blk + ( |
| _estimate_memory_transfer_efficiency( |
| arch, |
| False, |
| query.ofm_memory_area, |
| query.ofm_format, |
| query.ofm_bits, |
| query.config.ofm_block, |
| query.ofm_shape, |
| ofm_bytes, |
| ) |
| / arch.memory_bandwidths_per_cycle[query.ofm_memory_area] |
| ) |
| return cycles_ifm_blk, cycles_ofm_blk |
| |
| |
| def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery): |
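    """Estimate the cycles per element for producing the OFM.

    Picks a per-element cost from the architecture performance tables based on
    operator type, data width and accumulator type, takes the max with the
    activation-function cost, and for elementwise ops additionally bounds it by
    the per-block command and memory overhead.
    """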
| if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32: |
        # Binary op (ifm2 present), else unary op
        output_perf_index = 0 if query.ifm2_shape is not None else 1
| elif op_type == Op.Mul and query.ofm_bits == 32: |
| output_perf_index = 2 |
| elif op_type == Op.Mul or ( |
| query.npu_block_type |
| in ( |
| NpuBlockType.ConvolutionMxN, |
| NpuBlockType.ConvolutionDepthWise, |
| NpuBlockType.Pooling, |
| NpuBlockType.ReduceSum, |
| NpuBlockType.VectorProduct, |
| ) |
| and query.config.acc_type == SHRAMElements.Acc40 |
| ): |
| output_perf_index = 3 |
    elif op_type in (Op.Add, Op.Sub):
        # Advanced Add/Sub is always assumed; the simple Add/Sub variant
        # (perf index 4) is never selected.
        # TODO: Add as perf selection as operator variant
        output_perf_index = 5
| elif op_type.is_maxpool_op(): |
| output_perf_index = 6 |
| else: |
| output_perf_index = 7 |
| |
| if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT): |
| activation_perf_index = 0 |
| elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1): |
| activation_perf_index = 1 |
| else: |
| activation_perf_index = 2 |
| |
| cycle_per_elem = max( |
| arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index] |
| ) |
| |
| if op_type.is_elementwise_op(): |
| num_elems_blk = query.config.ofm_block.elements() |
| ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query) |
| cycle_cmd = ifm_blk_cycles + ofm_blk_cycles |
| cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4 # per DPU |
| cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk) |
| |
| return cycle_per_elem |
| |
| |
| def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery): |
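    """Estimate the total NPU cycles for a convolution-family operation.

    Derives the DPU cycles per OFM block from the sub-kernel decomposition and
    the OFM micro-block (ublock) geometry, adds accelerator-specific pipeline
    delay cycles, then combines block compute with output/bias cycles assuming
    the slower stage dominates across all OFM blocks.
    """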
| ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block) |
| ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block) |
| |
| if ( |
| arch.config.ofm_ublock.height == 2 |
| and query.npu_block_type |
| in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct) |
| and query.ofm_shape.height == 1 |
| # Optimisation only applies for even width tensors |
| and query.ofm_shape.width % 2 == 0 |
| and query.kernel.height == 1 |
| ): |
| ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth) |
| ofm_block = ofm_block.with_height(1) |
| else: |
| ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc()) |
| |
| num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width) |
| num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height) |
| num_ublk_xy = num_ublk_x * num_ublk_y |
| num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth) |
| use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40 |
| |
| sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type] |
| n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0]) |
| n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1]) |
| sub_kernel_x = [ |
| min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x) |
| ] |
| sub_kernel_y = [ |
| min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y) |
| ] |
| sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x) |
| |
| cycles_dpu_blk = 0 |
| cycles_wb = 32 * ofm_ublock.depth // 8 |
| |
| for num_kernel_elems in sub_kernel_size: |
| if query.npu_block_type == NpuBlockType.Pooling: |
| num_kernel_steps = 1 |
| cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z |
| if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32: |
| cycles *= 2 |
| elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise: |
| cycles = 4 * num_ublk_xy |
| if query.ifm_bits == 16: |
| cycles *= 2 |
| num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4) |
| cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z |
| elif ( |
| (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel) |
| or query.npu_block_type == NpuBlockType.VectorProduct |
| or query.npu_block_type == NpuBlockType.ReduceSum |
| ): |
| num_kernel_steps = num_kernel_elems |
| cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z |
| else: |
| assert query.config.is_partkernel |
| divider = 2 if query.ifm_bits == 16 else 4 |
| num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider) |
| cycles = max(cycles_wb, 4 * num_ublk_xy) * ( |
| num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z |
| ) |
| |
| delay_cycles = 0 |
| if arch.accelerator_config is Accelerator.Ethos_U55_32: |
| delay = 7 if use_acc_40bits else 3 |
| if num_ublk_x == 1 and num_ublk_y == 1: |
| if num_ublk_z == 1: |
| delay_cycles = delay * num_kernel_steps |
| elif num_kernel_steps > 1: |
| delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z |
| if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits: |
| delay_cycles += delay * num_ublk_z |
| else: |
| if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128): |
| delay = 3 |
| else: |
| delay = 2 |
| |
| if num_ublk_x == 1 and num_ublk_y == 1: |
| if num_ublk_z == 1: |
| delay_cycles = delay * num_kernel_steps |
| elif num_kernel_steps > 1: |
| delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z |
| |
| if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel: |
| delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8) |
| |
| cycles_dpu_blk += cycles |
| cycles_dpu_blk += delay_cycles |
| |
| if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum): |
| cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth) |
| |
| cycles_dpu_blk /= arch.ncores |
| |
| # Estimate output cycles |
| num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements() |
| cycles_output_blk = round_up_to_int( |
| _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements() |
| ) |
| |
| # Scale and bias tensor |
| if query.const_shape.depth > 0: |
| cycles_bias_blk = ( |
| 10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256 |
| ) |
| cycles_output_blk = max(cycles_output_blk, cycles_bias_blk) |
| |
| ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query) |
| cycles_cmd = ifm_blk_cycles + ofm_blk_cycles |
| cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4 # per DPU |
| |
| cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd) |
| cycles_output_blk = max(cycles_output_blk, cycles_cmd) |
| |
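    # Pipelined estimate: the slower of the DPU and output stages dominates,
    # plus one trailing iteration of the faster stage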
| if cycles_dpu_blk > cycles_output_blk: |
| total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk |
| else: |
| total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk |
| |
| return total_cycles |
| |
| |
| def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer): |
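    """Estimate DMA cycles for transferring to_transfer bytes: the max of the
    source read time (including read latency) and the destination write time."""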
| from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area] |
| from_cycles += arch.memory_latency[from_mem_area][BandwidthDirection.Read] |
| to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area] |
| return max(from_cycles, to_cycles) |
| |
| |
| def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery): |
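    """Return the CycleCost (MACs and NPU cycles) of a single operation described by query."""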
| cycles = CycleCost() |
| |
| # Convolution/Vector product cycle calculation |
| if query.npu_block_type in ( |
| NpuBlockType.ConvolutionMxN, |
| NpuBlockType.ConvolutionDepthWise, |
| NpuBlockType.VectorProduct, |
| NpuBlockType.Pooling, |
| NpuBlockType.ReduceSum, |
| ): |
        # cycles.op_macs and cycles.op_cycles should both be able to hold values larger than 32 bits
| if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): |
| cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements()) |
| else: |
| cycles.op_macs = ( |
| int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements()) |
| ) |
| |
| cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query)) |
| # Elementwise cycle calculation |
| elif query.npu_block_type == NpuBlockType.ElementWise: |
| cycles.op_macs = 0 |
| ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format])) |
| cycles.op_cycles = round_up_to_int( |
| _estimate_output_cycles_per_element(arch, op_type, faf_type, query) |
| * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements() |
| ) |
| else: |
| assert False |
| |
| return cycles |
| |
| |
| def measure_element_access(arch, query: PerformanceQuery): |
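    """Count the element accesses (IFM/IFM2 reads, OFM writes, weight and
    scale/bias reads) implied by the query's block decomposition."""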
| access = ElementAccess() |
| |
| ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block) |
| ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block) |
| ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format])) |
| |
| # Number of ofm blocks in the overall output shape |
| ofm_blocks = query.ofm_shape.div_round_up(ofm_block) |
| ofm_block_depth = ofm_block.depth |
| if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): |
| ofm_blocks = ofm_blocks.with_depth(1) |
| ofm_block_depth = query.ifm_shape.depth |
| |
| # Convolution & pooling |
| if query.npu_block_type in ( |
| NpuBlockType.ConvolutionMxN, |
| NpuBlockType.ConvolutionDepthWise, |
| NpuBlockType.VectorProduct, |
| NpuBlockType.Pooling, |
| NpuBlockType.ReduceSum, |
| ): |
| # Number of sub kernels |
| sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type] |
        subkernels = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
        subkernels *= numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
| |
| ofm_block_count = ofm_blocks.elements() |
| |
| ifm_fetch = ( |
| Shape4D.round_up(ifm_block, ifm_rounding).elements_wh() |
| * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth |
| ) |
| |
| if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): |
            kernel_read = query.kernel.elements_wh() * 1  # depth multiplier 1: no re-read over IFM depth
| else: |
| kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth |
| |
| weight_fetch = kernel_read * ofm_block_depth * ofm_block_count |
| |
| access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count |
| |
| if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum): |
| access.const_read[0] = weight_fetch |
| access.const_read[1] = query.ofm_shape.depth # Scales & biases |
| access.weights_refetch = ofm_blocks.elements_wh() |
| # Elementwise |
| elif query.npu_block_type == NpuBlockType.ElementWise: |
| if query.ifm_shape.elements() == 1: |
| if query.ifm_bits > 8: |
| # ifm is a non 8-bit scalar |
| access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements() |
| if query.ifm2_shape: |
| access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() |
| else: |
| access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() |
| if query.ifm2_shape: |
| if query.ifm2_shape.elements() > 1: |
| access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() |
| elif query.ifm2_bits > 8: |
| # ifm2 is a non 8-bit scalar |
| access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements() |
| # Unknown |
| else: |
| assert False |
| |
| ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format])) |
| access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements() |
| return access |
| |
| |
| def measure_performance_cost( |
| arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D |
| ): |
| assert (query.ofm_bits > 0) and (query.ifm_bits > 0) |
| assert query.ofm_shape.elements() != 0 |
| |
| # Default to start if no offset provided |
| if offset is None: |
| offset = Shape4D(0, 0, 0, 0) |
| |
| # Default to entire area if no sub-shape provided |
| if sub_shape is None: |
| sub_shape = query.ofm_shape |
| else: |
| sub_shape = Shape4D.min(sub_shape, query.ofm_shape) |
| |
| sub_query = copy.deepcopy(query) |
| sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape) |
| |
    cycles = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    access = measure_element_access(arch, sub_query)
| |
| return access, cycles |
| |
| |
| def make_bandwidth_array(): |
| return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size)) |
| |
| |
| def make_cycles_array(): |
| return np.zeros(PassCycles.Size) |
| |
| |
| def update_summary_cycles(arch, bws, cycles): |
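    """Convert per-memory bandwidth totals into access cycles and set
    cycles[Total] to the max over NPU and memory-access cycles."""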
| cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram] |
| cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram] |
| cycles[PassCycles.OnChipFlashAccess] = ( |
| np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash] |
| ) |
| cycles[PassCycles.OffChipFlashAccess] = ( |
| np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash] |
| ) |
| |
| cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total]) |
| return cycles |
| |
| |
| def estimate_full_op_performance( |
| arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config |
| ): |
| cycles_a = make_cycles_array() |
| bws = make_bandwidth_array() |
| scaled_bws = make_bandwidth_array() # scaled bw with memory transfer efficiency |
| macs = 0 |
| |
| query = PerformanceQuery(op.op_type.npu_block_type) |
| query.ifm_shape = op.ifm.shape |
| query.ifm_format = op.ifm.format |
| query.ifm_memory_area = op.ifm.mem_area |
| query.ifm_bits = op.ifm.dtype.size_in_bits() |
| query.ifm2_shape = op.ifm2 and op.ifm2.shape |
| query.ifm2_format = op.ifm2 and op.ifm2.format |
| query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area |
| query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() |
| query.ofm_shape = op.ofm.shape |
| query.ofm_memory_area = op.ofm.mem_area |
| query.ofm_bits = op.ofm.dtype.size_in_bits() |
| query.ofm_format = op.ofm.format |
| query.kernel = op.kernel |
| query.config = block_config |
| |
| cost = schedule.cost_map[op] |
| prev_cost = schedule.cost_map[prev_op] if prev_op else None |
| if op.parent_op.bias: |
| query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth) |
| if cost.buffered_weight_tensors: |
| query.const_memory_area = cost.buffered_weight_tensors[0].mem_area |
| else: |
| query.const_memory_area = cost.npu_weights_tensor.mem_area |
| |
| cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query) |
| cycles_a[PassCycles.Npu] = cycles.op_cycles |
| macs = cycles.op_macs |
| |
| access = measure_element_access(arch, query) |
| |
| # How many NPU cycles are available under the previously executing |
| # operator for performing buffered DMA transfers |
| slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0 |
| |
| # LUT Transfer |
| parent_op = op.parent_op |
| lut_transfer_cycles = 0 |
| if parent_op.activation_lut: |
| lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0] |
| src_tensor = lut_tensor.src_tensor |
| if src_tensor and lut_tensor.mem_area != src_tensor.mem_area: |
| bw = src_tensor.storage_size() |
| lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw) |
| |
| bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw |
| # LUT read from SHRAM TODO remove? |
| scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw |
| |
| if cost.npu_weights_tensor and cost.buffered_weight_tensors: |
| # DMA Weight Transfer |
| sz = 0 |
| # Get the size of the first DMA |
| for core in range(0, arch.ncores): |
| key = WeightKey(core, 0) |
| if key in cost.npu_weights_tensor.encoded_ranges: |
| weight_range = cost.npu_weights_tensor.encoded_ranges[key] |
| sz += round_up(weight_range.total_bytes, 16) |
| |
| total_sz = len(cost.npu_weights_tensor.buffer) |
| bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz |
| bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz |
| |
| ws_first_transfer_cycles = measure_mem2mem_cycles( |
| arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz |
| ) |
| |
| # Add cycles for Weight + Scale Transfer |
| if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer: |
| # Double buffer - weights can be fetched in parallel |
| cycles_a[PassCycles.Npu] = max( |
| cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles, |
| cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0), |
| ) |
| else: |
| # Standard buffer - weights can not be fetched in parallel so weight transfer |
| # must be included in the result |
| cycles_a[PassCycles.Npu] = ( |
| cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles) |
| ) |
| |
| # Add cycles for LUT Transfer |
| cycles_a[PassCycles.Npu] += lut_transfer_cycles |
| else: |
| # Add cycles for LUT Transfer |
| cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0) |
| |
| # OFM write |
| ofm = op.parent_op.ofm |
| bw = access.ofm_write * ofm.element_size() |
| bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw |
| scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency( |
| arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw |
| ) |
| |
| # IFM read |
| ifm = op.parent_op.ifm2 if op.reversed_operands else op.parent_op.ifm |
| bw = access.ifm_read[0] * ifm.element_size() |
| bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw |
| scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( |
| arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw |
| ) |
| |
| if query.ifm2_shape: |
| ifm2 = op.parent_op.ifm if op.reversed_operands else op.parent_op.ifm2 |
| bw = access.ifm_read[1] * ifm2.element_size() |
| bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw |
| scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( |
| arch, |
| True, |
| query.ifm2_memory_area, |
| ifm2.format, |
| op.ifm2.dtype.size_in_bits(), |
| query.config.ifm_block, |
| query.ifm2_shape, |
| bw, |
| ) |
| |
| # Weight read |
| if access.const_read[0] > 0: |
| # alignment not accounted for in bandwidth_compression_scale_approx |
| encoded_size_approx = ( |
| cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size() |
| ) |
| orig_weight_size = parent_op.weights.elements() |
| bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size |
| bw = access.const_read[0] * bandwidth_compression_scale_approx |
| bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw |
| |
| if not cost.buffered_weight_tensors: |
| scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw |
| |
| if access.const_read[1] > 0: |
| # Scales & biases |
| bw = access.const_read[1] * op.parent_op.bias.element_size() |
| bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw |
| |
| if not cost.buffered_weight_tensors: |
| scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw |
| |
| update_summary_cycles(arch, scaled_bws, cycles_a) |
| |
| return bws, macs, cycles_a |
| |
| |
| def print_performance( |
| nng: Graph, |
| arch: ArchitectureFeatures, |
| network_type: NetworkType, |
| bws: dict, |
| macs: dict, |
| cycles: dict, |
| mem_usage: dict, |
| output_basename: str, |
| ): |
| def _percentage(part, whole): |
| # desired behaviour is for division by zero to return 100% |
| if whole == 0: |
| return 100.0 |
| else: |
| return part / whole * 100.0 |
| |
| if network_type == NetworkType.TFLite: |
| nng_optype_to_input_op_type = tflite_optype_to_builtintype |
| else: |
| nng_optype_to_input_op_type = tosa_optype_to_tosa_op_type |
| |
| suid_inv_map = {v: k for k, v in DebugDatabase._sourceUID.items()} |
| |
| # the header is a list (one entry per column) of tuples (column name, alignment, width, precision) |
| header = [ |
| (f"{network_type.name}_operator", "<", 20, -1), |
| ("NNG Operator", "<", 20, -1), |
| ("SRAM Usage", ">", 10, 0.0), |
| ("Peak%", ">", 6, 0.2), |
| ("Op Cycles", ">", 10, 0.0), |
| ("Network%", ">", 8, 0.2), |
| ("NPU", ">", 10, 0.0), |
| ("SRAM AC", ">", 10, 0.0), |
| ("DRAM AC", ">", 10, 0.0), |
| ("OnFlash AC", ">", 10, 0.0), |
| ("OffFlash AC", ">", 11, 0.0), |
| ("MAC Count", ">", 10, 0.0), |
| ("Network%", ">", 8, 0.2), |
| ("Util%", ">", 6, 0.2), |
| ("Name", "<", 20, -1), |
| ] |
| |
| # open the csv |
| csv_file = open(output_basename + "_per-layer.csv", "w", encoding="UTF8") |
| writer = csv.writer(csv_file) |
| |
| for sg in nng.subgraphs: |
| |
| if sg.placement != PassPlacement.Npu: |
| continue |
| |
        sg_separator_text = f"\n{'#' * 80}\nPerformance for NPU Subgraph {sg.name}"
| |
| # the data is a list (one entry per op) of lists (matching the header columns) |
| data = [] |
| for sched_op in sg.sched_ops: |
| # get source op name |
| sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1] |
| if sched_op_src_uid == DebugDatabase.NULLREF: |
| src_op_type = None |
| else: |
| src_op_type = suid_inv_map[sched_op_src_uid].original_type |
| |
| src_op_name = nng_optype_to_input_op_type(src_op_type) |
| |
| max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores |
| peak_sram = ( |
| _percentage(mem_usage[sched_op], nng.memory_used[MemArea.Sram]) |
| if MemArea.Sram in nng.memory_used |
| else 0 |
| ) |
| |
| data.append( |
| [ |
| src_op_name, |
| sched_op.op_type, |
| mem_usage[sched_op], |
| peak_sram, |
| cycles[sched_op][PassCycles.Total], |
| _percentage(cycles[sched_op][PassCycles.Total], nng.cycles[PassCycles.Total]), |
| cycles[sched_op][PassCycles.Npu], |
| cycles[sched_op][PassCycles.SramAccess], |
| cycles[sched_op][PassCycles.DramAccess], |
| cycles[sched_op][PassCycles.OnChipFlashAccess], |
| cycles[sched_op][PassCycles.OffChipFlashAccess], |
| macs[sched_op], |
| _percentage(macs[sched_op], nng.macs), |
| _percentage(macs[sched_op], max_macs), |
| sched_op.name, |
| ] |
| ) |
| |
| # print to console |
        print(sg_separator_text)
| line = "" |
| line2 = "" |
| for col_name, align, width, _ in header: |
| line_data = f"{col_name:{align}{width}}" |
| line += line_data + " " |
| line2 += "-" * len(line_data) + " " |
| print(line) |
| print(line2) |
| |
| for op_data in data: |
| line = "" |
| for idx, item in enumerate(op_data): |
| _, align, width, precision = header[idx] |
| if precision == -1: |
| w = str(width) |
| else: |
| w = str(width + precision) + "f" |
| line += f"{item:{align}{w}}" + " " |
| print(line) |
| |
| # print to csv |
        writer.writerow((sg_separator_text,))
| writer.writerow(col_name for col_name, _, _, _ in header) |
| for op_data in data: |
| writer.writerow(op_data) |
| |
| # close the csv |
| csv_file.close() |
| |
| |
| def calc_new_performance_for_network( |
| nng: Graph, |
| arch, |
| network_type: NetworkType, |
| verbose_performance: bool, |
| output_basename: str = "output/unnamed_network", |
| ): |
| total_bws = make_bandwidth_array() |
| total_macs = 0 |
| total_cycles = np.zeros(PassCycles.Size) |
| total_weight_size = 0 |
| total_encoded_weight_size = 0 |
| |
| # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights |
| original_weight_uuids: Set[UUID] = set() |
| encoded_npu_weight_uuids: Set[UUID] = set() |
| |
| bws = {} |
| macs = {} |
| cycles = {} |
| mem_usage = {} |
| |
| for sg in nng.subgraphs: |
| prev_op = None |
| for sched_op in sg.sched_ops: |
| op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op] |
| bws[sched_op], macs[sched_op], cycles[sched_op] = estimate_full_op_performance( |
| arch, sg.schedule, sched_op, prev_op, op_info.block_config |
| ) |
| |
| # get op sram usage |
| mem_usage[sched_op] = ( |
| sg.schedule.memory_snapshot[op_info.time_index] |
| if op_info.time_index < len(sg.schedule.memory_snapshot) |
| else 0 |
| ) |
| |
| # Tensors for calculating weight sizes |
| original_weight = sched_op.parent_op.weights |
| encoded_npu_weight = op_info.npu_weights_tensor |
| |
| # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights |
| if original_weight and (original_weight.equivalence_id not in original_weight_uuids): |
| |
| original_weight_uuids.add(original_weight.equivalence_id) |
| total_weight_size += original_weight.values.itemsize * original_weight.values.size |
| |
| # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights |
| if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids): |
| |
| encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id) |
| total_encoded_weight_size += len(encoded_npu_weight.buffer) |
| |
| total_bws += bws[sched_op] |
| total_macs += macs[sched_op] |
| total_cycles += cycles[sched_op] |
| prev_op = sched_op |
| |
| nng.bandwidths = total_bws |
| nng.macs = total_macs |
| nng.cycles = total_cycles |
| nng.total_original_weights = total_weight_size |
| nng.total_npu_encoded_weights = total_encoded_weight_size |
| |
| if verbose_performance: |
| print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage, output_basename) |