Johan Alfven | 9072496 | 2023-02-02 09:07:48 +0100 | [diff] [blame^] | 1 | # SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com> |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Rickard Bolin | bc6ee58 | 2022-11-04 08:24:29 +0000 | [diff] [blame] | 16 | # |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 17 | # Description: |
| 18 | # NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the |
| 19 | # maximum of the 'cycles required for bandwidth' and 'cycles required for computing'. |
| 20 | # |
| 21 | # Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance |
| 22 | # estimate. |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 23 | import copy |
wilisa01 | 89a8cdd | 2022-08-22 16:13:06 +0000 | [diff] [blame] | 24 | import csv |
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 25 | from enum import auto |
| 26 | from enum import IntEnum |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 27 | from typing import Optional |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 28 | from typing import Set |
| 29 | from uuid import UUID |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 30 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 31 | import numpy as np |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 32 | |
| 33 | from . import numeric_util |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 34 | from .architecture_allocator import ArchitectureBlockConfig |
Diqing Zhong | 09387e2 | 2020-09-28 18:46:22 +0200 | [diff] [blame] | 35 | from .architecture_features import Accelerator |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 36 | from .architecture_features import ArchitectureFeatures |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 37 | from .architecture_features import NpuBlockType |
| 38 | from .architecture_features import SHRAMElements |
| 39 | from .architecture_features import TensorFormat |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 40 | from .debug_database import DebugDatabase |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 41 | from .nn_graph import Graph |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 42 | from .nn_graph import NetworkType |
| 43 | from .nn_graph import PassPlacement |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 44 | from .numeric_util import round_up |
Johan Alfvén | f8e353b | 2022-02-04 17:24:23 +0100 | [diff] [blame] | 45 | from .numeric_util import round_up_to_int |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 46 | from .operation import Kernel |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 47 | from .operation import Op |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 48 | from .scheduler import Schedule |
| 49 | from .scheduler import SchedulerOperation |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 50 | from .scheduler import SchedulerOpInfo |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 51 | from .shape4d import Shape4D |
Diqing Zhong | f842b69 | 2020-12-11 13:07:37 +0100 | [diff] [blame] | 52 | from .tensor import BandwidthDirection |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 53 | from .tensor import MemArea |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 54 | from .tensor import TensorPurpose |
Johan Alfvén | 0f98de6 | 2022-05-15 14:54:51 +0200 | [diff] [blame] | 55 | from .tensor import TensorSubPurpose |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 56 | from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype |
| 57 | from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 58 | from .weight_compressor import WeightKey |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 59 | |
| 60 | |
class PassCycles(IntEnum):
    """Categories of cycle counts collected for a pass.

    ``Total`` aggregates the others; ``Size`` is the number of categories
    and is excluded from :meth:`all`.
    """

    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        """Return the human-readable name of this cycle category."""
        labels = (
            "NPU",
            "SRAM Access",
            "DRAM Access",
            "On-chip Flash Access",
            "Off-chip Flash Access",
            "Total",
            "Size",
        )
        return labels[self.value]

    def identifier_name(self):
        """Return the machine-friendly (snake_case) name of this category."""
        idents = (
            "npu",
            "sram_access",
            "dram_access",
            "on_chip_flash_access",
            "off_chip_flash_access",
            "total",
            "size",
        )
        return idents[self.value]

    @staticmethod
    def all():
        """Return every real cycle category, i.e. all members except Size."""
        return tuple(c for c in PassCycles if c is not PassCycles.Size)
| 102 | |
| 103 | |
class PerformanceQuery:
    """Parameter bundle describing one operation for the cycle/access estimators."""

    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()
        # Primary input feature map
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm_bits = 0
        # Optional second input (elementwise binary ops); None when absent
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm2_bits = 0
        # Output feature map
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        # Weights/scales constant tensor
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 123 | |
| 124 | |
class CycleCost:
    """Accumulator for the MAC count and cycle count of an operation."""

    def __init__(self):
        self.op_macs = 0  # multiply-accumulate operations
        self.op_cycles = 0  # estimated NPU cycles

    def __mul__(self, scale):
        """Return a new CycleCost with both counts scaled by ``scale``."""
        scaled = CycleCost()
        scaled.op_macs = scale * self.op_macs
        scaled.op_cycles = scale * self.op_cycles
        return scaled

    def __iadd__(self, rhs):
        """Accumulate another CycleCost into this one in place."""
        self.op_macs = self.op_macs + rhs.op_macs
        self.op_cycles = self.op_cycles + rhs.op_cycles
        return self

    def __str__(self):
        return f"macs = {self.op_macs}, cycles = {self.op_cycles}"
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 143 | |
| 144 | |
class ElementAccess:
    """Counts of element accesses for one operation.

    These are raw element counts only — consumers must scale them by the
    appropriate bit-widths to turn them into memory bandwidth figures.
    """

    def __init__(self):
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        """Return a new ElementAccess with every count scaled by ``scale``."""
        scaled = ElementAccess()
        scaled.ifm_read = [count * scale for count in self.ifm_read]
        scaled.ofm_write = self.ofm_write * scale
        scaled.weights_refetch = self.weights_refetch * scale
        scaled.const_read = [count * scale for count in self.const_read]
        return scaled

    def __iadd__(self, rhs):
        """Accumulate another ElementAccess into this one in place."""
        # Mutate the existing lists rather than rebinding, matching in-place
        # semantics for any holder of a reference to them.
        for i in range(2):
            self.ifm_read[i] += rhs.ifm_read[i]
            self.const_read[i] += rhs.const_read[i]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        return self

    def __str__(self):
        return f"ifm read = {self.ifm_read}, ofm write = {self.ofm_write}, const read={self.const_read}"
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 176 | |
| 177 | |
def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
    """Return per-axis strides in bytes for a tensor of the given format.

    NHWC yields four strides ordered [+N, +Y, +X, +Z]; NHCWB16 yields five
    ordered [+N, +Y, +C, +X, +Z]. The innermost (+Z) stride deliberately
    uses true division, so it may be fractional for sub-byte element sizes.
    Unsupported formats fall through and raise at the return.
    """
    if format == TensorFormat.NHWC:
        depth_bits = element_bits * shape.depth
        row_bits = depth_bits * shape.width
        strides = [
            (row_bits * shape.height) // 8,  # +N
            row_bits // 8,  # +Y
            depth_bits // 8,  # +X
            element_bits / 8,  # +Z
        ]
    elif format == TensorFormat.NHCWB16:
        brick_bits = element_bits * 16
        plane_bits = element_bits * shape.width * shape.depth
        strides = [
            plane_bits / 8,  # +N — NOTE(review): same as +Y; confirm height is deliberately omitted
            plane_bits / 8,  # +Y
            (brick_bits * shape.width) / 8,  # +C
            brick_bits / 8,  # +X
            element_bits / 8,  # +Z
        ]

    return strides
Diqing Zhong | 42e833d | 2020-10-02 13:18:42 +0200 | [diff] [blame] | 194 | |
| 195 | |
def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
    """Inflate a transfer amount by how inefficiently bursts are used.

    Derives the achievable burst length (computed in bits, converted to
    bytes below) for accessing blocks of the given format, then returns
    ``to_transfer`` scaled by the ratio of the memory's native burst length
    to that achievable burst. A ratio of 1.0 means fully efficient bursts.
    """
    burst_len = 8  # conservative fallback, in bits at this point (1 byte)

    strides = _strides_for_shape(shape4D, format, element_bits)

    if format == TensorFormat.NHCWB16:
        # strides[2] is the +C (brick row) stride for NHCWB16
        if strides[2] == block_size.depth:  # TODO is this check correct for non 8-bit
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            # Writes are striped across cores, lengthening the usable burst
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            # strides[3] is the innermost (+Z) stride for NHWC
            if strides[3] == block_size.depth:
                # Depth-contiguous block: a whole row can stream in one burst
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                # Cap at 64 bytes (64 * 8 bits) — presumably the AXI burst limit; verify
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_len = burst_len // 8  # bits->bytes
    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)
| 225 | |
| 226 | |
| 227 | def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery): |
| 228 | # Input block HW transfer (only for elements present) |
| 229 | ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements() |
| 230 | cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read] |
| 231 | cycles_ifm_blk = cycles_ifm_blk + ( |
| 232 | _estimate_memory_transfer_efficiency( |
| 233 | arch, |
| 234 | True, |
| 235 | query.ifm_memory_area, |
| 236 | query.ifm_format, |
| 237 | query.ifm_bits, |
| 238 | query.config.ifm_block, |
| 239 | query.ifm_shape, |
| 240 | ifm_bytes, |
| 241 | ) |
| 242 | / arch.memory_bandwidths_per_cycle[query.ifm_memory_area] |
| 243 | ) |
| 244 | # Output block HW transfer (only for elements present) |
| 245 | ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements() |
| 246 | cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write] |
| 247 | cycles_ofm_blk = cycles_ofm_blk + ( |
| 248 | _estimate_memory_transfer_efficiency( |
| 249 | arch, |
| 250 | False, |
| 251 | query.ofm_memory_area, |
| 252 | query.ofm_format, |
| 253 | query.ofm_bits, |
| 254 | query.config.ofm_block, |
| 255 | query.ofm_shape, |
| 256 | ofm_bytes, |
| 257 | ) |
| 258 | / arch.memory_bandwidths_per_cycle[query.ofm_memory_area] |
| 259 | ) |
| 260 | return cycles_ifm_blk, cycles_ofm_blk |
| 261 | |
| 262 | |
| 263 | def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery): |
| 264 | if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32: |
| 265 | # Unary op else Binary op |
| 266 | output_perf_index = 0 if query.ifm2_shape is not None else 1 |
| 267 | elif op_type == Op.Mul and query.ofm_bits == 32: |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 268 | output_perf_index = 2 |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 269 | elif op_type == Op.Mul or ( |
| 270 | query.npu_block_type |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 271 | in ( |
| 272 | NpuBlockType.ConvolutionMxN, |
| 273 | NpuBlockType.ConvolutionDepthWise, |
| 274 | NpuBlockType.Pooling, |
| 275 | NpuBlockType.ReduceSum, |
| 276 | NpuBlockType.VectorProduct, |
| 277 | ) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 278 | and query.config.acc_type == SHRAMElements.Acc40 |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 279 | ): |
| 280 | output_perf_index = 3 |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 281 | elif op_type in (Op.Add, Op.Sub): |
| 282 | if False: |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 283 | # Simple Add/Sub |
| 284 | output_perf_index = 4 |
| 285 | else: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 286 | # Advanced Add/Sub TODO: Add as perf selection as operator variant |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 287 | output_perf_index = 5 |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 288 | elif op_type.is_maxpool_op(): |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 289 | output_perf_index = 6 |
| 290 | else: |
| 291 | output_perf_index = 7 |
| 292 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 293 | if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT): |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 294 | activation_perf_index = 0 |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 295 | elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1): |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 296 | activation_perf_index = 1 |
| 297 | else: |
| 298 | activation_perf_index = 2 |
| 299 | |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 300 | cycle_per_elem = max( |
| 301 | arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index] |
| 302 | ) |
Diqing Zhong | 986e319 | 2020-11-16 16:15:56 +0100 | [diff] [blame] | 303 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 304 | if op_type.is_elementwise_op(): |
| 305 | num_elems_blk = query.config.ofm_block.elements() |
| 306 | ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query) |
| 307 | cycle_cmd = ifm_blk_cycles + ofm_blk_cycles |
| 308 | cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4 # per DPU |
Diqing Zhong | ef0c7fe | 2020-11-24 14:38:20 +0100 | [diff] [blame] | 309 | cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk) |
| 310 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 311 | return cycle_per_elem |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 312 | |
| 313 | |
def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Estimate total NPU cycles for a convolution-family operation.

    Models the DPU compute cycles per OFM block (split into sub-kernels per
    the architecture's sub-kernel limits), the output/activation stage
    cycles per block, and per-block command overhead, then takes the
    dominant pipeline stage across all OFM blocks.
    """
    # Clamp block configs to the actual tensor extents
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    if (
        arch.config.ofm_ublock.height == 2
        and query.npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    ):
        # Height-1 optimisation: treat the 2-high ublock as a 1x4 ublock
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    # Number of micro-blocks per OFM block, per axis
    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40

    # Kernels larger than the HW sub-kernel limits are processed in pieces
    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
    sub_kernel_x = [
        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    # Weight buffer load cycles per micro-block column
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if query.npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            # Depthwise processes 4 kernel elements per step
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
            or query.npu_block_type == NpuBlockType.VectorProduct
            or query.npu_block_type == NpuBlockType.ReduceSum
        ):
            # Depth-first traversal: one step per kernel element
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            # Part-kernel-first traversal (ConvolutionMxN only)
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        # Pipeline stall cycles between kernel steps; depends on accelerator
        # configuration and accumulator width
        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
                delay = 3
            else:
                delay = 2

            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        # Each OFM block must accumulate over all IFM depth slices
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = round_up_to_int(
        _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
    )

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        cycles_bias_blk = (
            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    # Per-block command overhead, shared across the 4 DPUs
    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    # DPU and output stages overlap: the slower stage repeats per block,
    # the faster one contributes only its final (non-overlapped) block
    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk

    return total_cycles
| 438 | |
| 439 | |
def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
    """Cycles for a memory-to-memory transfer of ``to_transfer`` bytes.

    The transfer is bounded by the slower side: the source read (including
    its read latency) or the destination write.
    """
    read_cycles = arch.memory_latency[from_mem_area][BandwidthDirection.Read]
    read_cycles += to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
    write_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
    return max(read_cycles, write_cycles)
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 445 | |
Patrik Gustavsson | ee99bb1 | 2021-04-08 09:04:00 +0200 | [diff] [blame] | 446 | |
def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Return the CycleCost (MACs and cycles) for a single operation.

    Dispatches on the NPU block type: convolution-family ops get a MAC
    count and the conv cycle model; elementwise ops get the per-element
    output model over the storage-rounded OFM; DMA is free from the NPU's
    point of view. Raises ValueError for any other block type.
    """
    cycles = CycleCost()

    # Convolution/Vector product cycle calculation
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            # Depthwise/pooling: one input channel per output element
            cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
        else:
            cycles.op_macs = (
                int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
            )

        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    # Elementwise cycle calculation
    elif query.npu_block_type == NpuBlockType.ElementWise:
        cycles.op_macs = 0
        # OFM element count is rounded up to the storage format's quantum
        ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
        cycles.op_cycles = round_up_to_int(
            _estimate_output_cycles_per_element(arch, op_type, faf_type, query)
            * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
        )
    # DMA cycle calculation
    elif query.npu_block_type == NpuBlockType.Dma:
        # Return 0 since this is not an actual NPU op
        cycles.op_cycles = 0
    else:
        # Explicit error rather than `assert False`: asserts are stripped
        # under `python -O`, which would let unsupported block types fall
        # through silently.
        raise ValueError(f"Unsupported npu_block_type: {query.npu_block_type}")

    return cycles
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 483 | |
| 484 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 485 | def measure_element_access(arch, query: PerformanceQuery): |
| 486 | access = ElementAccess() |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 487 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 488 | ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block) |
| 489 | ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block) |
| 490 | ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format])) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 491 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 492 | # Number of ofm blocks in the overall output shape |
| 493 | ofm_blocks = query.ofm_shape.div_round_up(ofm_block) |
| 494 | ofm_block_depth = ofm_block.depth |
| 495 | if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): |
| 496 | ofm_blocks = ofm_blocks.with_depth(1) |
| 497 | ofm_block_depth = query.ifm_shape.depth |
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 498 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 499 | # Convolution & pooling |
| 500 | if query.npu_block_type in ( |
| 501 | NpuBlockType.ConvolutionMxN, |
| 502 | NpuBlockType.ConvolutionDepthWise, |
| 503 | NpuBlockType.VectorProduct, |
| 504 | NpuBlockType.Pooling, |
| 505 | NpuBlockType.ReduceSum, |
| 506 | ): |
| 507 | # Number of sub kernels |
| 508 | sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type] |
| 509 | subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[0]) |
| 510 | subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[1]) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 511 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 512 | ofm_block_count = ofm_blocks.elements() |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 513 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 514 | ifm_fetch = ( |
| 515 | Shape4D.round_up(ifm_block, ifm_rounding).elements_wh() |
| 516 | * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth |
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 517 | ) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 518 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 519 | if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): |
| 520 | kernel_read = query.kernel.elements_wh() * 1 # force to no reread |
| 521 | else: |
| 522 | kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 523 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 524 | weight_fetch = kernel_read * ofm_block_depth * ofm_block_count |
| 525 | |
| 526 | access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count |
| 527 | |
| 528 | if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum): |
| 529 | access.const_read[0] = weight_fetch |
| 530 | access.const_read[1] = query.ofm_shape.depth # Scales & biases |
| 531 | access.weights_refetch = ofm_blocks.elements_wh() |
| 532 | # Elementwise |
| 533 | elif query.npu_block_type == NpuBlockType.ElementWise: |
| 534 | if query.ifm_shape.elements() == 1: |
| 535 | if query.ifm_bits > 8: |
| 536 | # ifm is a non 8-bit scalar |
| 537 | access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements() |
| 538 | if query.ifm2_shape: |
| 539 | access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() |
| 540 | else: |
| 541 | access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() |
| 542 | if query.ifm2_shape: |
| 543 | if query.ifm2_shape.elements() > 1: |
| 544 | access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() |
| 545 | elif query.ifm2_bits > 8: |
| 546 | # ifm2 is a non 8-bit scalar |
| 547 | access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements() |
Johan Alfven | 9072496 | 2023-02-02 09:07:48 +0100 | [diff] [blame^] | 548 | # DMA |
| 549 | elif query.npu_block_type == NpuBlockType.Dma: |
| 550 | # Return empty access since this is not an actual NPU op |
| 551 | return access |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 552 | # Unknown |
| 553 | else: |
| 554 | assert False |
| 555 | |
| 556 | ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format])) |
| 557 | access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements() |
| 558 | return access |
| 559 | |
| 560 | |
def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
    """Measure element access and cycle cost for (a sub-shape of) an operation.

    offset/sub_shape select a region of the ofm to measure; pass None for
    either to default to the start/the entire output area. Returns a tuple
    (ElementAccess, CycleCost) for the selected region.
    """
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    # Measure on a copy so the caller's query is left untouched
    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    cycles = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    access = measure_element_access(arch, sub_query)

    return access, cycles
| 588 | |
| 589 | |
def make_bandwidth_array():
    """Create a zeroed bandwidth accumulator indexed by [mem_area][purpose][direction]."""
    shape = (MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size)
    return np.zeros(shape)
| 592 | |
| 593 | |
def make_cycles_array():
    """Create a zeroed cycle-count accumulator indexed by PassCycles."""
    return np.zeros((PassCycles.Size,))
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 596 | |
| 597 | |
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 598 | def update_summary_cycles(arch, bws, cycles): |
| 599 | cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 600 | cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram] |
| 601 | cycles[PassCycles.OnChipFlashAccess] = ( |
| 602 | np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash] |
| 603 | ) |
| 604 | cycles[PassCycles.OffChipFlashAccess] = ( |
| 605 | np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash] |
| 606 | ) |
| 607 | |
| 608 | cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total]) |
| 609 | return cycles |
| 610 | |
| 611 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 612 | def estimate_full_op_performance( |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 613 | arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 614 | ): |
| 615 | cycles_a = make_cycles_array() |
| 616 | bws = make_bandwidth_array() |
| 617 | scaled_bws = make_bandwidth_array() # scaled bw with memory transfer efficiency |
| 618 | macs = 0 |
| 619 | |
| 620 | query = PerformanceQuery(op.op_type.npu_block_type) |
| 621 | query.ifm_shape = op.ifm.shape |
| 622 | query.ifm_format = op.ifm.format |
| 623 | query.ifm_memory_area = op.ifm.mem_area |
| 624 | query.ifm_bits = op.ifm.dtype.size_in_bits() |
| 625 | query.ifm2_shape = op.ifm2 and op.ifm2.shape |
| 626 | query.ifm2_format = op.ifm2 and op.ifm2.format |
| 627 | query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area |
| 628 | query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() |
| 629 | query.ofm_shape = op.ofm.shape |
| 630 | query.ofm_memory_area = op.ofm.mem_area |
| 631 | query.ofm_bits = op.ofm.dtype.size_in_bits() |
| 632 | query.ofm_format = op.ofm.format |
| 633 | query.kernel = op.kernel |
| 634 | query.config = block_config |
| 635 | |
| 636 | cost = schedule.cost_map[op] |
| 637 | prev_cost = schedule.cost_map[prev_op] if prev_op else None |
| 638 | if op.parent_op.bias: |
| 639 | query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth) |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 640 | if cost.buffered_weight_tensors: |
| 641 | query.const_memory_area = cost.buffered_weight_tensors[0].mem_area |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 642 | else: |
| 643 | query.const_memory_area = cost.npu_weights_tensor.mem_area |
| 644 | |
| 645 | cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query) |
| 646 | cycles_a[PassCycles.Npu] = cycles.op_cycles |
| 647 | macs = cycles.op_macs |
| 648 | |
| 649 | access = measure_element_access(arch, query) |
| 650 | |
| 651 | # How many NPU cycles are available under the previously executing |
| 652 | # operator for performing buffered DMA transfers |
| 653 | slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0 |
| 654 | |
| 655 | # LUT Transfer |
| 656 | parent_op = op.parent_op |
Johan Alfven | 9072496 | 2023-02-02 09:07:48 +0100 | [diff] [blame^] | 657 | dma_transfer_cycles = 0 |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 658 | if parent_op.activation_lut: |
| 659 | lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0] |
| 660 | src_tensor = lut_tensor.src_tensor |
| 661 | if src_tensor and lut_tensor.mem_area != src_tensor.mem_area: |
| 662 | bw = src_tensor.storage_size() |
Johan Alfven | 9072496 | 2023-02-02 09:07:48 +0100 | [diff] [blame^] | 663 | dma_transfer_cycles += measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 664 | |
| 665 | bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw |
| 666 | # LUT read from SHRAM TODO remove? |
Ayaan Masood | d5cbef3 | 2022-02-22 15:56:35 +0000 | [diff] [blame] | 667 | scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 668 | |
Johan Alfven | 9072496 | 2023-02-02 09:07:48 +0100 | [diff] [blame^] | 669 | # DMA Transfer |
| 670 | if parent_op.type == Op.Memcpy: |
| 671 | src_tensor = parent_op.ifm |
| 672 | dst_tensor = parent_op.ofm |
| 673 | if src_tensor.mem_area != dst_tensor.mem_area: |
| 674 | bw = src_tensor.storage_size() |
| 675 | dma_transfer_cycles += measure_mem2mem_cycles(arch, src_tensor.mem_area, dst_tensor.mem_area, bw) |
| 676 | bws[src_tensor.mem_area][src_tensor.purpose][BandwidthDirection.Read] += bw |
| 677 | bws[dst_tensor.mem_area][src_tensor.purpose][BandwidthDirection.Write] += bw |
| 678 | |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 679 | if cost.npu_weights_tensor and cost.buffered_weight_tensors: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 680 | # DMA Weight Transfer |
| 681 | sz = 0 |
| 682 | # Get the size of the first DMA |
| 683 | for core in range(0, arch.ncores): |
| 684 | key = WeightKey(core, 0) |
| 685 | if key in cost.npu_weights_tensor.encoded_ranges: |
| 686 | weight_range = cost.npu_weights_tensor.encoded_ranges[key] |
| 687 | sz += round_up(weight_range.total_bytes, 16) |
| 688 | |
| 689 | total_sz = len(cost.npu_weights_tensor.buffer) |
| 690 | bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 691 | bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 692 | |
| 693 | ws_first_transfer_cycles = measure_mem2mem_cycles( |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 694 | arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 695 | ) |
| 696 | |
| 697 | # Add cycles for Weight + Scale Transfer |
Johan Alfvén | 0f98de6 | 2022-05-15 14:54:51 +0200 | [diff] [blame] | 698 | if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer: |
| 699 | # Double buffer - weights can be fetched in parallel |
| 700 | cycles_a[PassCycles.Npu] = max( |
| 701 | cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles, |
| 702 | cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0), |
| 703 | ) |
| 704 | else: |
| 705 | # Standard buffer - weights can not be fetched in parallel so weight transfer |
| 706 | # must be included in the result |
| 707 | cycles_a[PassCycles.Npu] = ( |
| 708 | cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles) |
| 709 | ) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 710 | |
Johan Alfven | 9072496 | 2023-02-02 09:07:48 +0100 | [diff] [blame^] | 711 | # Add cycles for LUT + mempcy op Transfer |
| 712 | cycles_a[PassCycles.Npu] += dma_transfer_cycles |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 713 | else: |
Johan Alfven | 9072496 | 2023-02-02 09:07:48 +0100 | [diff] [blame^] | 714 | # Add cycles for LUT + mempcy op Transfer |
| 715 | cycles_a[PassCycles.Npu] += max(dma_transfer_cycles - slack_cycles, 0) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 716 | |
| 717 | # OFM write |
| 718 | ofm = op.parent_op.ofm |
| 719 | bw = access.ofm_write * ofm.element_size() |
| 720 | bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw |
| 721 | scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency( |
| 722 | arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw |
| 723 | ) |
| 724 | |
| 725 | # IFM read |
Johan Alfvén | 2f87617 | 2022-12-07 12:40:55 +0100 | [diff] [blame] | 726 | ifm = op.parent_op.ifm2 if op.reversed_operands else op.parent_op.ifm |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 727 | bw = access.ifm_read[0] * ifm.element_size() |
| 728 | bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw |
| 729 | scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( |
| 730 | arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw |
| 731 | ) |
Johan Alfvén | 2f87617 | 2022-12-07 12:40:55 +0100 | [diff] [blame] | 732 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 733 | if query.ifm2_shape: |
Johan Alfvén | 2f87617 | 2022-12-07 12:40:55 +0100 | [diff] [blame] | 734 | ifm2 = op.parent_op.ifm if op.reversed_operands else op.parent_op.ifm2 |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 735 | bw = access.ifm_read[1] * ifm2.element_size() |
| 736 | bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw |
| 737 | scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( |
| 738 | arch, |
| 739 | True, |
| 740 | query.ifm2_memory_area, |
| 741 | ifm2.format, |
| 742 | op.ifm2.dtype.size_in_bits(), |
| 743 | query.config.ifm_block, |
| 744 | query.ifm2_shape, |
| 745 | bw, |
| 746 | ) |
| 747 | |
| 748 | # Weight read |
| 749 | if access.const_read[0] > 0: |
| 750 | # alignment not accounted for in bandwidth_compression_scale_approx |
| 751 | encoded_size_approx = ( |
| 752 | cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size() |
| 753 | ) |
| 754 | orig_weight_size = parent_op.weights.elements() |
| 755 | bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size |
| 756 | bw = access.const_read[0] * bandwidth_compression_scale_approx |
| 757 | bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw |
| 758 | |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 759 | if not cost.buffered_weight_tensors: |
Patrik Gustavsson | 225e19d | 2021-06-01 12:43:43 +0200 | [diff] [blame] | 760 | scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw |
| 761 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 762 | if access.const_read[1] > 0: |
| 763 | # Scales & biases |
| 764 | bw = access.const_read[1] * op.parent_op.bias.element_size() |
| 765 | bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw |
| 766 | |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 767 | if not cost.buffered_weight_tensors: |
Patrik Gustavsson | 225e19d | 2021-06-01 12:43:43 +0200 | [diff] [blame] | 768 | scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw |
| 769 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 770 | update_summary_cycles(arch, scaled_bws, cycles_a) |
| 771 | |
| 772 | return bws, macs, cycles_a |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 773 | |
| 774 | |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 775 | def print_performance( |
| 776 | nng: Graph, |
| 777 | arch: ArchitectureFeatures, |
| 778 | network_type: NetworkType, |
| 779 | bws: dict, |
| 780 | macs: dict, |
| 781 | cycles: dict, |
| 782 | mem_usage: dict, |
wilisa01 | 89a8cdd | 2022-08-22 16:13:06 +0000 | [diff] [blame] | 783 | output_basename: str, |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 784 | ): |
Tim Hall | 5ae6cb0 | 2022-11-11 18:55:49 +0000 | [diff] [blame] | 785 | def _percentage(part, whole): |
| 786 | # desired behaviour is for division by zero to return 100% |
| 787 | if whole == 0: |
| 788 | return 100.0 |
| 789 | else: |
| 790 | return part / whole * 100.0 |
| 791 | |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 792 | if network_type == NetworkType.TFLite: |
| 793 | nng_optype_to_input_op_type = tflite_optype_to_builtintype |
| 794 | else: |
| 795 | nng_optype_to_input_op_type = tosa_optype_to_tosa_op_type |
| 796 | |
| 797 | suid_inv_map = {v: k for k, v in DebugDatabase._sourceUID.items()} |
| 798 | |
Tim Hall | 5ae6cb0 | 2022-11-11 18:55:49 +0000 | [diff] [blame] | 799 | # the header is a list (one entry per column) of tuples (column name, alignment, width, precision) |
| 800 | header = [ |
| 801 | (f"{network_type.name}_operator", "<", 20, -1), |
| 802 | ("NNG Operator", "<", 20, -1), |
| 803 | ("SRAM Usage", ">", 10, 0.0), |
| 804 | ("Peak%", ">", 6, 0.2), |
| 805 | ("Op Cycles", ">", 10, 0.0), |
| 806 | ("Network%", ">", 8, 0.2), |
| 807 | ("NPU", ">", 10, 0.0), |
| 808 | ("SRAM AC", ">", 10, 0.0), |
| 809 | ("DRAM AC", ">", 10, 0.0), |
| 810 | ("OnFlash AC", ">", 10, 0.0), |
| 811 | ("OffFlash AC", ">", 11, 0.0), |
| 812 | ("MAC Count", ">", 10, 0.0), |
| 813 | ("Network%", ">", 8, 0.2), |
| 814 | ("Util%", ">", 6, 0.2), |
| 815 | ("Name", "<", 20, -1), |
| 816 | ] |
| 817 | |
| 818 | # open the csv |
| 819 | csv_file = open(output_basename + "_per-layer.csv", "w", encoding="UTF8") |
| 820 | writer = csv.writer(csv_file) |
| 821 | |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 822 | for sg in nng.subgraphs: |
| 823 | |
| 824 | if sg.placement != PassPlacement.Npu: |
| 825 | continue |
| 826 | |
Tim Hall | 5ae6cb0 | 2022-11-11 18:55:49 +0000 | [diff] [blame] | 827 | sg_seperator_text = f"\n{str('#') * 80}\nPerformance for NPU Subgraph {sg.name}" |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 828 | |
Tim Hall | 5ae6cb0 | 2022-11-11 18:55:49 +0000 | [diff] [blame] | 829 | # the data is a list (one entry per op) of lists (matching the header columns) |
| 830 | data = [] |
| 831 | for sched_op in sg.sched_ops: |
| 832 | # get source op name |
| 833 | sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1] |
| 834 | if sched_op_src_uid == DebugDatabase.NULLREF: |
| 835 | src_op_type = None |
| 836 | else: |
| 837 | src_op_type = suid_inv_map[sched_op_src_uid].original_type |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 838 | |
Tim Hall | 5ae6cb0 | 2022-11-11 18:55:49 +0000 | [diff] [blame] | 839 | src_op_name = nng_optype_to_input_op_type(src_op_type) |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 840 | |
Tim Hall | 5ae6cb0 | 2022-11-11 18:55:49 +0000 | [diff] [blame] | 841 | max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores |
| 842 | peak_sram = ( |
| 843 | _percentage(mem_usage[sched_op], nng.memory_used[MemArea.Sram]) |
| 844 | if MemArea.Sram in nng.memory_used |
| 845 | else 0 |
| 846 | ) |
wilisa01 | 89a8cdd | 2022-08-22 16:13:06 +0000 | [diff] [blame] | 847 | |
Tim Hall | 5ae6cb0 | 2022-11-11 18:55:49 +0000 | [diff] [blame] | 848 | data.append( |
| 849 | [ |
| 850 | src_op_name, |
| 851 | sched_op.op_type, |
| 852 | mem_usage[sched_op], |
| 853 | peak_sram, |
| 854 | cycles[sched_op][PassCycles.Total], |
| 855 | _percentage(cycles[sched_op][PassCycles.Total], nng.cycles[PassCycles.Total]), |
| 856 | cycles[sched_op][PassCycles.Npu], |
| 857 | cycles[sched_op][PassCycles.SramAccess], |
| 858 | cycles[sched_op][PassCycles.DramAccess], |
| 859 | cycles[sched_op][PassCycles.OnChipFlashAccess], |
| 860 | cycles[sched_op][PassCycles.OffChipFlashAccess], |
| 861 | macs[sched_op], |
| 862 | _percentage(macs[sched_op], nng.macs), |
| 863 | _percentage(macs[sched_op], max_macs), |
| 864 | sched_op.name, |
wilisa01 | 89a8cdd | 2022-08-22 16:13:06 +0000 | [diff] [blame] | 865 | ] |
Tim Hall | 5ae6cb0 | 2022-11-11 18:55:49 +0000 | [diff] [blame] | 866 | ) |
| 867 | |
| 868 | # print to console |
| 869 | print(sg_seperator_text) |
| 870 | line = "" |
| 871 | line2 = "" |
| 872 | for col_name, align, width, _ in header: |
| 873 | line_data = f"{col_name:{align}{width}}" |
| 874 | line += line_data + " " |
| 875 | line2 += "-" * len(line_data) + " " |
| 876 | print(line) |
| 877 | print(line2) |
| 878 | |
| 879 | for op_data in data: |
| 880 | line = "" |
| 881 | for idx, item in enumerate(op_data): |
| 882 | _, align, width, precision = header[idx] |
| 883 | if precision == -1: |
| 884 | w = str(width) |
| 885 | else: |
| 886 | w = str(width + precision) + "f" |
| 887 | line += f"{item:{align}{w}}" + " " |
| 888 | print(line) |
| 889 | |
| 890 | # print to csv |
| 891 | writer.writerow((sg_seperator_text,)) |
| 892 | writer.writerow(col_name for col_name, _, _, _ in header) |
| 893 | for op_data in data: |
| 894 | writer.writerow(op_data) |
| 895 | |
| 896 | # close the csv |
| 897 | csv_file.close() |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 898 | |
| 899 | |
wilisa01 | 89a8cdd | 2022-08-22 16:13:06 +0000 | [diff] [blame] | 900 | def calc_new_performance_for_network( |
| 901 | nng: Graph, |
| 902 | arch, |
| 903 | network_type: NetworkType, |
| 904 | verbose_performance: bool, |
| 905 | output_basename: str = "output/unnamed_network", |
| 906 | ): |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 907 | total_bws = make_bandwidth_array() |
Diqing Zhong | 69aadd0 | 2020-12-08 13:08:48 +0100 | [diff] [blame] | 908 | total_macs = 0 |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 909 | total_cycles = np.zeros(PassCycles.Size) |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 910 | total_weight_size = 0 |
| 911 | total_encoded_weight_size = 0 |
| 912 | |
| 913 | # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights |
| 914 | original_weight_uuids: Set[UUID] = set() |
| 915 | encoded_npu_weight_uuids: Set[UUID] = set() |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 916 | |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 917 | bws = {} |
| 918 | macs = {} |
| 919 | cycles = {} |
| 920 | mem_usage = {} |
| 921 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 922 | for sg in nng.subgraphs: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 923 | prev_op = None |
| 924 | for sched_op in sg.sched_ops: |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 925 | op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op] |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 926 | bws[sched_op], macs[sched_op], cycles[sched_op] = estimate_full_op_performance( |
| 927 | arch, sg.schedule, sched_op, prev_op, op_info.block_config |
| 928 | ) |
| 929 | |
| 930 | # get op sram usage |
| 931 | mem_usage[sched_op] = ( |
| 932 | sg.schedule.memory_snapshot[op_info.time_index] |
| 933 | if op_info.time_index < len(sg.schedule.memory_snapshot) |
| 934 | else 0 |
| 935 | ) |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 936 | |
| 937 | # Tensors for calculating weight sizes |
| 938 | original_weight = sched_op.parent_op.weights |
| 939 | encoded_npu_weight = op_info.npu_weights_tensor |
| 940 | |
| 941 | # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights |
| 942 | if original_weight and (original_weight.equivalence_id not in original_weight_uuids): |
| 943 | |
| 944 | original_weight_uuids.add(original_weight.equivalence_id) |
| 945 | total_weight_size += original_weight.values.itemsize * original_weight.values.size |
| 946 | |
| 947 | # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights |
| 948 | if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids): |
| 949 | |
Jonas Ohlsson | 77b448f | 2022-03-11 16:08:30 +0100 | [diff] [blame] | 950 | encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id) |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 951 | total_encoded_weight_size += len(encoded_npu_weight.buffer) |
| 952 | |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 953 | total_bws += bws[sched_op] |
| 954 | total_macs += macs[sched_op] |
| 955 | total_cycles += cycles[sched_op] |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 956 | prev_op = sched_op |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 957 | |
| 958 | nng.bandwidths = total_bws |
| 959 | nng.macs = total_macs |
| 960 | nng.cycles = total_cycles |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 961 | nng.total_original_weights = total_weight_size |
| 962 | nng.total_npu_encoded_weights = total_encoded_weight_size |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 963 | |
| 964 | if verbose_performance: |
wilisa01 | 89a8cdd | 2022-08-22 16:13:06 +0000 | [diff] [blame] | 965 | print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage, output_basename) |